ParSparseMatrix.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Oct 14, 16:51

ParSparseMatrix.cpp
View Options

	#include "ParSparseMatrix.h"
	#include <fstream>
	#ifdef TIMING_ON
	// Difference between two clock readings, expressed in seconds.
	//
	// start: the earlier reading
	// end:   the later reading
	// Returns (end - start) in seconds; the result is negative when `end`
	// precedes `start`.
	double time_diff(timespec &start, timespec &end)
	{
	  // Accumulate everything in nanoseconds first, then scale once.
	  const double wholeSecondsInNs = 1e9 * (end.tv_sec - start.tv_sec);
	  const double elapsedNs = wholeSecondsInNs + end.tv_nsec - start.tv_nsec;
	  return elapsedNs / 1e9;
	}
	#endif
	using namespace MPI_Wrappers;
	using namespace std;
	namespace ATC_matrix {

	// All the same constructors as for SparseMatrix
	// Forwards `rows` and `cols` to the SparseMatrix<double>(rows, cols)
	// constructor and remembers `comm` in _comm for the parallel operations.
	ParSparseMatrix<double>::ParSparseMatrix(MPI_Comm comm, INDEX rows, INDEX cols)
	  : SparseMatrix<double>(rows, cols),
	    _comm(comm)
	{
	}

	// Initialises the SparseMatrix<double> base from the existing matrix `c`
	// and remembers `comm` in _comm for the parallel operations.
	ParSparseMatrix<double>::ParSparseMatrix(MPI_Comm comm,
	                                         const SparseMatrix<double> &c)
	  : SparseMatrix<double>(c),
	    _comm(comm)
	{
	}

	// Passes the raw arrays and dimensions straight through to the
	// SparseMatrix<double> constructor with the same argument list, and
	// remembers `comm` in _comm for the parallel operations.
	ParSparseMatrix<double>::ParSparseMatrix(MPI_Comm comm,
	                                         INDEX* rows,
	                                         INDEX* cols,
	                                         double* vals,
	                                         INDEX size,
	                                         INDEX nRows,
	                                         INDEX nCols,
	                                         INDEX nRowsCRS)
	  : SparseMatrix<double>(rows, cols, vals, size, nRows, nCols, nRowsCRS),
	    _comm(comm)
	{
	}

	//============================================================
	// Matrix-vector product: writes (this matrix) * v into c.
	//
	// When only one process is available, or when the size / process-count
	// heuristics below pick the serial path, the work is delegated to
	// SparseMatrix<double>::MultMv. Otherwise each process multiplies its own
	// slice of the stored elements (see partition()) by v, and the partial
	// result vectors are summed across _comm into c with allsum().
	//
	// v: right-hand vector; its size is checked against nCols() through GCK.
	// c: output vector; on the parallel path it receives c_local.size() summed
	//    entries through c.ptr().
	//    NOTE(review): c is not resized on the parallel path - presumably the
	//    caller supplies a correctly sized vector; confirm.
	void ParSparseMatrix<double>::MultMv(const Vector<double>& v,
	                                     DenseVector<double>& c) const
	{
	  int numProcs = MPI_Wrappers::size(_comm);
	#ifdef DISABLE_PAR_HEURISTICS
	  // Use much more lenient heuristics to exercise parallel code
	  if (numProcs == 1 || _size < 300) {
	#else
	  // These are simple heuristics to perform multiplication in serial if
	  // parallel will be slower. They were determined experimentally.
	  if ( numProcs == 1 ||
	      (_size < 50000 || _size > 10000000) ||
	      ((_size < 150000 || _size > 5000000) && numProcs > 8) ||
	      ((_size < 500000 || _size > 2500000) && numProcs > 16 ) ||
	      (numProcs > 32)) {
	#endif
	    SparseMatrix<double>::MultMv(v, c);
	    return;
	  }

	  SparseMatrix<double>::compress(*this);
	  // NOTE(review): GCK receives `this` (a pointer) while compress() above
	  // receives `*this`; confirm against the GCK macro definition.
	  GCK(this, v, this->nCols() != v.size(), "ParSparseMatrix Vector")

	  SparseMatrix<double> A_local;

	  // Split the sparse matrix. partition() takes a ParSparseMatrix reference,
	  // so view A_local through a pointer cast and dereference it.
	  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

	  // actually do multiplication - end up with partial result vector
	  // on each processor
	#ifdef TIMING_ON
	  timespec before, after;
	  // barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &before);
	#endif
	  DenseVector<double> c_local = A_local * v;
	#ifdef TIMING_ON
	  // barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &after);
	  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " mat.vec time\n";
	  //LammpsInterface::instance()->all_print((after-before),"mat.vec time");
	  barrier(MPI_COMM_WORLD);
	#endif

	  // Detach A_local from the arrays it borrowed in partition() before it
	  // goes out of scope.
	  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

	  // Add all the result vectors together on each processor.
	#ifdef TIMING_ON
	  barrier(MPI_COMM_WORLD);
	  //barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &before);
	#endif
	  allsum(_comm, c_local.ptr(), c.ptr(), c_local.size());
	#ifdef TIMING_ON
	  //barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &after);
	  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " allsum time\n";
	  //LammpsInterface::instance()->print_msg_once((after-before),"allsum time");
	#endif
	}

	// Parallel transpose product with a vector.
	//
	// Each process applies SparseMatrix::transMat to its own slice of the
	// stored elements (see partition()); the partial vectors are then summed
	// across _comm with allsum() and the total is returned.
	//
	// v: input vector; its size is checked against nRows() through GCK.
	// Returns a DenseVector of length nCols() holding the summed result.
	DenseVector<double> ParSparseMatrix<double>::transMat(
	    const Vector<double>& v) const {
	  SparseMatrix<double>::compress(*this);
	  // NOTE(review): GCK receives `this` (a pointer) while compress() above
	  // receives `*this`; confirm against the GCK macro definition.
	  GCK(this, v, this->nRows() != v.size(), "ParSparseMatrix transpose Vector")

	  // NOTE(review): the `true` flag presumably requests zero-initialisation
	  // - confirm against DenseVector's constructor.
	  DenseVector<double> c(nCols(), true);

	  SparseMatrix<double> A_local;
	  // partition() takes a ParSparseMatrix reference, so view A_local through
	  // a pointer cast and dereference it.
	  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

	  // actually do multiplication - end up with partial result vector
	  // on each processor
	  DenseVector<double> c_local = A_local.transMat(v);

	  // Detach A_local from the arrays it borrowed in partition().
	  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

	  // Add all the result vectors together on each processor.
	  allsum(_comm, c_local.ptr(), c.ptr(), c_local.size());

	  return c;
	}

	// Parallel product of this sparse matrix with a dense-style matrix B,
	// written into C.
	//
	// Each process multiplies its own slice of the stored elements (see
	// partition()) by B; the partial result matrices are then summed across
	// _comm into C with allsum().
	//
	// B: right-hand matrix; B.nRows() is checked against nCols() through GCK.
	// C: output; receives C_local.size() summed entries through C.ptr().
	//    NOTE(review): C is not resized here - presumably the caller supplies
	//    correctly sized storage; confirm.
	void ParSparseMatrix<double>::MultAB(const Matrix<double>& B,
	                                     DenseMatrix<double>& C) const {
	  SparseMatrix<double>::compress(*this);
	  // NOTE(review): GCK receives `this` (a pointer) while compress() above
	  // receives `*this`; confirm against the GCK macro definition.
	  GCK(this, B, this->nCols() != B.nRows(), "ParSparseMatrix Matrix")

	  SparseMatrix<double> A_local;
	  // partition() takes a ParSparseMatrix reference, so view A_local through
	  // a pointer cast and dereference it.
	  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

	  // actually do multiplication - end up with partial result matrix
	  // on each processor

	#ifdef TIMING_ON
	  timespec before, after;
	  barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &before);
	#endif
	  DenseMatrix<double> C_local = A_local * B;
	#ifdef TIMING_ON
	  barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &after);
	  // time_diff(start, end): pass the earlier reading first so the reported
	  // duration is positive.
	  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " mat.vec time\n";
	  //LammpsInterface::instance()->all_print((after-before),"mat.vec time");
	#endif

	  // Detach A_local from the arrays it borrowed in partition().
	  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

	  // Add all the result matrices together on each processor.
	#ifdef TIMING_ON
	  barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &before);
	#endif
	  allsum(_comm, C_local.ptr(), C.ptr(), C_local.size());
	#ifdef TIMING_ON
	  barrier(MPI_COMM_WORLD);
	  clock_gettime(CLOCK_MONOTONIC, &after);
	  cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " allsum time\n";
	  //LammpsInterface::instance()->print_msg_once((after-before),"allsum time");
	#endif
	}

	// Parallel transpose product with a dense matrix.
	//
	// Each process applies SparseMatrix::transMat to its own slice of the
	// stored elements (see partition()); the partial matrices are then summed
	// across _comm with allsum() and the total is returned.
	//
	// B: input matrix; B.nRows() is checked against nRows() through GCK.
	// Returns a DenseMatrix of shape nCols() x B.nCols() holding the summed
	// result.
	DenseMatrix<double> ParSparseMatrix<double>::transMat(
	    const DenseMatrix<double>& B) const {
	  SparseMatrix<double>::compress(*this);
	  // NOTE(review): GCK receives `this` (a pointer) while compress() above
	  // receives `*this`; confirm against the GCK macro definition.
	  GCK(this, B, this->nRows() != B.nRows(), "ParSparseMatrix transpose Matrix")

	  // NOTE(review): the `true` flag presumably requests zero-initialisation
	  // - confirm against DenseMatrix's constructor.
	  DenseMatrix<double> C(nCols(), B.nCols(), true);

	  SparseMatrix<double> A_local;
	  // partition() takes a ParSparseMatrix reference, so view A_local through
	  // a pointer cast and dereference it.
	  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

	  // actually do multiplication - end up with partial result matrix
	  // on each processor
	  DenseMatrix<double> C_local = A_local.transMat(B);

	  // Detach A_local from the arrays it borrowed in partition().
	  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

	  // Add all the result matrices together on each processor.
	  allsum(_comm, C_local.ptr(), C.ptr(), C_local.size());

	  return C;
	}

	/*
	The two commented-out functions both need to return SparseMatrices. It's hard
	to combine sparse matrices between processors, so this has not yet been completed.

	void ParMultAB(const SparseMatrix<double> &B, SparseMatrix<double> &C) const
	{
	//SparseMatrix<T>::compress(*this);
	GCK(this, B, this->nCols()!=B.nRows(), "ParSparseMatrix SparseMatrix")

	ParSparseMatrix<double> A_local(this->_comm);
	this->partition(A_local);

	// actually do multiplication - end up with partial result matrix
	// on each processor

	SparseMatrix<double> C_local = ((SparseMatrix<double>)A_local) * B;

	// destroy newA intelligently
	static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

	// Add all the result vectors together on each processor.
	sumSparse(C_local, C);
	}*/

	// Parallel transpose product with a sparse matrix, returned as a dense
	// matrix.
	//
	// Each process applies SparseMatrix::transMat to its own slice of the
	// stored elements (see partition()); the partial matrices are then summed
	// across _comm with allsum() and the total is returned.
	//
	// B: input sparse matrix; B.nRows() is checked against nRows() through GCK.
	// Returns a DenseMatrix of shape nCols() x B.nCols() holding the summed
	// result.
	DenseMatrix<double> ParSparseMatrix<double>::transMat(
	    const SparseMatrix<double>& B) const {
	  SparseMatrix<double>::compress(*this);
	  // NOTE(review): GCK receives `this` (a pointer) while compress() above
	  // receives `*this`; confirm against the GCK macro definition.
	  GCK(this, B, this->nRows() != B.nRows(), "ParSparseMatrix transpose SparseMatrix")

	  // NOTE(review): the `true` flag presumably requests zero-initialisation
	  // - confirm against DenseMatrix's constructor.
	  DenseMatrix<double> C(nCols(), B.nCols(), true);

	  SparseMatrix<double> A_local;
	  // partition() takes a ParSparseMatrix reference, so view A_local through
	  // a pointer cast and dereference it.
	  partition(*static_cast<ParSparseMatrix<double>*>(&A_local));

	  // actually do multiplication - end up with partial result matrix
	  // on each processor
	  DenseMatrix<double> C_local = A_local.transMat(B);

	  // Detach A_local from the arrays it borrowed in partition().
	  static_cast<ParSparseMatrix<double>*>(&A_local)->finalize();

	  // Add all the result matrices together on each processor.
	  allsum(_comm, C_local.ptr(), C.ptr(), C_local.size());

	  return C;
	}

	/*void ParMultAB(const DiagonalMatrix<double> &B, SparseMatrix<double> &C) const
	{
	//SparseMatrix<T>::compress(*this);
	GCK(this, B, this->nCols()!=B.nRows(), "ParSparseMatrix DiagonalMatrix")

	ParSparseMatrix<double> A_local(this->_comm);
	this->partition(A_local);

	// actually do multiplication - end up with partial result matrix
	// on each processor

	SparseMatrix<double> C_local = ((SparseMatrix<double>)A_local) * B;

	// destroy newA intelligently
	A_local._val = NULL;
	A_local._ja = NULL;

	// Add all the result vectors together on each processor.
	sumSparse(C_local, C);
	}*/

	// Loads A_local with this process's share of the matrix.
	//
	// A_local keeps the full dimensions (_nRows, _nCols) but only a contiguous
	// range [startIndex, endIndex) of the stored elements. Its _val and _ja
	// point INTO this matrix's arrays (nothing is copied); only _ia is newly
	// allocated with new[]. finalize() nulls _val and _ja again - presumably so
	// that destroying A_local does not free the borrowed arrays (confirm
	// against SparseMatrix's destructor).
	//
	// A_local: matrix to fill; its previous _ia pointer is overwritten without
	//          being freed, so it is expected to be freshly constructed.
	void ParSparseMatrix<double>::partition(
	    ParSparseMatrix<double>& A_local) const {
	  // create new sparse matrix on each processor, with same size and
	  // a disjoint subset of A's elements.
	  //
	  // Ex: on two processors,
	  //
	  //   |0 1 0|             |0 1 0|               |0 0 0|
	  //   |2 6 0| splits into |2 0 0| on proc 1 and |0 6 0| on proc 2
	  //   |0 0 3|             |0 0 0|               |0 0 3|
	  //
	  // We compute the subproducts individually on each processor, then
	  // sum up all the vectors to get our final result.
	  //

	  // decide which elements will be in each submatrix.
	  // rank * size() is formed in 64-bit arithmetic: with many stored elements
	  // and many processes the product can exceed the range of int even
	  // though the quotient always fits (it never exceeds size()).
	  const long long numProcs = MPI_Wrappers::size(_comm);
	  const long long myRank = MPI_Wrappers::rank(_comm);
	  const long long numElems = size();
	  INDEX startIndex = static_cast<INDEX>((myRank * numElems) / numProcs);
	  INDEX endIndex = static_cast<INDEX>(((myRank + 1) * numElems) / numProcs);

	  // update number of elements
	  A_local._nRows = _nRows;
	  A_local._nCols = _nCols;
	  A_local._size = endIndex - startIndex;
	  A_local._nRowsCRS = _nRowsCRS;
	  // use pointer arithmetic to:
	  // set A_local's _val (to inside A's _val)
	  A_local._val = _val + startIndex;
	  // set A_local's _ja (to inside A's _ja)
	  A_local._ja = _ja + startIndex;
	  // set A_local's _ia (from scratch)
	  A_local._ia = new INDEX[nRowsCRS() + 1];
	  INDEX numRows = nRowsCRS();
	  if (A_local._size > 0) {
	    // Shift every row offset by startIndex and clamp it to
	    // [0, endIndex - startIndex], so rows entirely outside this slice
	    // end up with zero local entries.
	    for (INDEX i = 0; i < numRows + 1; i++) {
	      A_local._ia[i] = std::min(std::max((_ia[i] - startIndex), 0),
	                                endIndex - startIndex);
	    }
	  } else {
	    // Nothing assigned to this process: _ia is left unfilled and the
	    // row count is set to zero.
	    A_local._nRowsCRS = 0;
	  }
	}

	// Call on an A_local matrix before it is destroyed, once it has been
	// loaded with data members borrowed from another matrix (see partition()):
	// resets the _ja and _val pointers to NULL. _ia is left untouched.
	void ParSparseMatrix<double>::finalize()
	{
	  this->_ja = NULL;
	  this->_val = NULL;
	}

	// Assignment from a plain SparseMatrix: hands `source` to copy().
	// _comm is not modified here.
	void ParSparseMatrix<double>::operator=(const SparseMatrix<double> &source) {
	  copy(source);
	}


	/*void sumSparse(SparseMatrix<double> &C_local, SparseMatrix<double> &C)
	{
	}*/

	}

ParSparseMatrix.cppNo OneTemporaryActions

File Metadata

ParSparseMatrix.cppView Options

Event Timeline

ParSparseMatrix.cpp
No OneTemporary
Actions

ParSparseMatrix.cpp
View Options