// Adapted from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-hierarchy
// Perform vector-vector addition
// out = x + a * y
__global__ void VectAddKernel(const double* x, const double* y, double a, double* out, int N) {
// Each thread computes one entry of the result
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= N) return;
out[row] = x[row] + a * y[row]; // coalesced access to x, y, and out
}
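/*
Hedged host-side launch sketch for VectAddKernel (not part of the original
file): the wrapper name VectAdd, the device-pointer names, and the block size
of 256 are illustrative assumptions.
*/
void VectAdd(const double* h_x, const double* h_y, double a, double* h_out, int N) {
size_t bytes = N * sizeof(double);
double *d_x, *d_y, *d_out;
cudaMalloc((void**)&d_x, bytes);
cudaMalloc((void**)&d_y, bytes);
cudaMalloc((void**)&d_out, bytes);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; // round up so every entry is covered
VectAddKernel<<<blocks, threadsPerBlock>>>(d_x, d_y, a, d_out, N);
cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x); cudaFree(d_y); cudaFree(d_out);
}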
// Perform matrix-vector product
// out = A * x
__global__ void MatVectProdKernel(const double* A, const double* x, double* out, int N) {
// Each thread computes one element of out = A * x
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= N) return;
double sum = 0.0;
for (int e = 0; e < N; e++)
sum += A[row + e * N] * x[e]; // A stored column-major: element (row, e) is A[row + e * N]
out[row] = sum;
}
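/*
Hedged CPU reference for checking the kernel's output (an illustrative
addition, not from the original file; the name MatVectProdHost is assumed).
It makes the layout explicit: A is stored column-major, with element (i, j)
at A[i + j * N], matching the kernel's indexing.
*/
void MatVectProdHost(const double* A, const double* x, double* out, int N) {
for (int row = 0; row < N; row++) {
double sum = 0.0;
for (int e = 0; e < N; e++)
sum += A[row + e * N] * x[e]; // same access pattern as the kernel
out[row] = sum;
}
}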
/*
Perform matrix-vector product, but stage chunks of x in shared memory first.
Motivation: every thread reads all N entries of x, so each block fetches x
from global memory many times over; staging chunks of x in shared memory
amortizes those redundant loads across the block.
The size of a static shared-memory allocation must be known at compile time,
but N is not, so we allocate a fixed-size buffer and may leave part of it unused.
Shared memory holds at most 6144 doubles (49152 bytes per block on a Tesla V100-PCIE-32GB).
*/
#define MAX_CHUNK_SIZE 1024 // doubles of x staged per chunk (1024 * 8 B = 8 KiB of shared memory)
__global__ void LoadedMatVectProdKernel(const double* A, const double* x, double* out, int N) {