// cg.cu
#include "cg.hh"
#include <cmath>
#include <vector>
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>
#include "cublas_v2.h"
// A few constants that control the behaviour of the program
const bool DEBUG = false;
const bool USE_SHARED_MEM = true;
/*
cgsolver solves the linear system A*x = b where A is a symmetric
positive-definite matrix of size n x n.
Code based on MATLAB code (from wikipedia ;-) ):
function x = conjgrad(A, b, x)
    r = b - A * x;
    p = r;
    rsold = r' * r;
    for i = 1:length(b)
        Ap = A * p;
        alpha = rsold / (p' * Ap);
        x = x + alpha * p;
        r = r - alpha * Ap;
        rsnew = r' * r;
        if sqrt(rsnew) < 1e-10
            break;
        end
        p = r + (rsnew / rsold) * p;
        rsold = rsnew;
    end
end
*/
// Set the CUDA grid and block dimensions used to launch the kernels
void Solver::setAccelerationParams(const dim3 grid_size, const dim3 block_size) {
m_dimGrid = grid_size;
m_dimBlock = block_size;
}
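// Illustrative sketch of how the launch geometry might be chosen; the names `solver` and `N`
// and the value 128 are assumptions for the example, not taken from this file:
//   int threads = 128;                        // threads per block; must not exceed MAX_CHUNK_SIZE below
//   dim3 block(threads);
//   dim3 grid((N + threads - 1) / threads);   // one thread per row of the system, rounded up
//   solver.setAccelerationParams(grid, block);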
// Adapted from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-hierarchy
// Perform vector-vector addition
// out = x + a * y
__global__ void VectAddKernel(const double* x, const double* y, double a, double* out, int N) {
// each thread computes one entry of result
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= N) return;
out[row] = x[row] + a * y[row]; // Coalesced access to x, y
}
// Perform Matrix-vector product
// out = A * x
__global__ void MatVectProdKernel(const double* A, const double* x, double* out, int N) {
// Each thread computes one element of out = A * x
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= N) return;
double sum = 0.0;
for (int e = 0; e < N; e++)
sum += A[row + e * N] * x[e];
out[row] = sum;
}
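// Storage convention used by both mat-vec kernels: A is stored column-major (Fortran order), so
// element (row, col) of the N x N matrix sits at A[row + col * N]. For N = 3, column 1 occupies the
// contiguous entries A[3], A[4], A[5]; adjacent threads (adjacent rows) therefore read adjacent
// addresses for a fixed column, which is what makes the loads of A coalesced.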
/*
Perform matrix-vector product, but load chunks of x to shared memory first.
Motivation: all threads need to read from x, leading to uncoalesced accesses from within the block.
The shared-memory allocation size must be known at compile time and the block size is not, so we
simply allocate a fixed, large-enough buffer and may not use all of it.
We can store 6144 double-precision numbers in shared memory (49152 bytes available per block on a Tesla V100-PCIE-32GB).
*/
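// The 6144 figure above is just 49152 bytes / 8 bytes per double. A sketch of how it could be
// queried at runtime instead of being hard-coded (device index 0 is an assumption here):
//   int shmem_per_block = 0;
//   cudaDeviceGetAttribute(&shmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, 0);
//   int max_doubles = shmem_per_block / (int)sizeof(double);   // 49152 / 8 = 6144 on a V100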
#define MAX_CHUNK_SIZE 1024
__global__ void LoadedMatVectProdKernel(const double* A, const double* x, double* out, int N) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
assert(blockDim.x <= MAX_CHUNK_SIZE);
assert((blockDim.x + 1) * (blockDim.x + 1) <= 6144); // Not sure that's restrictive enough
// No early return here: every thread of the block must reach the __syncthreads() calls below
// iterate over chunks of x when computing the result
double sum = 0.0;
int numChunks = N / blockDim.x;
if (N % blockDim.x > 0) numChunks++;
for (int m = 0; m < numChunks; m++) {
// load a chunk of x into shared memory (guarding against reading past the end of x)
__shared__ double xs[MAX_CHUNK_SIZE];
//__shared__ double As[MAX_CHUNK_SIZE][MAX_CHUNK_SIZE];
//__shared__ double As[MAX_CHUNK_SIZE * MAX_CHUNK_SIZE];
if (m * blockDim.x + threadIdx.x < N)
xs[threadIdx.x] = x[m * blockDim.x + threadIdx.x];
__syncthreads();
int numCols = blockDim.x;
if ((m + 1) * blockDim.x > N) numCols = N % blockDim.x;
if (row < N) {
for (int e = 0; e < numCols; e++)
sum += A[row + (m * blockDim.x + e) * N] * xs[e]; // Coalesced access to A
}
__syncthreads();
}
// Write the result
if (row < N)
out[row] = sum;
}
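// Launch requirements implied by the kernel above: blockDim.x <= MAX_CHUNK_SIZE (so one chunk of x
// fits in xs) and gridDim.x * blockDim.x >= N (so every row of the output is covered).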
// ------------------------------------------------
// Small self-test of the kernels on a 4x4 example
// ------------------------------------------------
void CGSolver::testThings() {
// Create some interesting data
const int n = 4;
double A[n * n];
for (int i = 0; i < n * n; ++i)
A[i] = i;
// Show what A contains
std::cout << " A = " << std::endl;
std::cout << "[";
for (int row = 0; row < n; row++) {
std::cout << "[";
for (int col = 0; col < n; col++)
std::cout << " " << A[row + col * n] << " ";
std::cout << "]" << std::endl;
}
std::cout << "]" << std::endl;
double x[n];
for (int i = 0; i < n; i++)
x[i] = 1.0;
double y[n];
for (int i = 0; i < n; i++)
y[i] = 0.0;
// Load data to device
double* d_A, *d_x, *d_y;
cudaMalloc(&d_A, n * n * sizeof(double));
cudaMalloc(&d_x, n * sizeof(double));
cudaMalloc(&d_y, n * sizeof(double));
cudaMemcpy(d_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, x, n * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, n * sizeof(double), cudaMemcpyHostToDevice);
// Compute A * x, retrieve and display results
if (USE_SHARED_MEM) {
LoadedMatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_x, d_y, n);
} else {
MatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_x, d_y, n);
}
cudaMemcpy(y, d_y, n * sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "A * x = " << std::endl;
for (int i = 0; i < n; i++)
std::cout << y[i] << std::endl;
// Compute x + 3x, retrieve and display results
VectAddKernel<<<m_dimGrid, m_dimBlock>>>(d_x, d_x, 3, d_y, n);
cudaMemcpy(y, d_y, n * sizeof(double), cudaMemcpyDeviceToHost);
std::cout << "x + 3x = " << std::endl;
for (int i = 0; i < n; i++)
std::cout << y[i] << std::endl;
// Free memory
cudaFree(d_A);
cudaFree(d_x);
cudaFree(d_y);
}
void CGSolver::solve(std::vector<double> & x) {
/*
Performs the conjugate gradient (CG) algorithm to find an approximate solution x to the linear system Ax = b.
Uses CUDA kernels for the matrix-vector products and vector-vector additions, and cuBLAS (cublasDdot) for the scalar products.
*/
// --------------------------------------------
// Before we actually start, let's test things:
// --------------------------------------------
if (DEBUG)
testThings();
// ---------------------------------------
// cuBLAS setup for cublasDdot utilization
// ---------------------------------------
cublasHandle_t handle;
cublasStatus_t stat;
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS runtime error creating cuBLAS handle\n");
}
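// The dot products below rely on cuBLAS's default pointer mode (CUBLAS_POINTER_MODE_HOST),
// in which cublasDdot writes its scalar result to host memory and blocks until it is ready.
// A sketch of how that assumption could be made explicit (not required for correctness):
//   cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);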
// ----------------------------------
// Load data A, b, x to device memory
// ----------------------------------
// A needs to be stored column-major style (Fortran order), so we copy it to a temporary buffer first
std::vector<double> A_Fstyle(m_A.m() * m_A.n());
for (int r = 0; r < m_A.m(); ++r) {
for (int c = 0; c < m_A.n(); ++c)
A_Fstyle[r + m_A.m() * c] = m_A(r, c); // leading dimension = number of rows
}
double *d_A;
size_t size_A = m_A.m() * m_A.n() * sizeof(double);
cudaMalloc(&d_A, size_A);
cudaMemcpy(d_A, A_Fstyle.data(), size_A, cudaMemcpyHostToDevice);
double *d_b;
size_t size_b = m_A.m() * sizeof(double);
cudaMalloc(&d_b, size_b);
cudaMemcpy(d_b, m_b.data(), size_b, cudaMemcpyHostToDevice);
double *d_x;
cudaMalloc(&d_x, size_b);
cudaMemcpy(d_x, x.data(), size_b, cudaMemcpyHostToDevice);
// -------------------------------------------------
// Allocate memory for algorithm variables on device
// -------------------------------------------------
double *d_r, *d_p, *d_Ap, *d_tmp;
cudaMalloc(&d_r, size_b);
cudaMalloc(&d_p, size_b);
cudaMalloc(&d_Ap, size_b);
cudaMalloc(&d_tmp, size_b);
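// Note: d_tmp is allocated (and freed below) but never used by the algorithm.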
// ---------------
// Algorithm setup
// ---------------
// r = b - A * x;
if (USE_SHARED_MEM)
LoadedMatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_x, d_Ap, m_A.n());
else
MatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_x, d_Ap, m_A.n());
VectAddKernel<<<m_dimGrid, m_dimBlock>>>(d_b, d_Ap, -1.0, d_r, m_A.m());
// p = r
cudaMemcpy(d_p, d_r, size_b, cudaMemcpyDeviceToDevice);
// rsold = r' * r;
double rsold;
cublasDdot(handle, m_A.m(), d_r, 1, d_r, 1, &rsold);
// -------------------
// for i = 1:length(b)
// -------------------
int k = 0;
for (; k < m_A.m(); ++k) {
// Ap = A * p
if (USE_SHARED_MEM)
LoadedMatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_p, d_Ap, m_A.n());
else
MatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_p, d_Ap, m_A.n());
// alpha = rsold / (p' * Ap);
double pAp;
cublasDdot(handle, m_A.m(), d_p, 1, d_Ap, 1, &pAp);
auto alpha = rsold / pAp;
// x = x + alpha * p;
VectAddKernel<<<m_dimGrid, m_dimBlock>>>(d_x, d_p, alpha, d_x, m_A.m());
// r = r - alpha * Ap;
VectAddKernel<<<m_dimGrid, m_dimBlock>>>(d_r, d_Ap, -alpha, d_r, m_A.m());
// rsnew = r' * r;
double rsnew;
cublasDdot(handle, m_A.m(), d_r, 1, d_r, 1, &rsnew);
// p = r + (rsnew / rsold) * p;
auto beta = rsnew / rsold;
VectAddKernel<<<m_dimGrid, m_dimBlock>>>(d_r, d_p, beta, d_p, m_A.m());
if (DEBUG && (k % 1000 == 0)) {
std::cout << "Rsold is " << rsold << std::endl;
std::cout << "pAp is " << pAp << std::endl;
std::cout << "alpha is " << alpha << std::endl;
std::cout << "rsnew is " << rsnew << std::endl;
std::cout << "beta is " << beta << std::endl;
}
// rsold = rsnew;
rsold = rsnew;
// if sqrt(rsnew) < 1e-10
// break;
if (std::sqrt(rsnew) < m_tolerance)
break; // Convergence test
}
if (DEBUG) {
// r = A * x
if (USE_SHARED_MEM)
LoadedMatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_x, d_r, m_A.n());
else
MatVectProdKernel<<<m_dimGrid, m_dimBlock>>>(d_A, d_x, d_r, m_A.n());
// r = r - b
VectAddKernel<<<m_dimGrid, m_dimBlock>>>(d_r, d_b, -1.0, d_r, m_A.m());
// res = || r || / || b ||
double rsnew;
stat = cublasDdot(handle, m_A.m(), d_r, 1, d_r, 1, &rsnew);
double bTb;
stat = cublasDdot(handle, m_A.m(), d_b, 1, d_b, 1, &bTb);
auto res = std::sqrt(rsnew / bTb);
double nx;
stat = cublasDdot(handle, m_A.m(), d_x, 1, d_x, 1, &nx);
// Print results
std::cout << "\t[STEP " << k << "] residual = " << std::scientific
<< std::sqrt(rsold) << ", ||x|| = " << std::sqrt(nx)
<< ", ||Ax - b||/||b|| = " << res << std::endl;
}
// ---------------------------
// Read solution x from device
// ---------------------------
cudaMemcpy(x.data(), d_x, size_b, cudaMemcpyDeviceToHost);
// ---------------------------
// Deallocate memory on device
// ---------------------------
cudaFree(d_A);
cudaFree(d_b);
cudaFree(d_x);
cudaFree(d_r);
cudaFree(d_p);
cudaFree(d_Ap);
cudaFree(d_tmp);
// Destroy cuBLAS handle
cublasDestroy(handle);
}
void CGSolver::read_matrix(const std::string & filename) {
m_A.read(filename);
m_m = m_A.m();
m_n = m_A.n();
}
/*
Initialization of the source term b: b_i = -2 i pi^2 sin^2(10 pi i h)
*/
void Solver::init_source_term(double h) {
m_b.resize(m_n);
for (int i = 0; i < m_n; i++) {
m_b[i] = -2. * i * M_PI * M_PI * std::sin(10. * M_PI * i * h) *
std::sin(10. * M_PI * i * h);
}
}
