// Adapted from https://docs.nvidia.com/cuda/cuda-c-programming-guide/#memory-hierarchy
// Perform vector-vector addition
// out = x + a * y
__global__ void VectAddKernel(const double* x, const double* y, double a, double* out, int N) {
// Each thread computes one entry of the result
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= N) return;
out[row] = x[row] + a * y[row]; // coalesced access to x, y, and out
}
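/*
Hedged host-side launch sketch for VectAddKernel (not part of the original
file): the wrapper name VectAdd, the device-pointer names, and the block size
of 256 are illustrative assumptions.
*/
void VectAdd(const double* h_x, const double* h_y, double a, double* h_out, int N) {
size_t bytes = N * sizeof(double);
double *d_x, *d_y, *d_out;
cudaMalloc((void**)&d_x, bytes);
cudaMalloc((void**)&d_y, bytes);
cudaMalloc((void**)&d_out, bytes);
cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; // round up so every entry is covered
VectAddKernel<<<blocks, threadsPerBlock>>>(d_x, d_y, a, d_out, N);
cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_x); cudaFree(d_y); cudaFree(d_out);
}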
// Perform matrix-vector product
// out = A * x
__global__ void MatVectProdKernel(const double* A, const double* x, double* out, int N) {
// Each thread computes one element of out = A * x
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= N) return;
double sum = 0.0;
for (int e = 0; e < N; e++)
sum += A[row + e * N] * x[e]; // A stored column-major: element (row, e) is A[row + e * N]
out[row] = sum;
}
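/*
Hedged CPU reference for checking the kernel's output (an illustrative
addition, not from the original file; the name MatVectProdHost is assumed).
It makes the layout explicit: A is stored column-major, with element (i, j)
at A[i + j * N], matching the kernel's indexing.
*/
void MatVectProdHost(const double* A, const double* x, double* out, int N) {
for (int row = 0; row < N; row++) {
double sum = 0.0;
for (int e = 0; e < N; e++)
sum += A[row + e * N] * x[e]; // same access pattern as the kernel
out[row] = sum;
}
}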
/*
Perform matrix-vector product, but stage chunks of x in shared memory first.
Motivation: every thread reads all N entries of x, so each block fetches x
from global memory many times over; staging chunks of x in shared memory
amortizes those redundant loads across the block.
The size of a static shared-memory allocation must be known at compile time,
but N is not, so we allocate a fixed-size buffer and may leave part of it unused.
Shared memory holds at most 6144 doubles (49152 bytes per block on a Tesla V100-PCIE-32GB).
*/
#define MAX_CHUNK_SIZE 1024 // doubles of x staged per chunk (1024 * 8 B = 8 KiB of shared memory)
__global__ void LoadedMatVectProdKernel(const double* A, const double* x, double* out, int N) {