diff --git a/Serie08/hello_world/Makefile b/Serie08/hello_world/Makefile new file mode 100644 index 0000000..3b3bee4 --- /dev/null +++ b/Serie08/hello_world/Makefile @@ -0,0 +1,15 @@ +NVCC = nvcc +CXX ?= g++ +CXXFLAGS += -O3 -Wall +NVCCFLAGS += -O3 +LDFLAGS += $(NVCCFLAGS) + +#Target Rules +hello: hello_world.o + $(NVCC) $^ $(LDFLAGS) -o $@ + +%.o:%.cu + $(NVCC) $(NVCCFLAGS) -c $< -o $@ + +clean: + rm -rf *.o hello_world diff --git a/Serie08/hello_world/hello_world.cu b/Serie08/hello_world/hello_world.cu new file mode 100644 index 0000000..85548c7 --- /dev/null +++ b/Serie08/hello_world/hello_world.cu @@ -0,0 +1,21 @@ +// CUDA runtime +#include +#include + +__global__ void hello_world() { + // TODO: experiment with printf here and the thread and grid idx +} + +int main() { + printf("printf() is called. Output:\n"); + + // Kernel configuration, where a two-dimensional grid and + // three-dimensional blocks are configured. + dim3 dimGrid(2, 2); + dim3 dimBlock(2, 2, 2); + + hello_world<<>>(); + // TODO: what do you need to do in order to ensure printing? + + return 0; +} diff --git a/Serie08/hello_world/script.sh b/Serie08/hello_world/script.sh new file mode 100644 index 0000000..17981f8 --- /dev/null +++ b/Serie08/hello_world/script.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --time=1:0:0 +#SBATCH --partition=gpu +#SBATCH --gres=gpu:1 +#SBATCH --account=phpc2021 +#SBATCH --reservation=phpc2021 + +module purge +module load gcc cuda +module list + +#nvprof is used to profile our code +srun nvprof ./hello diff --git a/Serie08/matrix_mul/Makefile b/Serie08/matrix_mul/Makefile new file mode 100644 index 0000000..892b0f9 --- /dev/null +++ b/Serie08/matrix_mul/Makefile @@ -0,0 +1,15 @@ +NVCC = nvcc +CXX ?= g++ +CXXFLAGS += -O3 -Wall +NVCCFLAGS += -O3 +LDFLAGS += $(NVCCFLAGS) + +#Target Rules +matrixmul: matrix_mul.o matrix_mul_cpu.o matrix_mul_gpu.o + $(NVCC) $^ $(LDFLAGS) -o $@ + +%.o:%.cu + $(NVCC) $(NVCCFLAGS) -c $< -o $@ + +clean: + rm -rf *.o matrixmul diff --git a/Serie08/matrix_mul/matrix_mul.cu b/Serie08/matrix_mul/matrix_mul.cu new file mode 100644 index 0000000..170bd9d --- /dev/null +++ b/Serie08/matrix_mul/matrix_mul.cu @@ -0,0 +1,99 @@ +// includes, system +#include +#include + +#include "matrix_mul.hh" + +using clk = std::chrono::high_resolution_clock; +using second = std::chrono::duration; +using time_point = std::chrono::time_point; + +/* -------------------------------------------------------------------------- */ +int main(int argc, char **argv) { + + // By default, we use device 0, + int dev_id = 0; + + cudaError error; + cudaDeviceProp device_prop; + error = cudaGetDevice(&dev_id); + error = cudaGetDeviceProperties(&device_prop, dev_id); + if (device_prop.computeMode == cudaComputeModeProhibited) { + std::cerr << "Error: device is running in , no " + "threads can use ::cudaSetDevice()" + << std::endl; + exit(EXIT_SUCCESS); + } + + if (error != cudaSuccess) { + std::cout << "cudaGetDeviceProperties returned error code " << error + << ", line(" << __LINE__ << ")" << std::endl; + } else { + std::cout << "GPU Device " << dev_id << ": \"" << device_prop.name + << "\" with compute capability " << device_prop.major << "." + << device_prop.minor << std::endl; + } + + // allocate device memory + matrix_gpu device_A(HA, WA); + matrix_gpu device_B(HB, WB); + + std::mt19937 gen(2006); + std::uniform_real_distribution<> dis(0.f, 1.f); + + float flop = 2.f * WC * HC * WA; + + // initialize host memory + device_A.randomInit(gen, dis); + device_B.randomInit(gen, dis); + +#if CHECK_RESULT == 1 + matrix_cpu host_C(HC, WC); + + auto t1 = clk::now(); + // compute reference solution + matMulCPU(host_C, device_A, device_B); + + second elapsed = clk::now() - t1; + + std::cout << "Naive CPU -- time: " << elapsed.count() + << " (s), GFLOPs: " << flop / elapsed.count() / 1e9 << std::endl; +#endif + + /****************************************************/ + /* naive implementation on GPU */ + /****************************************************/ + // setup execution parameters + dim3 threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + dim3 grid = dim3(device_C.cols() / threads.x, device_C.rows() / threads.y); + + cudaEvent_t start; + cudaEvent_t stop; + float msec_total; + + matrix_gpu device_C(HC, WC); + + // create and start timer + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, 0); + // naive implementation + matMulGPU_naive<<>>(device_C, device_A, device_B); + + // stop and destroy timer + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&msec_total, start, stop); + + cudaDeviceSynchronize(); + std::cout << "Naive GPU -- time: " << msec_total / 1e3 + << " (s), GFLOPs: " << flop / msec_total / 1e6 << std::endl; + +#if CHECK_RESULT == 1 + // check result + printDiff(host_C, device_C); +#endif + + return 0; +} diff --git a/Serie08/matrix_mul/matrix_mul.hh b/Serie08/matrix_mul/matrix_mul.hh new file mode 100644 index 0000000..37110a8 --- /dev/null +++ b/Serie08/matrix_mul/matrix_mul.hh @@ -0,0 +1,39 @@ +#include +#include +#include + +#include "matrix_mul_cpu.hh" +#include "matrix_mul_gpu.hh" + +#ifndef MATRIX_MUL_HH +#define MATRIX_MUL_HH + +#define CHECK_RESULT 1 + +inline void printDiff(const matrix_cpu &host_A, const matrix_gpu &device_A) { + int error_count = 0; + for (int i = 0; i < host_A.rows(); i++) { + for (int j = 0; j < host_A.cols(); j++) { + if (std::abs(host_A(i, j) - device_A(i, j)) > 0.1) { + std::cout << "diff(" << i << ", " << j << ") CPU=" << host_A(i, j) + << ", GPU=" << device_A(i, j) << std::endl; + error_count++; + } + } + } + std::cout << "Total Errors = " << error_count << std::endl; +} + +// Thread block size +#define BLOCK_SIZE 32 + +// Matrix dimensions +// (chosen as multiples of the thread block size for simplicity) +#define WA (32 * BLOCK_SIZE) // Matrix A width +#define HA (16 * BLOCK_SIZE) // Matrix A height +#define WB (24 * BLOCK_SIZE) // Matrix B width +#define HB WA // Matrix B height +#define WC WB // Matrix C width +#define HC HA // Matrix C height + +#endif // MATRIX_MUL_HH diff --git a/Serie08/matrix_mul/matrix_mul_cpu.cpp b/Serie08/matrix_mul/matrix_mul_cpu.cpp new file mode 100644 index 0000000..c481ed2 --- /dev/null +++ b/Serie08/matrix_mul/matrix_mul_cpu.cpp @@ -0,0 +1,15 @@ +#include "matrix_mul.hh" +#include + +void matMulCPU(matrix_cpu &C, const matrix_gpu &A, const matrix_gpu &B) { + for (int i = 0; i < C.rows(); ++i) + for (int j = 0; j < C.cols(); ++j) { + float sum = 0; + for (int k = 0; k < A.cols(); ++k) { + float a = A(i, k); + float b = B(k, j); + sum += a * b; + } + C(i, j) = sum; + } +} diff --git a/Serie08/matrix_mul/matrix_mul_cpu.hh b/Serie08/matrix_mul/matrix_mul_cpu.hh new file mode 100644 index 0000000..367591e --- /dev/null +++ b/Serie08/matrix_mul/matrix_mul_cpu.hh @@ -0,0 +1,25 @@ +#include + +#ifndef MATRIX_MUL_CPU_HH +#define MATRIX_MUL_CPU_HH + +class matrix_gpu; + +class matrix_cpu { +public: + matrix_cpu(int n, int m) : n(n), m(m), data(n * m) {} + inline float &operator()(int i, int j) { return data[i * n + j]; } + inline const float &operator()(int i, int j) const { return data[i * n + j]; } + + int rows() const { return m; } + int cols() const { return n; } + +private: + int n; + int m; + std::vector data; +}; + +void matMulCPU(matrix_cpu &C, const matrix_gpu &A, const matrix_gpu &B); + +#endif /* MATRIX_MUL_CPU_HH */ diff --git a/Serie08/matrix_mul/matrix_mul_gpu.cu b/Serie08/matrix_mul/matrix_mul_gpu.cu new file mode 100644 index 0000000..48f3e4e --- /dev/null +++ b/Serie08/matrix_mul/matrix_mul_gpu.cu @@ -0,0 +1,6 @@ +#include "matrix_mul.hh" + +__global__ void matMulGPU_naive(matrix_gpu C, const matrix_gpu A, + const matrix_gpu B) { + // TODO: implement the naive version of matmul here +} diff --git a/Serie08/matrix_mul/matrix_mul_gpu.hh b/Serie08/matrix_mul/matrix_mul_gpu.hh new file mode 100644 index 0000000..632629d --- /dev/null +++ b/Serie08/matrix_mul/matrix_mul_gpu.hh @@ -0,0 +1,54 @@ +#ifndef MATRIX_MUL_GPU_HH +#define MATRIX_MUL_GPU_HH + +class matrix_gpu { +public: + __host__ matrix_gpu(int n, int m) : n(n), m(m) { + cudaMallocManaged(&data, m * n * sizeof(float)); + } + __host__ __device__ matrix_gpu(matrix_gpu &M) + : n(M.n), m(M.m), data(M.data), copy(true) {} + __host__ ~matrix_gpu() { + if (not copy) { + cudaFree(data); + } + } + __host__ __device__ inline float &operator()(int i, int j) { + return data[i * n + j]; + } + __host__ __device__ inline const float &operator()(int i, int j) const { + return data[i * n + j]; + } + + __host__ matrix_gpu &operator=(const matrix_cpu &M) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + this->operator()(i, j) = M(i, j); + } + } + return *this; + } + + template + __host__ void randomInit(random_gen &gen, random_dis &dis) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + this->operator()(i, j) = dis(gen); + } + } + } + + __host__ __device__ int rows() const { return m; } + __host__ __device__ int cols() const { return n; } + +private: + int n; + int m; + float *data; + bool copy{false}; +}; + +__global__ void matMulGPU_naive(matrix_gpu C, const matrix_gpu A, + const matrix_gpu B); + +#endif /* MATRIX_MUL_GPU_HH */ diff --git a/Serie08/matrix_mul/script.sh b/Serie08/matrix_mul/script.sh new file mode 100644 index 0000000..e97e769 --- /dev/null +++ b/Serie08/matrix_mul/script.sh @@ -0,0 +1,14 @@ +#!/bin/bash -l + +#SBATCH --nodes=1 +#SBATCH --time=1:0:0 +#SBATCH --partition=gpu +#SBATCH --gres=gpu:1 +#SBATCH --qos=gpu +#SBATCH --account=phpc2021 +#SBATCH --reservation=phpc2021 + +module purge +module load gcc cuda + +srun nvprof ./matrixmul diff --git a/Serie08/vector_add/Makefile b/Serie08/vector_add/Makefile new file mode 100644 index 0000000..bcc43c6 --- /dev/null +++ b/Serie08/vector_add/Makefile @@ -0,0 +1,15 @@ +NVCC = nvcc +CXX ?= g++ +CXXFLAGS += -O3 -Wall +NVCCFLAGS += -O3 +LDFLAGS += $(NVCCFLAGS) + +#Target Rules +vector_add: vector_add.o + $(NVCC) $^ $(LDFLAGS) -o $@ + +%.o:%.cu + $(NVCC) $(NVCCFLAGS) -c $< -o $@ + +clean: + rm -rf *.o vector_add diff --git a/Serie08/vector_add/script.sh b/Serie08/vector_add/script.sh new file mode 100644 index 0000000..6d18b64 --- /dev/null +++ b/Serie08/vector_add/script.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --time=1:0:0 +#SBATCH --partition=gpu +#SBATCH --gres=gpu:1 +#SBATCH --account=phpc2021 +#SBATCH --reservation=phpc2021 + +module purge +module load gcc cuda +module list + +#nvprof is used to profile our code +srun nvprof ./vector_add diff --git a/Serie08/vector_add/vector_add.cu b/Serie08/vector_add/vector_add.cu new file mode 100644 index 0000000..1d11917 --- /dev/null +++ b/Serie08/vector_add/vector_add.cu @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include + +/** + * TODO: write a kernel that does the vector addiction C = A + B with 1 thread + */ +__global__ void vectorAddOneThread(const float *A, const float *B, float *C, + int N) { +} + +/** + * TODO: write a kernel that does the vector addiction C = A + B with 1 Block + * and 256 threads Hint: When 256 threads are working on one loop how the loop + * changes? + */ +__global__ void vectorAddOneBlock(const float *A, const float *B, float *C, + int N) { +} + +/** + * TODO: write a kernel that does the vector addiction C = A+B with grid of + * blocks. Each block has 256 threads. Hint: what check do you need to implement + * to avoid invalid memory reference? + */ +__global__ void vectorAdd(const float *A, const float *B, float *C, int N) { +} + +/* -------------------------------------------------------------------------- */ +void checkResults(std::string test, const float *A, const float *B, + const float *C, int N) { + // Verify that the result vector is correct + for (int i = 0; i < N; ++i) { + if (std::abs(A[i] + B[i] - C[i]) > 1e-5) { + throw std::runtime_error("Result verification failed at element " + + std::to_string(i) + " for test " + test); + } + } +} + +/** + * Host main routine + */ +int main() { + // Print the vector length to be used, and compute its size + int N = 1 << 20; // 1M elements + size_t size_in_bytes = N * sizeof(float); + std::cout << "[Vector addition of " << N << " elements]" << std::endl; + + float *d_A{nullptr}; + float *d_B{nullptr}; + float *d_C{nullptr}; + + // TODO: allocate d_A, d_B, and d_C + + std::mt19937 gen(2006); + std::uniform_real_distribution<> dis(0.f, 1.f); + + // Initialize the input vectors + for (int i = 0; i < N; ++i) { + d_A[i] = dis(gen); + d_B[i] = dis(gen); + } + + // Launch the Vector Add CUDA Kernel + int threads_per_block = 256; + + // TODO: Launch the Vector Add CUDA Kernel with one threads + vectorAddOneThread <<>> (d_A, d_B, d_C, N); + cudaDeviceSynchronize(); // Since kernel launches is async wrt to the host we + // have to syncronize + + checkResults("vectorAddOneThread", d_A, d_B, d_C, N); + + // TODO: Launch the Vector Add CUDA Kernel with one block and 256 threads + vectorAddOneBlock <<>> (d_A, d_B, d_C, N); + cudaDeviceSynchronize(); + + checkResults("vectorAddOneBlock", d_A, d_B, d_C, N); + + int blocks_per_grid = ?; // TODO: compute the blocks per grid + // TODO: Launch the Vector Add CUDA Kernel with blocksPerGrid and 256 threads + vectorAdd <<>> (d_A, d_B, d_C, N); + cudaDeviceSynchronize(); + + checkResults("vectorAdd", d_A, d_B, d_C, N); + + std::cout << "Test PASSED" << std::endl; + + // TODO: Free device global memory + + std::cout << "Done" << std::endl; + return 0; +}