diff --git a/Serie08/hello_world/Makefile b/Serie08/hello_world/Makefile
new file mode 100644
index 0000000..3b3bee4
--- /dev/null
+++ b/Serie08/hello_world/Makefile
@@ -0,0 +1,15 @@
+NVCC       = nvcc
+CXX       ?= g++
+CXXFLAGS  += -O3 -Wall
+NVCCFLAGS += -O3
+LDFLAGS   += $(NVCCFLAGS)
+
+#Target Rules
+hello: hello_world.o
+	$(NVCC) $^ $(LDFLAGS) -o $@
+
+%.o:%.cu
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+
+clean:
+	rm -rf *.o hello_world
diff --git a/Serie08/hello_world/hello_world.cu b/Serie08/hello_world/hello_world.cu
new file mode 100644
index 0000000..85548c7
--- /dev/null
+++ b/Serie08/hello_world/hello_world.cu
@@ -0,0 +1,21 @@
+// CUDA runtime
+#include <cuda_runtime.h>
+#include <iostream>
+
+__global__ void hello_world() {
+  // TODO: experiment with printf here and the thread and grid idx
+}
+
+int main() {
+  printf("printf() is called. Output:\n");
+
+  // Kernel configuration, where a two-dimensional grid and
+  // three-dimensional blocks are configured.
+  dim3 dimGrid(2, 2);
+  dim3 dimBlock(2, 2, 2);
+
+  hello_world<<<dimGrid, dimBlock>>>();
+  // TODO: what do you need to do in order to ensure printing?
+
+  return 0;
+}
diff --git a/Serie08/hello_world/script.sh b/Serie08/hello_world/script.sh
new file mode 100644
index 0000000..17981f8
--- /dev/null
+++ b/Serie08/hello_world/script.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#SBATCH --nodes=1
+#SBATCH --time=1:0:0
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:1
+#SBATCH --account=phpc2021
+#SBATCH --reservation=phpc2021
+
+module purge
+module load gcc cuda
+module list
+
+#nvprof is used to profile our code
+srun nvprof ./hello
diff --git a/Serie08/matrix_mul/Makefile b/Serie08/matrix_mul/Makefile
new file mode 100644
index 0000000..892b0f9
--- /dev/null
+++ b/Serie08/matrix_mul/Makefile
@@ -0,0 +1,15 @@
+NVCC       = nvcc
+CXX       ?= g++
+CXXFLAGS  += -O3 -Wall
+NVCCFLAGS += -O3
+LDFLAGS   += $(NVCCFLAGS)
+
+#Target Rules
+matrixmul: matrix_mul.o matrix_mul_cpu.o matrix_mul_gpu.o
+	$(NVCC) $^ $(LDFLAGS) -o $@
+
+%.o:%.cu
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+
+clean:
+	rm -rf *.o matrixmul
diff --git a/Serie08/matrix_mul/matrix_mul.cu b/Serie08/matrix_mul/matrix_mul.cu
new file mode 100644
index 0000000..170bd9d
--- /dev/null
+++ b/Serie08/matrix_mul/matrix_mul.cu
@@ -0,0 +1,99 @@
+// includes, system
+#include <chrono>
+#include <iostream>
+
+#include "matrix_mul.hh"
+
+using clk = std::chrono::high_resolution_clock;
+using second = std::chrono::duration<double>;
+using time_point = std::chrono::time_point<clk>;
+
+/* -------------------------------------------------------------------------- */
+int main(int argc, char **argv) {
+
+  // By default, we use device 0,
+  int dev_id = 0;
+
+  cudaError error;
+  cudaDeviceProp device_prop;
+  error = cudaGetDevice(&dev_id);
+  error = cudaGetDeviceProperties(&device_prop, dev_id);
+  if (device_prop.computeMode == cudaComputeModeProhibited) {
+    std::cerr << "Error: device is running in <Compute Mode Prohibited>, no "
+                 "threads can use ::cudaSetDevice()"
+              << std::endl;
+    exit(EXIT_SUCCESS);
+  }
+
+  if (error != cudaSuccess) {
+    std::cout << "cudaGetDeviceProperties returned error code " << error
+              << ", line(" << __LINE__ << ")" << std::endl;
+  } else {
+    std::cout << "GPU Device " << dev_id << ": \"" << device_prop.name
+              << "\" with compute capability " << device_prop.major << "."
+              << device_prop.minor << std::endl;
+  }
+
+    // allocate device memory
+  matrix_gpu device_A(HA, WA);
+  matrix_gpu device_B(HB, WB);
+
+  std::mt19937 gen(2006);
+  std::uniform_real_distribution<> dis(0.f, 1.f);
+
+  float flop = 2.f * WC * HC * WA;
+
+  // initialize host memory
+  device_A.randomInit(gen, dis);
+  device_B.randomInit(gen, dis);
+
+#if CHECK_RESULT == 1
+  matrix_cpu host_C(HC, WC);
+
+  auto t1 = clk::now();
+  // compute reference solution
+  matMulCPU(host_C, device_A, device_B);
+
+  second elapsed = clk::now() - t1;
+
+  std::cout << "Naive CPU -- time:  " << elapsed.count()
+            << " (s), GFLOPs: " << flop / elapsed.count() / 1e9 << std::endl;
+#endif
+
+  /****************************************************/
+  /*  naive implementation on GPU                     */
+  /****************************************************/
+  // setup execution parameters
+  dim3 threads = dim3(BLOCK_SIZE, BLOCK_SIZE);
+  dim3 grid = dim3(device_C.cols() / threads.x, device_C.rows() / threads.y);
+
+  cudaEvent_t start;
+  cudaEvent_t stop;
+  float msec_total;
+
+  matrix_gpu device_C(HC, WC);
+
+  // create and start timer
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cudaEventRecord(start, 0);
+  // naive implementation
+  matMulGPU_naive<<<grid, threads>>>(device_C, device_A, device_B);
+
+  // stop and destroy timer
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&msec_total, start, stop);
+
+  cudaDeviceSynchronize();
+  std::cout << "Naive GPU -- time:  " << msec_total / 1e3
+            << " (s), GFLOPs: " << flop / msec_total / 1e6 << std::endl;
+
+#if CHECK_RESULT == 1
+  // check result
+  printDiff(host_C, device_C);
+#endif
+
+  return 0;
+}
diff --git a/Serie08/matrix_mul/matrix_mul.hh b/Serie08/matrix_mul/matrix_mul.hh
new file mode 100644
index 0000000..37110a8
--- /dev/null
+++ b/Serie08/matrix_mul/matrix_mul.hh
@@ -0,0 +1,39 @@
+#include <cuda_runtime.h>
+#include <iostream>
+#include <random>
+
+#include "matrix_mul_cpu.hh"
+#include "matrix_mul_gpu.hh"
+
+#ifndef MATRIX_MUL_HH
+#define MATRIX_MUL_HH
+
+#define CHECK_RESULT 1
+
+inline void printDiff(const matrix_cpu &host_A, const matrix_gpu &device_A) {
+  int error_count = 0;
+  for (int i = 0; i < host_A.rows(); i++) {
+    for (int j = 0; j < host_A.cols(); j++) {
+      if (std::abs(host_A(i, j) - device_A(i, j)) > 0.1) {
+        std::cout << "diff(" << i << ", " << j << ") CPU=" << host_A(i, j)
+                  << ", GPU=" << device_A(i, j) << std::endl;
+        error_count++;
+      }
+    }
+  }
+  std::cout << "Total Errors = " << error_count << std::endl;
+}
+
+// Thread block size
+#define BLOCK_SIZE 32
+
+// Matrix dimensions
+// (chosen as multiples of the thread block size for simplicity)
+#define WA (32 * BLOCK_SIZE) // Matrix A width
+#define HA (16 * BLOCK_SIZE) // Matrix A height
+#define WB (24 * BLOCK_SIZE) // Matrix B width
+#define HB WA                // Matrix B height
+#define WC WB                // Matrix C width
+#define HC HA                // Matrix C height
+
+#endif // MATRIX_MUL_HH
diff --git a/Serie08/matrix_mul/matrix_mul_cpu.cpp b/Serie08/matrix_mul/matrix_mul_cpu.cpp
new file mode 100644
index 0000000..c481ed2
--- /dev/null
+++ b/Serie08/matrix_mul/matrix_mul_cpu.cpp
@@ -0,0 +1,15 @@
+#include "matrix_mul.hh"
+#include <iostream>
+
+void matMulCPU(matrix_cpu &C, const matrix_gpu &A, const matrix_gpu &B) {
+  for (int i = 0; i < C.rows(); ++i)
+    for (int j = 0; j < C.cols(); ++j) {
+      float sum = 0;
+      for (int k = 0; k < A.cols(); ++k) {
+        float a = A(i, k);
+        float b = B(k, j);
+        sum += a * b;
+      }
+      C(i, j) = sum;
+    }
+}
diff --git a/Serie08/matrix_mul/matrix_mul_cpu.hh b/Serie08/matrix_mul/matrix_mul_cpu.hh
new file mode 100644
index 0000000..367591e
--- /dev/null
+++ b/Serie08/matrix_mul/matrix_mul_cpu.hh
@@ -0,0 +1,25 @@
+#include <vector>
+
+#ifndef MATRIX_MUL_CPU_HH
+#define MATRIX_MUL_CPU_HH
+
+class matrix_gpu;
+
+class matrix_cpu {
+public:
+  matrix_cpu(int n, int m) : n(n), m(m), data(n * m) {}
+  inline float &operator()(int i, int j) { return data[i * n + j]; }
+  inline const float &operator()(int i, int j) const { return data[i * n + j]; }
+
+  int rows() const { return m; }
+  int cols() const { return n; }
+
+private:
+  int n;
+  int m;
+  std::vector<float> data;
+};
+
+void matMulCPU(matrix_cpu &C, const matrix_gpu &A, const matrix_gpu &B);
+
+#endif /* MATRIX_MUL_CPU_HH */
diff --git a/Serie08/matrix_mul/matrix_mul_gpu.cu b/Serie08/matrix_mul/matrix_mul_gpu.cu
new file mode 100644
index 0000000..48f3e4e
--- /dev/null
+++ b/Serie08/matrix_mul/matrix_mul_gpu.cu
@@ -0,0 +1,6 @@
+#include "matrix_mul.hh"
+
+__global__ void matMulGPU_naive(matrix_gpu C, const matrix_gpu A,
+                                const matrix_gpu B) {
+  // TODO: implement the naive version of matmul here
+}
diff --git a/Serie08/matrix_mul/matrix_mul_gpu.hh b/Serie08/matrix_mul/matrix_mul_gpu.hh
new file mode 100644
index 0000000..632629d
--- /dev/null
+++ b/Serie08/matrix_mul/matrix_mul_gpu.hh
@@ -0,0 +1,54 @@
+#ifndef MATRIX_MUL_GPU_HH
+#define MATRIX_MUL_GPU_HH
+
+class matrix_gpu {
+public:
+  __host__ matrix_gpu(int n, int m) : n(n), m(m) {
+    cudaMallocManaged(&data, m * n * sizeof(float));
+  }
+  __host__ __device__ matrix_gpu(matrix_gpu &M)
+      : n(M.n), m(M.m), data(M.data), copy(true) {}
+  __host__ ~matrix_gpu() {
+    if (not copy) {
+      cudaFree(data);
+    }
+  }
+  __host__ __device__ inline float &operator()(int i, int j) {
+    return data[i * n + j];
+  }
+  __host__ __device__ inline const float &operator()(int i, int j) const {
+    return data[i * n + j];
+  }
+
+  __host__ matrix_gpu &operator=(const matrix_cpu &M) {
+    for (int i = 0; i < m; ++i) {
+      for (int j = 0; j < n; ++j) {
+        this->operator()(i, j) = M(i, j);
+      }
+    }
+    return *this;
+  }
+
+  template <class random_gen, class random_dis>
+  __host__ void randomInit(random_gen &gen, random_dis &dis) {
+    for (int i = 0; i < m; ++i) {
+      for (int j = 0; j < n; ++j) {
+        this->operator()(i, j) = dis(gen);
+      }
+    }
+  }
+
+  __host__ __device__ int rows() const { return m; }
+  __host__ __device__ int cols() const { return n; }
+
+private:
+  int n;
+  int m;
+  float *data;
+  bool copy{false};
+};
+
+__global__ void matMulGPU_naive(matrix_gpu C, const matrix_gpu A,
+                                const matrix_gpu B);
+
+#endif /* MATRIX_MUL_GPU_HH */
diff --git a/Serie08/matrix_mul/script.sh b/Serie08/matrix_mul/script.sh
new file mode 100644
index 0000000..e97e769
--- /dev/null
+++ b/Serie08/matrix_mul/script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+
+#SBATCH --nodes=1
+#SBATCH --time=1:0:0
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:1
+#SBATCH --qos=gpu
+#SBATCH --account=phpc2021
+#SBATCH --reservation=phpc2021
+
+module purge
+module load gcc cuda
+
+srun nvprof ./matrixmul
diff --git a/Serie08/vector_add/Makefile b/Serie08/vector_add/Makefile
new file mode 100644
index 0000000..bcc43c6
--- /dev/null
+++ b/Serie08/vector_add/Makefile
@@ -0,0 +1,15 @@
+NVCC       = nvcc
+CXX       ?= g++
+CXXFLAGS  += -O3 -Wall
+NVCCFLAGS += -O3
+LDFLAGS   += $(NVCCFLAGS)
+
+#Target Rules
+vector_add: vector_add.o
+	$(NVCC) $^ $(LDFLAGS) -o $@
+
+%.o:%.cu
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+
+clean:
+	rm -rf *.o vector_add
diff --git a/Serie08/vector_add/script.sh b/Serie08/vector_add/script.sh
new file mode 100644
index 0000000..6d18b64
--- /dev/null
+++ b/Serie08/vector_add/script.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#SBATCH --nodes=1
+#SBATCH --time=1:0:0
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:1
+#SBATCH --account=phpc2021
+#SBATCH --reservation=phpc2021
+
+module purge
+module load gcc cuda
+module list
+
+#nvprof is used to profile our code
+srun nvprof ./vector_add
diff --git a/Serie08/vector_add/vector_add.cu b/Serie08/vector_add/vector_add.cu
new file mode 100644
index 0000000..1d11917
--- /dev/null
+++ b/Serie08/vector_add/vector_add.cu
@@ -0,0 +1,96 @@
+#include <cuda_runtime.h>
+#include <random>
+#include <vector>
+#include <iostream>
+#include <exception>
+
+/**
+ * TODO: write a kernel that does the vector addiction C = A + B with 1 thread
+ */
+__global__ void vectorAddOneThread(const float *A, const float *B, float *C,
+                                   int N) {
+}
+
+/**
+ * TODO: write a kernel that does the vector addiction C = A + B with 1 Block
+ * and 256 threads Hint: When 256 threads are working on one loop how the loop
+ * changes?
+ */
+__global__ void vectorAddOneBlock(const float *A, const float *B, float *C,
+                                  int N) {
+}
+
+/**
+ * TODO: write a kernel that does the vector addiction C = A+B with grid of
+ * blocks. Each block has 256 threads. Hint: what check do you need to implement
+ * to avoid invalid memory reference?
+ */
+__global__ void vectorAdd(const float *A, const float *B, float *C, int N) {
+}
+
+/* -------------------------------------------------------------------------- */
+void checkResults(std::string test, const float *A, const float *B,
+                  const float *C, int N) {
+  // Verify that the result vector is correct
+  for (int i = 0; i < N; ++i) {
+    if (std::abs(A[i] + B[i] - C[i]) > 1e-5) {
+      throw std::runtime_error("Result verification failed at element "
+                               + std::to_string(i) + " for test " + test);
+    }
+  }
+}
+
+/**
+ * Host main routine
+ */
+int main() {
+  // Print the vector length to be used, and compute its size
+  int N = 1 << 20; // 1M elements
+  size_t size_in_bytes = N * sizeof(float);
+  std::cout << "[Vector addition of " << N << " elements]" << std::endl;
+
+  float *d_A{nullptr};
+  float *d_B{nullptr};
+  float *d_C{nullptr};
+
+  // TODO: allocate d_A, d_B, and d_C
+
+  std::mt19937 gen(2006);
+  std::uniform_real_distribution<> dis(0.f, 1.f);
+
+  // Initialize the input vectors
+  for (int i = 0; i < N; ++i) {
+    d_A[i] = dis(gen);
+    d_B[i] = dis(gen);
+  }
+
+  // Launch the Vector Add CUDA Kernel
+  int threads_per_block = 256;
+
+  // TODO: Launch the Vector Add CUDA Kernel with one threads
+  vectorAddOneThread <<<?, ?>>> (d_A, d_B, d_C, N);
+  cudaDeviceSynchronize(); // Since kernel launches is async wrt to the host we
+                           // have to syncronize
+
+  checkResults("vectorAddOneThread", d_A, d_B, d_C, N);
+
+  // TODO: Launch the Vector Add CUDA Kernel with one block and 256 threads
+  vectorAddOneBlock <<<?, ?>>> (d_A, d_B, d_C, N);
+  cudaDeviceSynchronize();
+
+  checkResults("vectorAddOneBlock", d_A, d_B, d_C, N);
+
+  int blocks_per_grid = ?; // TODO: compute the blocks per grid
+  // TODO: Launch the Vector Add CUDA Kernel with blocksPerGrid and 256 threads
+  vectorAdd <<<?, ?>>> (d_A, d_B, d_C, N);
+  cudaDeviceSynchronize();
+
+  checkResults("vectorAdd", d_A, d_B, d_C, N);
+
+  std::cout << "Test PASSED" << std::endl;
+
+  // TODO: Free device global memory
+
+  std::cout << "Done" << std::endl;
+  return 0;
+}