diff --git a/exercises/ex3/Makefile b/exercises/ex3/Makefile
new file mode 100644
index 0000000..b805b7f
--- /dev/null
+++ b/exercises/ex3/Makefile
@@ -0,0 +1,14 @@
+F90=nvfortran
+F90FLAGS=-Mcuda
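+# -Mcuda enables CUDA Fortran compilation; recent nvfortran releases also accept the equivalent -cuda flag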
+
+all:    saxpy_cuda_256elements.x
+
+
+saxpy_cuda_256elements.x: saxpy_cuda_256elements.o
+	$(F90) $(F90FLAGS) saxpy_cuda_256elements.o -o $@
+saxpy_cuda_256elements.o: saxpy_cuda_256elements.F90
+	$(F90) $(F90FLAGS) -c saxpy_cuda_256elements.F90 -o $@
+
+clean:
+	rm -f *.x *.o
diff --git a/exercises/ex3/saxpy_cuda_256elements.F90 b/exercises/ex3/saxpy_cuda_256elements.F90
new file mode 100644
index 0000000..a894cd0
--- /dev/null
+++ b/exercises/ex3/saxpy_cuda_256elements.F90
@@ -0,0 +1,54 @@
+module mathOps
+contains
+  attributes(global) subroutine saxpy_cuda(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    !implement saxpy on GPU here
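+    ! A possible sketch, assuming one thread handles one element (1-D launch):
+    !   i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
+    !   if (i <= n) y(i) = y(i) + a*x(i)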
+  end subroutine saxpy_cuda
+
+  subroutine saxpy_cpu(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    do i=1, n
+        y(i) = y(i) + a*x(i)
+    end do
+  end subroutine saxpy_cpu
+
+end module mathOps
+
+program testSaxpy
+  use mathOps
+  use cudafor
+  implicit none
+  integer, parameter :: N = 256
+  integer :: istat, i
+  real :: x(N), y(N), y_gpu(N), a
+  real, device :: x_d(N), y_d(N)
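+  ! x_d and y_d live in GPU memory; assignments such as x_d = x perform host-to-device copies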
+
+
+  a = 12.1
+  do i=1, N
+      y(i) = 0.0
+      x(i) = i*0.458
+  end do
+
+  x_d = x
+  y_d = y
+  !call the GPU implementation here
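+  ! A possible launch sketch, assuming a single block of 256 threads (enough for N = 256):
+  !   call saxpy_cuda<<<1, 256>>>(x_d, y_d, a)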
+  istat = cudaDeviceSynchronize()
+  y_gpu = y_d                      ! copy the GPU result back to the host
+  call saxpy_cpu(x, y, a)          ! CPU reference result stays in y
+  write(*,*) 'L2 norm: ', norm2(y - y_gpu)
+end program testSaxpy
diff --git a/exercises/ex3/script.sh b/exercises/ex3/script.sh
new file mode 100644
index 0000000..a42041e
--- /dev/null
+++ b/exercises/ex3/script.sh
@@ -0,0 +1,15 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+srun -n 1 ./saxpy_cuda_256elements.x > output_saxpy_cuda_256elements
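+# submit with: sbatch script.sh  (the program's output is redirected to output_saxpy_cuda_256elements)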
+