diff --git a/exercises/ex1/Makefile b/exercises/ex1/Makefile
index c6c1368..1aadef2 100644
--- a/exercises/ex1/Makefile
+++ b/exercises/ex1/Makefile
@@ -1,17 +1,22 @@
+# Builds the CUDA Fortran hello-world exercise and its solution with nvfortran.
 F90=nvfortran
 F90FLAGS=-cuda
 
-.PHONY: clean distclean
+# Command-style targets: none of them names a file produced by a recipe.
+.PHONY: all clean distclean
 
-all:  hello_world.x
+all:  hello_world.x hello_world_solution.x
 
 hello_world.x: hello_world.f90
+hello_world_solution.x: hello_world_solution.f90
 
+# Compile one Fortran source ($<) straight into an executable ($@).
 %.x: %.f90
 	$(F90) $(F90FLAGS) $< -o $@
 
+# Also removes the executables; -f keeps this from failing when none exist.
 distclean: clean
-	rm *.x
+	rm -f *.x
 
 clean:
 	rm -f *.o *.mod
diff --git a/exercises/ex1/hello_world_solution.f90 b/exercises/ex1/hello_world_solution.f90
new file mode 100644
index 0000000..7e4ea7c
--- /dev/null
+++ b/exercises/ex1/hello_world_solution.f90
@@ -0,0 +1,53 @@
+! Solution for exercise 1: prints "hello world" from host code and from
+! CUDA Fortran kernels.
+module helloWorld
+  implicit none
+
+contains
+
+  ! Host routine: prints a greeting from CPU code.
+  subroutine hello_world_cpu
+    write(*,*) 'hello world from CPU code'
+  end subroutine hello_world_cpu
+
+  ! Kernel: each launched thread prints the same fixed greeting.
+  attributes(global) subroutine hello_world_cuda
+    write(*,*) 'hello world from CUDA code'
+  end subroutine hello_world_cuda
+
+  ! Kernel: each launched thread prints the greeting followed by its threadIdx%x.
+  attributes(global) subroutine hello_world_cuda_threads
+    write(*,*) 'hello world from thread', threadIdx%x
+  end subroutine hello_world_cuda_threads
+end module helloWorld
+
+program testHelloWorld
+  use cudafor
+  use helloWorld
+
+  implicit none
+
+  ! Status code returned by cudaDeviceSynchronize()
+  integer :: istat
+
+  call hello_world_cpu
+  ! This will print 16 times "hello world from thread n"
+  call hello_world_cuda_threads<<<1, 16>>>
+
+  ! This will print 512 times "hello world from thread n"
+  ! call hello_world_cuda_threads<<<1, 512>>>
+
+  ! This will print 1024 times "hello world from thread n"
+  ! call hello_world_cuda_threads<<<1, 1024>>>
+
+  ! This is forbidden as the maximum number of threads per block (in the first dimension) is 1024
+  ! call hello_world_cuda_threads<<<1, 2048>>>
+
+  ! Kernel execution is asynchronous. This means that once the CPU has launched the kernel on the GPU, it continues doing
+  ! its work. In this case, if one removes the call to cudaDeviceSynchronize(), the CPU will immediately reach the end of
+  ! the program thus killing the execution of the kernel. Different outputs will be observed but most often, it will not be complete.
+  ! Remember to always synchronize the device when you need to make sure that both CPU and GPU are at the same "place"
+  istat = cudaDeviceSynchronize()
+  ! Report a failed synchronization instead of silently discarding the status code.
+  if (istat /= cudaSuccess) write(*,*) 'cudaDeviceSynchronize failed: ', trim(cudaGetErrorString(istat))
+end program testHelloWorld
diff --git a/exercises/ex1/script.sh b/exercises/ex1/script.sh
index a42041e..39567ca 100644
--- a/exercises/ex1/script.sh
+++ b/exercises/ex1/script.sh
@@ -1,14 +1,15 @@
 #!/bin/bash -l
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
 #SBATCH --ntasks-per-core=1
 #SBATCH --cpus-per-task=1
 #SBATCH --gres=gpu:1
 #SBATCH --reservation=spc-cuda-training-12.04
 #SBATCH --account=spc-cuda-training
 #SBATCH --time=0:05:00
 
 module load nvhpc
 
 srun  -n 1 ./hello_world.x > output_hello_world
+srun  -n 1 ./hello_world_solution.x > output_hello_world_solution  # same single-task launch for the solution binary; stdout goes to its own file