diff --git a/exercises/ex1/Makefile b/exercises/ex1/Makefile
index c6c1368..1aadef2 100644
--- a/exercises/ex1/Makefile
+++ b/exercises/ex1/Makefile
@@ -1,17 +1,18 @@
 F90=nvfortran
 F90FLAGS=-cuda
 
-.PHONY: clean distclean
+.PHONY: all clean distclean
 
-all: hello_world.x
+all: hello_world.x hello_world_solution.x
 
 hello_world.x: hello_world.f90
+hello_world_solution.x: hello_world_solution.f90
 
 %.x: %.f90
 	$(F90) $(F90FLAGS) $< -o $@
 
 distclean: clean
-	rm *.x
+	rm -f *.x
 
 clean:
 	rm -f *.o *.mod
diff --git a/exercises/ex1/hello_world_solution.f90 b/exercises/ex1/hello_world_solution.f90
new file mode 100644
index 0000000..7e4ea7c
--- /dev/null
+++ b/exercises/ex1/hello_world_solution.f90
@@ -0,0 +1,53 @@
+! Solution to exercise 1: prints "hello world" from the CPU and from CUDA Fortran kernels.
+module helloWorld
+  implicit none
+
+contains
+
+  ! Host routine: prints a greeting from the CPU.
+  subroutine hello_world_cpu
+    write(*,*) 'hello world from CPU code'
+  end subroutine hello_world_cpu
+
+  ! Kernel: every thread of the launch prints the same greeting.
+  attributes(global) subroutine hello_world_cuda
+    write(*,*) 'hello world from CUDA code'
+  end subroutine hello_world_cuda
+
+  ! Kernel: every thread of the launch prints its own threadIdx%x.
+  attributes(global) subroutine hello_world_cuda_threads
+    write(*,*) 'hello world from thread', threadIdx%x
+  end subroutine hello_world_cuda_threads
+end module helloWorld
+
+program testHelloWorld
+  use cudafor
+  use helloWorld
+
+  implicit none
+
+  integer :: istat
+
+  call hello_world_cpu
+  ! This will print 16 times "hello world from thread n"
+  call hello_world_cuda_threads<<<1, 16>>>
+
+  ! This will print 512 times "hello world from thread n"
+  ! call hello_world_cuda_threads<<<1, 512>>>
+
+  ! This will print 1024 times "hello world from thread n"
+  ! call hello_world_cuda_threads<<<1, 1024>>>
+
+  ! This is forbidden as the maximum number of threads per block (in the first dimension) is 1024
+  ! call hello_world_cuda_threads<<<1, 2048>>>
+
+  ! Kernel execution is asynchronous: once the CPU has launched the kernel on the GPU, it continues
+  ! doing its work. In this case, if one removes the call to cudaDeviceSynchronize(), the CPU will
+  ! immediately reach the end of the program, thus killing the execution of the kernel. Different
+  ! outputs will be observed but, most often, the output will not be complete.
+  ! Remember to always synchronize the device when you need to make sure that both CPU and GPU are
+  ! at the same "place".
+  istat = cudaDeviceSynchronize()
+  ! Report a failed synchronization instead of silently discarding the status code.
+  if (istat /= cudaSuccess) write(*,*) 'cudaDeviceSynchronize failed: ', trim(cudaGetErrorString(istat))
+end program testHelloWorld
diff --git a/exercises/ex1/script.sh b/exercises/ex1/script.sh
index a42041e..39567ca 100644
--- a/exercises/ex1/script.sh
+++ b/exercises/ex1/script.sh
@@ -1,14 +1,15 @@
 #!/bin/bash -l
 
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1
 #SBATCH --ntasks-per-core=1
 #SBATCH --cpus-per-task=1
 #SBATCH --gres=gpu:1
 #SBATCH --reservation=spc-cuda-training-12.04
 #SBATCH --account=spc-cuda-training
 #SBATCH --time=0:05:00
 
 module load nvhpc
 
 srun -n 1 ./hello_world.x > output_hello_world
+srun -n 1 ./hello_world_solution.x > output_hello_world_solution