diff --git a/examples/hello_world/Makefile b/examples/hello_world/Makefile
index fc0ffdf..2dc6ea6 100644
--- a/examples/hello_world/Makefile
+++ b/examples/hello_world/Makefile
@@ -1,19 +1,16 @@
 F90=nvfortran
 F90FLAGS=-cuda
 
 .PHONY: clean distclean
 
 all: hello_world_cpu.x hello_world_cuda.x mem_alloc.x
 
 hello_world_cuda.x: hello_world_cuda.f90
 hello_world_cpu.x: hello_world_cpu.f90
 mem_alloc.x: mem_alloc.f90
 
 %.x: %.f90
 	$(F90) $(F90FLAGS) $< -o $@
 
-distclean: clean
-	rm *.x
-
 clean:
-	rm -f *.o *.mod
+	rm -f *.o *.mod *.x
diff --git a/examples/hello_world/README b/examples/hello_world/README
new file mode 100644
index 0000000..5e1c6f1
--- /dev/null
+++ b/examples/hello_world/README
@@ -0,0 +1,5 @@
+This directory contains the warmup examples you can submit to follow the lecture.
+The directory contains: hello_world, memory allocation (mem_alloc)
+Commands to be executed:
+$ source source_file
+$ sbatch script.sh
diff --git a/examples/hello_world/source_file b/examples/hello_world/source_file
new file mode 100644
index 0000000..9030d0f
--- /dev/null
+++ b/examples/hello_world/source_file
@@ -0,0 +1,2 @@
+#!/bin/bash
+module load nvhpc
diff --git a/examples/warmup/script.sh b/examples/warmup/script.sh
new file mode 100644
index 0000000..006f956
--- /dev/null
+++ b/examples/warmup/script.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+nvidia-smi -a > output
diff --git a/exercises/ex1/hello_world.f90 b/exercises/ex1/hello_world.f90
index 17cd66a..070bae6 100644
--- a/exercises/ex1/hello_world.f90
+++ b/exercises/ex1/hello_world.f90
@@ -1,30 +1,29 @@
 module helloWorld
   implicit none
 contains
   subroutine hello_world_cpu
     write(*,*) 'hello world from CPU code'
   end subroutine hello_world_cpu
 
-  attributes(global) subroutine hello_world_cuda
-    write(*,*) 'hello world from CUDA code'
-  end subroutine hello_world_cuda
 
   attributes(global) subroutine hello_world_cuda_threads
-    write(*,*) 'hello world from thread', threadIdx%x
+    integer :: tid
+    tid = threadIdx%x
+    write(*,*) 'hello world from thread', tid
   end subroutine hello_world_cuda_threads
 end module helloWorld
 
 program testHelloWorld
   use cudafor
   use helloWorld
   implicit none
   integer :: istat
 
   call hello_world_cpu
-  call hello_world_cuda<<<1, 1>>>
+  call hello_world_cuda_threads<<<1, 1>>>
   istat = cudaDeviceSynchronize()
 end program testHelloWorld
diff --git a/exercises/ex2/Makefile b/exercises/ex2/Makefile
new file mode 100644
index 0000000..c6c1368
--- /dev/null
+++ b/exercises/ex2/Makefile
@@ -0,0 +1,17 @@
+F90=nvfortran
+F90FLAGS=-cuda
+
+.PHONY: clean distclean
+
+all: hello_world.x
+
+hello_world.x: hello_world.f90
+
+%.x: %.f90
+	$(F90) $(F90FLAGS) $< -o $@
+
+distclean: clean
+	rm *.x
+
+clean:
+	rm -f *.o *.mod
diff --git a/exercises/ex1/hello_world.f90 b/exercises/ex2/hello_world.f90
similarity index 67%
copy from exercises/ex1/hello_world.f90
copy to exercises/ex2/hello_world.f90
index 17cd66a..d9b56cc 100644
--- a/exercises/ex1/hello_world.f90
+++ b/exercises/ex2/hello_world.f90
@@ -1,30 +1,29 @@
 module helloWorld
   implicit none
 contains
   subroutine hello_world_cpu
     write(*,*) 'hello world from CPU code'
   end subroutine hello_world_cpu
 
-  attributes(global) subroutine hello_world_cuda
-    write(*,*) 'hello world from CUDA code'
-  end subroutine hello_world_cuda
 
   attributes(global) subroutine hello_world_cuda_threads
-    write(*,*) 'hello world from thread', threadIdx%x
+    integer :: tid
+    tid = ... ! compute the global thread index here
+    write(*,*) 'hello world from thread', tid
   end subroutine hello_world_cuda_threads
 end module helloWorld
 
 program testHelloWorld
   use cudafor
   use helloWorld
   implicit none
   integer :: istat
 
   call hello_world_cpu
-  call hello_world_cuda<<<1, 1>>>
+  call hello_world_cuda_threads<<<4, 4>>>
   istat = cudaDeviceSynchronize()
 end program testHelloWorld
diff --git a/exercises/ex2/script.sh b/exercises/ex2/script.sh
new file mode 100644
index 0000000..a42041e
--- /dev/null
+++ b/exercises/ex2/script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+srun -n 1 ./hello_world.x > output_hello_world
+
diff --git a/exercises/ex3/Makefile b/exercises/ex3/Makefile
new file mode 100644
index 0000000..b805b7f
--- /dev/null
+++ b/exercises/ex3/Makefile
@@ -0,0 +1,13 @@
+F90=nvfortran
+F90FLAGS=-Mcuda
+
+all: saxpy_cuda_256elements.x
+
+
+saxpy_cuda_256elements.x: saxpy_cuda_256elements.o
+	$(F90) $(F90FLAGS) saxpy_cuda_256elements.o -o $@
+saxpy_cuda_256elements.o: saxpy_cuda_256elements.F90
+	$(F90) $(F90FLAGS) -c saxpy_cuda_256elements.F90 -o $@
+
+clean:
+	rm *.x *.o
diff --git a/exercises/ex3/saxpy_cuda_256elements.F90 b/exercises/ex3/saxpy_cuda_256elements.F90
new file mode 100644
index 0000000..a894cd0
--- /dev/null
+++ b/exercises/ex3/saxpy_cuda_256elements.F90
@@ -0,0 +1,48 @@
+module mathOps
+contains
+  attributes(global) subroutine saxpy_cuda(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    !implement saxpy on GPU here
+  end subroutine saxpy_cuda
+
+  subroutine saxpy_cpu(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    do i=1, n
+      y(i) = y(i) + a*x(i)
+    end do
+  end subroutine saxpy_cpu
+
+end module mathOps
+
+program testSaxpy
+  use mathOps
+  use cudafor
+  implicit none
+  integer, parameter :: N = 256
+  integer istat, i
+  real :: x(N), y(N), a, norm2_cpu, norm2_gpu
+  real, device :: x_d(N), y_d(N)
+
+
+  do i=1, N
+    y(i) = 0.0
+    x(i) = i*0.458
+    a = 12.1
+  end do
+
+  x_d = x
+  y_d = y
+  !call the GPU implementation here
+  !y = y_d
+  istat = cudaDeviceSynchronize()
+  call saxpy_cpu(x, y, a)
+  write(*,*) 'L2 norm: ', norm2(y-y_d)
+end program testSaxpy
diff --git a/exercises/ex3/script.sh b/exercises/ex3/script.sh
new file mode 100644
index 0000000..a42041e
--- /dev/null
+++ b/exercises/ex3/script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+srun -n 1 ./saxpy_cuda_256elements.x > output_saxpy_cuda_256elements
+
diff --git a/exercises/ex4/Makefile b/exercises/ex4/Makefile
new file mode 100644
index 0000000..897d486
--- /dev/null
+++ b/exercises/ex4/Makefile
@@ -0,0 +1,13 @@
+F90=nvfortran
+F90FLAGS=-Mcuda
+
+all: saxpy_cuda.x
+
+
+saxpy_cuda.x: saxpy_cuda.o
+	$(F90) $(F90FLAGS) saxpy_cuda.o -o $@
+saxpy_cuda.o: saxpy_cuda.F90
+	$(F90) $(F90FLAGS) -c saxpy_cuda.F90 -o $@
+
+clean:
+	rm *.x *.o
diff --git a/exercises/ex4/saxpy_cuda.F90 b/exercises/ex4/saxpy_cuda.F90
new file mode 100644
index 0000000..523b490
--- /dev/null
+++ b/exercises/ex4/saxpy_cuda.F90
@@ -0,0 +1,48 @@
+module mathOps
+contains
+  attributes(global) subroutine saxpy_cuda(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    !implement saxpy on GPU here
+  end subroutine saxpy_cuda
+
+  subroutine saxpy_cpu(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    do i=1, n
+      y(i) = y(i) + a*x(i)
+    end do
+  end subroutine saxpy_cpu
+
+end module mathOps
+
+program testSaxpy
+  use mathOps
+  use cudafor
+  implicit none
+  integer, parameter :: N = 4000
+  integer istat, i
+  real :: x(N), y(N), a, norm2_cpu, norm2_gpu
+  real, device :: x_d(N), y_d(N)
+
+
+  do i=1, N
+    y(i) = 0.0
+    x(i) = i*0.458
+    a = 12.1
+  end do
+
+  x_d = x
+  y_d = y
+  !call the GPU implementation here
+  !y = y_d
+  istat = cudaDeviceSynchronize()
+  call saxpy_cpu(x, y, a)
+  write(*,*) 'L2 norm: ', norm2(y-y_d)
+end program testSaxpy
diff --git a/exercises/ex4/script.sh b/exercises/ex4/script.sh
new file mode 100644
index 0000000..4f9804a
--- /dev/null
+++ b/exercises/ex4/script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+srun -n 1 ./saxpy_cuda.x > output_saxpy_cuda
+
diff --git a/exercises/ex5/Makefile b/exercises/ex5/Makefile
new file mode 100644
index 0000000..897d486
--- /dev/null
+++ b/exercises/ex5/Makefile
@@ -0,0 +1,13 @@
+F90=nvfortran
+F90FLAGS=-Mcuda
+
+all: saxpy_cuda.x
+
+
+saxpy_cuda.x: saxpy_cuda.o
+	$(F90) $(F90FLAGS) saxpy_cuda.o -o $@
+saxpy_cuda.o: saxpy_cuda.F90
+	$(F90) $(F90FLAGS) -c saxpy_cuda.F90 -o $@
+
+clean:
+	rm *.x *.o
diff --git a/exercises/ex5/saxpy_cuda.F90 b/exercises/ex5/saxpy_cuda.F90
new file mode 100644
index 0000000..523b490
--- /dev/null
+++ b/exercises/ex5/saxpy_cuda.F90
@@ -0,0 +1,48 @@
+module mathOps
+contains
+  attributes(global) subroutine saxpy_cuda(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    !implement saxpy on GPU here
+  end subroutine saxpy_cuda
+
+  subroutine saxpy_cpu(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    do i=1, n
+      y(i) = y(i) + a*x(i)
+    end do
+  end subroutine saxpy_cpu
+
+end module mathOps
+
+program testSaxpy
+  use mathOps
+  use cudafor
+  implicit none
+  integer, parameter :: N = 4000
+  integer istat, i
+  real :: x(N), y(N), a, norm2_cpu, norm2_gpu
+  real, device :: x_d(N), y_d(N)
+
+
+  do i=1, N
+    y(i) = 0.0
+    x(i) = i*0.458
+    a = 12.1
+  end do
+
+  x_d = x
+  y_d = y
+  !call the GPU implementation here
+  !y = y_d
+  istat = cudaDeviceSynchronize()
+  call saxpy_cpu(x, y, a)
+  write(*,*) 'L2 norm: ', norm2(y-y_d)
+end program testSaxpy
diff --git a/exercises/ex5/script.sh b/exercises/ex5/script.sh
new file mode 100644
index 0000000..4f9804a
--- /dev/null
+++ b/exercises/ex5/script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+srun -n 1 ./saxpy_cuda.x > output_saxpy_cuda
+
diff --git a/notes b/notes
new file mode 100644
index 0000000..34aea3e
--- /dev/null
+++ b/notes
@@ -0,0 +1 @@
+-Load the compiler module with the command: module load nvhpc
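
Note on exercise 2: the kernel in exercises/ex2/hello_world.f90 is launched as <<<4, 4>>> (4 blocks of 4 threads), so threadIdx%x alone no longer identifies a thread uniquely and the line tid = ... is left for the student. For reference, a minimal sketch of one common way to form a 1-based global thread index in CUDA Fortran (this sketch is not part of the repository):

  attributes(global) subroutine hello_world_cuda_threads
    integer :: tid
    ! combine the thread index within the block with the block offset
    tid = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    write(*,*) 'hello world from thread', tid
  end subroutine hello_world_cuda_threads

With a <<<4, 4>>> launch this yields tid values 1 through 16, printed in no guaranteed order.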
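Note on exercises 3-5: the body of saxpy_cuda and its launch are left for the student (the placeholders !implement saxpy on GPU here and !call the GPU implementation here). A minimal sketch of the usual pattern, assuming a 1-D launch whose total thread count is at least N (the 256-threads-per-block figure below is only an illustrative choice):

  attributes(global) subroutine saxpy_cuda(x, y, a)
    implicit none
    real :: x(:), y(:)
    real, value :: a
    integer :: i, n
    n = size(x)
    ! one array element per thread, with a guard for threads beyond n
    i = threadIdx%x + (blockIdx%x - 1) * blockDim%x
    if (i <= n) y(i) = y(i) + a * x(i)
  end subroutine saxpy_cuda

called from the host with, for example,

  call saxpy_cuda<<<(N + 255) / 256, 256>>>(x_d, y_d, a)

After cudaDeviceSynchronize(), copy y_d back to a host array before comparing it with the CPU result.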