diff --git a/exercises/ex3/Makefile b/exercises/ex3/Makefile
new file mode 100644
index 0000000..b805b7f
--- /dev/null
+++ b/exercises/ex3/Makefile
@@ -0,0 +1,16 @@
+F90=nvfortran
+F90FLAGS=-Mcuda
+
+.PHONY: all clean
+
+all: saxpy_cuda_256elements.x
+
+saxpy_cuda_256elements.x: saxpy_cuda_256elements.o
+	$(F90) $(F90FLAGS) saxpy_cuda_256elements.o -o $@
+
+saxpy_cuda_256elements.o: saxpy_cuda_256elements.F90
+	$(F90) $(F90FLAGS) -c saxpy_cuda_256elements.F90 -o $@
+
+# -f keeps `make clean` from failing when there is nothing to remove.
+clean:
+	rm -f *.x *.o *.mod
diff --git a/exercises/ex3/saxpy_cuda_256elements.F90 b/exercises/ex3/saxpy_cuda_256elements.F90
new file mode 100644
index 0000000..a894cd0
--- /dev/null
+++ b/exercises/ex3/saxpy_cuda_256elements.F90
@@ -0,0 +1,60 @@
+!> Host and device SAXPY (y = y + a*x) routines for exercise 3.
+module mathOps
+  implicit none
+contains
+  !> CUDA kernel skeleton for SAXPY; the body is left to be written in the exercise.
+  !! x and y are the vectors, a is the scalar multiplier (passed by value).
+  attributes(global) subroutine saxpy_cuda(x, y, a)
+    implicit none
+    real, intent(in) :: x(:)
+    real, intent(inout) :: y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    !implement saxpy on GPU here
+  end subroutine saxpy_cuda
+
+  !> Host reference implementation: y(i) = y(i) + a*x(i) for every element of x.
+  subroutine saxpy_cpu(x, y, a)
+    implicit none
+    real, intent(in) :: x(:)
+    real, intent(inout) :: y(:)
+    real, value :: a
+    integer :: i, n
+    n = size(x)
+    do i = 1, n
+      y(i) = y(i) + a*x(i)
+    end do
+  end subroutine saxpy_cpu
+
+end module mathOps
+
+!> Driver: runs SAXPY on host and device, then prints the L2 norm of the difference.
+program testSaxpy
+  use mathOps
+  use cudafor
+  implicit none
+  integer, parameter :: N = 256
+  integer :: istat, i
+  real :: x(N), y(N), y_gpu(N), a
+  real, device :: x_d(N), y_d(N)
+
+  a = 12.1
+  y = 0.0
+  do i = 1, N
+    x(i) = i*0.458
+  end do
+
+  x_d = x
+  y_d = y
+  !call the GPU implementation here
+  istat = cudaDeviceSynchronize()
+  if (istat /= cudaSuccess) then
+    write(*,*) 'CUDA error: ', cudaGetErrorString(istat)
+    stop 1
+  end if
+  ! Copy the device result into a separate host array so y stays the CPU input.
+  y_gpu = y_d
+  call saxpy_cpu(x, y, a)
+  write(*,*) 'L2 norm: ', norm2(y - y_gpu)
+end program testSaxpy
diff --git a/exercises/ex3/script.sh b/exercises/ex3/script.sh
new file mode 100644
index 0000000..a42041e
--- /dev/null
+++ b/exercises/ex3/script.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+# Run the executable built by this exercise's Makefile.
+srun -n 1 ./saxpy_cuda_256elements.x > output_saxpy_cuda_256elements