diff --git a/examples/saxpy/Makefile b/examples/saxpy/Makefile new file mode 100644 index 0000000..4d4c92f --- /dev/null +++ b/examples/saxpy/Makefile @@ -0,0 +1,13 @@ +F90=nvfortran +F90FLAGS=-Mcuda + +all: saxpy.x + + +saxpy.x: saxpy.o + $(F90) $(F90FLAGS) saxpy.o -o $@ +saxpy.o: saxpy.F90 + $(F90) $(F90FLAGS) -c saxpy.F90 -o $@ + +clean: + rm *.x *.o diff --git a/examples/saxpy/saxpy.F90 b/examples/saxpy/saxpy.F90 new file mode 100644 index 0000000..41f2659 --- /dev/null +++ b/examples/saxpy/saxpy.F90 @@ -0,0 +1,48 @@ +module mathOps +contains + attributes(global) subroutine saxpy(x, y, a) + implicit none + real :: x(:), y(:) + real, value :: a + integer :: i, n + n = size(x) + i = blockDim%x * (blockIdx%x - 1) + threadIdx%x + if (i <= n) y(i) = y(i) + a*x(i) + end subroutine saxpy +end module mathOps + +program testSaxpy + use mathOps + use cudafor + implicit none + integer, parameter :: N = 20*1024*1024 + real :: x(N), y(N), a + real, device :: x_d(N), y_d(N) + type(dim3) :: grid, tBlock + type (cudaEvent) :: startEvent, stopEvent + real :: time + integer :: istat + + tBlock = dim3(512,1,1) + grid = dim3(ceiling(real(N)/tBlock%x),1,1) + + istat = cudaEventCreate(startEvent) + istat = cudaEventCreate(stopEvent) + + x = 1.0; y = 2.0; a = 2.0 + x_d = x + y_d = y + istat = cudaEventRecord(startEvent, 0) + call saxpy<<>>(x_d, y_d, a) + istat = cudaEventRecord(stopEvent, 0) + istat = cudaEventSynchronize(stopEvent) + istat = cudaEventElapsedTime(time, startEvent, stopEvent) + + y = y_d + write(*,*) 'Max error: ', maxval(abs(y-4.0)) + write(*,*) 'Effective Bandwidth (GB/s): ', N*4*3/time/10**6 + + istat = cudaEventDestroy(startEvent) + istat = cudaEventDestroy(stopEvent) + +end program testSaxpy diff --git a/examples/saxpy/script.sh b/examples/saxpy/script.sh new file mode 100644 index 0000000..b1ff517 --- /dev/null +++ b/examples/saxpy/script.sh @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-core=1 +#SBATCH --cpus-per-task=1 +#SBATCH --gres=gpu:1 +#SBATCH --reservation=spc-cuda-training-19.04 +#SBATCH --account=spc-cuda-training +#SBATCH --time=0:05:00 + +module load nvhpc + +srun -n 1 ./saxpy.x > output_saxpy diff --git a/examples/warmup/script.sh b/examples/warmup/script.sh new file mode 100644 index 0000000..006f956 --- /dev/null +++ b/examples/warmup/script.sh @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --ntasks-per-core=1 +#SBATCH --cpus-per-task=1 +#SBATCH --gres=gpu:1 +#SBATCH --reservation=spc-cuda-training-12.04 +#SBATCH --account=spc-cuda-training +#SBATCH --time=0:05:00 + +module load nvhpc + +nvidia-smi -a > output diff --git a/exercises/binding/Makefile b/exercises/binding/Makefile new file mode 100644 index 0000000..1111bb7 --- /dev/null +++ b/exercises/binding/Makefile @@ -0,0 +1,10 @@ +all: bind + +bind: c_func.o f_mod.o fmain.o + $(F90) $(F90FLAGS) $? -o $@ + +%.o: %.f90 + $(F90) $(F90FLAGS) $< -o $@ -c + +clean: + rm -rf *.o *.mod bind diff --git a/exercises/binding/c_func.c b/exercises/binding/c_func.c new file mode 100644 index 0000000..6b1f5e2 --- /dev/null +++ b/exercises/binding/c_func.c @@ -0,0 +1,11 @@ +/* -------------------------------------------------------------------------- */ +#include "c_func.h" +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ + +void c_function(matrix * in, double d) { + printf("d = %g\n", d); + + // print matrix to screan +} diff --git a/exercises/binding/c_func.h b/exercises/binding/c_func.h new file mode 100644 index 0000000..e4635d7 --- /dev/null +++ b/exercises/binding/c_func.h @@ -0,0 +1,11 @@ +#ifndef __C_FUNC_H_ +#define __C_FUNC_H_ + +typedef struct { + int m, n; + double * data; +} matrix; + +void c_function(matrix * in, double d); + +#endif // __C_FUNC_H_ diff --git a/exercises/binding/f_mod.f90 b/exercises/binding/f_mod.f90 new file mode 100644 index 0000000..afaf1b2 --- /dev/null +++ b/exercises/binding/f_mod.f90 @@ -0,0 +1,31 @@ +module f_mod + use, intrinsic :: iso_c_binding, only : c_int, c_double, c_ptr, c_loc + implicit none + +contains + subroutine f_func(data) + type, bind(C) :: c_matrix + integer(c_int) :: m + integer(c_int) :: n + type(c_ptr) :: data + end type c_matrix + + interface + subroutine c_function(mat, d) BIND(C, name='c_function') + import c_matrix, c_double + type(c_matrix) :: mat + real(c_double), value :: d + end subroutine c_function + end interface + + type(c_matrix) :: mat + double precision, dimension(:, :), allocatable, intent(in) :: data + real(c_double) :: d = 1.45 + + mat%m = size(data, 1) + mat%n = size(data, 2) + mat%data = c_loc(data(lbound(data,1), lbound(data, 2))) + + call c_function(mat, d) + end subroutine f_func +end module f_mod diff --git a/exercises/binding/fmain.f90 b/exercises/binding/fmain.f90 new file mode 100644 index 0000000..d2c6668 --- /dev/null +++ b/exercises/binding/fmain.f90 @@ -0,0 +1,19 @@ +program test_binding + use f_mod + + implicit none + + double precision, dimension(:, :), allocatable :: data + integer :: i, j + integer :: m = 20, n = 10 + + allocate (data(m, n)) + + do i = 1, m + do j = 1, n + ! initialize data here + end do + end do + + call f_func(data) +end program test_binding diff --git a/exercises/binding/solution/Makefile b/exercises/binding/solution/Makefile new file mode 100644 index 0000000..7da3388 --- /dev/null +++ b/exercises/binding/solution/Makefile @@ -0,0 +1,16 @@ +CXXFLAGS=-std=c++11 -g -O3 +CFLAGS=-g -O3 +LDFLAGS=-g -O3 +F90FLAGS=-g -O3 + + +all: clean bind + +bind: c_func.o f_mod.o fmain.o + $(F90) $(F90FLAGS) $? -o $@ + +%.o: %.f90 + $(F90) $(F90FLAGS) $< -o $@ -c + +clean: + rm -rf *.o *.mod bind diff --git a/exercises/binding/solution/c_func.c b/exercises/binding/solution/c_func.c new file mode 100644 index 0000000..1497ab0 --- /dev/null +++ b/exercises/binding/solution/c_func.c @@ -0,0 +1,15 @@ +/* -------------------------------------------------------------------------- */ +#include "c_func.h" +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ + +void c_function(matrix * in, double d) { + printf("d = %g\n", d); + for(int i = 0; i < in->m; ++i) { + for(int j = 0; j < in->n; ++j) { + printf("%3g ", in->data[j * in->m + i]); + } + printf("\n"); + } +} diff --git a/exercises/binding/solution/c_func.h b/exercises/binding/solution/c_func.h new file mode 100644 index 0000000..e4635d7 --- /dev/null +++ b/exercises/binding/solution/c_func.h @@ -0,0 +1,11 @@ +#ifndef __C_FUNC_H_ +#define __C_FUNC_H_ + +typedef struct { + int m, n; + double * data; +} matrix; + +void c_function(matrix * in, double d); + +#endif // __C_FUNC_H_ diff --git a/exercises/binding/solution/f_mod.f90 b/exercises/binding/solution/f_mod.f90 new file mode 100644 index 0000000..afaf1b2 --- /dev/null +++ b/exercises/binding/solution/f_mod.f90 @@ -0,0 +1,31 @@ +module f_mod + use, intrinsic :: iso_c_binding, only : c_int, c_double, c_ptr, c_loc + implicit none + +contains + subroutine f_func(data) + type, bind(C) :: c_matrix + integer(c_int) :: m + integer(c_int) :: n + type(c_ptr) :: data + end type c_matrix + + interface + subroutine c_function(mat, d) BIND(C, name='c_function') + import c_matrix, c_double + type(c_matrix) :: mat + real(c_double), value :: d + end subroutine c_function + end interface + + type(c_matrix) :: mat + double precision, dimension(:, :), allocatable, intent(in) :: data + real(c_double) :: d = 1.45 + + mat%m = size(data, 1) + mat%n = size(data, 2) + mat%data = c_loc(data(lbound(data,1), lbound(data, 2))) + + call c_function(mat, d) + end subroutine f_func +end module f_mod diff --git a/exercises/binding/solution/fmain.f90 b/exercises/binding/solution/fmain.f90 new file mode 100644 index 0000000..533aeaa --- /dev/null +++ b/exercises/binding/solution/fmain.f90 @@ -0,0 +1,19 @@ +program test_binding + use f_mod + + implicit none + + double precision, dimension(:, :), allocatable :: data + integer :: i, j + integer :: m = 20, n = 10 + + allocate (data(m, n)) + + do i = 1, m + do j = 1, n + data(i, j) = n * (i-1) + (j-1) + end do + end do + + call f_func(data) +end program test_binding diff --git a/exercises/binding/solution_cxx/Makefile b/exercises/binding/solution_cxx/Makefile new file mode 100644 index 0000000..81e90f8 --- /dev/null +++ b/exercises/binding/solution_cxx/Makefile @@ -0,0 +1,14 @@ +CXXFLAGS=-std=c++11 -g -O3 +LDFLAGS=-g -O3 +F90FLAGS=-g -O3 + +all: bind + +bind: c_func.o f_mod.o fmain.o + $(F90) $(F90FLAGS) $? -o $@ + +%.o: %.f90 + $(F90) $(F90FLAGS) $< -o $@ -c + +clean: + rm -rf *.o *.mod bind diff --git a/exercises/binding/solution_cxx/c_func.cc b/exercises/binding/solution_cxx/c_func.cc new file mode 100644 index 0000000..86affd7 --- /dev/null +++ b/exercises/binding/solution_cxx/c_func.cc @@ -0,0 +1,16 @@ +/* -------------------------------------------------------------------------- */ +#include "c_func.h" +/* -------------------------------------------------------------------------- */ +#include +/* -------------------------------------------------------------------------- */ + +void c_function(matrix * in, double d) { + printf("d = %g\n", d); + auto & mat = *in; + for(int i = 0; i < mat.m; ++i) { + for(int j = 0; j < mat.n; ++j) { + printf("%g ", mat(i, j)); + } + printf("\n"); + } +} diff --git a/exercises/binding/solution_cxx/c_func.h b/exercises/binding/solution_cxx/c_func.h new file mode 100644 index 0000000..5d76654 --- /dev/null +++ b/exercises/binding/solution_cxx/c_func.h @@ -0,0 +1,18 @@ +#ifndef __C_FUNC_H_ +#define __C_FUNC_H_ + +extern "C" { + +struct matrix { + int m, n; + double * data; + inline double & operator()(int i, int j) { + return data[j*m + i]; + } +}; + +void c_function(matrix * in, double d); + +} + +#endif // __C_FUNC_H_ diff --git a/exercises/binding/solution_cxx/f_mod.f90 b/exercises/binding/solution_cxx/f_mod.f90 new file mode 100644 index 0000000..afaf1b2 --- /dev/null +++ b/exercises/binding/solution_cxx/f_mod.f90 @@ -0,0 +1,31 @@ +module f_mod + use, intrinsic :: iso_c_binding, only : c_int, c_double, c_ptr, c_loc + implicit none + +contains + subroutine f_func(data) + type, bind(C) :: c_matrix + integer(c_int) :: m + integer(c_int) :: n + type(c_ptr) :: data + end type c_matrix + + interface + subroutine c_function(mat, d) BIND(C, name='c_function') + import c_matrix, c_double + type(c_matrix) :: mat + real(c_double), value :: d + end subroutine c_function + end interface + + type(c_matrix) :: mat + double precision, dimension(:, :), allocatable, intent(in) :: data + real(c_double) :: d = 1.45 + + mat%m = size(data, 1) + mat%n = size(data, 2) + mat%data = c_loc(data(lbound(data,1), lbound(data, 2))) + + call c_function(mat, d) + end subroutine f_func +end module f_mod diff --git a/exercises/binding/solution_cxx/fmain.f90 b/exercises/binding/solution_cxx/fmain.f90 new file mode 100644 index 0000000..533aeaa --- /dev/null +++ b/exercises/binding/solution_cxx/fmain.f90 @@ -0,0 +1,19 @@ +program test_binding + use f_mod + + implicit none + + double precision, dimension(:, :), allocatable :: data + integer :: i, j + integer :: m = 20, n = 10 + + allocate (data(m, n)) + + do i = 1, m + do j = 1, n + data(i, j) = n * (i-1) + (j-1) + end do + end do + + call f_func(data) +end program test_binding diff --git a/exercises/gradients/Makefile b/exercises/gradients/Makefile new file mode 100644 index 0000000..fa6f63c --- /dev/null +++ b/exercises/gradients/Makefile @@ -0,0 +1,28 @@ +F90=gfortran +NVCC=nvcc +LDFLAGS=-lc -lstdc++ -lcuda -lcudart -lcudadevrt +F90FLAGS= -ffree-line-length-0 -O3 -g + + +all: test_gradients.x + +test_gradients.x: cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o gradients_mod.o test_gradients.o + $(F90) $(LDFLAGS) cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o \ + gradients_mod.o test_gradients.o -o $@ +test_gradients.o: test_gradients.F90 + $(F90) $(F90FLAGS) -c test_gradients.F90 -o $@ +cuda_interface_mod.o: cuda_interface_mod.F90 + $(F90) $(F90FLAGS) -c cuda_interface_mod.F90 -o $@ +space_grid_mod.o: space_grid_mod.F90 + $(F90) $(F90FLAGS) -c space_grid_mod.F90 -o $@ +prec_const_mod.o: prec_const_mod.F90 + $(F90) $(F90FLAGS) -c prec_const_mod.F90 -o $@ +gradients_mod.o: gradients_mod.F90 + $(F90) $(F90FLAGS) -c gradients_mod.F90 -o $@ +grad_test.o: grad_test.cu + $(NVCC) $(NVCCFLAGS) -c grad_test.cu -o $@ +cudalloc.o: cudalloc.cu + $(NVCC) $(NVCCFLAGS) -c cudalloc.cu -o $@ + +clean: + rm *.x *.o diff --git a/exercises/gradients/cuda_interface_mod.F90 b/exercises/gradients/cuda_interface_mod.F90 new file mode 100644 index 0000000..f0decef --- /dev/null +++ b/exercises/gradients/cuda_interface_mod.F90 @@ -0,0 +1,128 @@ + +MODULE cuda_interface +use prec_const +INTERFACE + FUNCTION allocate_cuda_memory(n) BIND(C, NAME='allocate_cuda_memory') + use iso_c_binding + TYPE(C_PTR) :: allocate_cuda_memory + integer :: n + END FUNCTION allocate_cuda_memory + + +END INTERFACE + INTERFACE gbs_allocate_cuda + MODULE PROCEDURE gbs_allocate_cuda_dp3, gbs_allocate_cuda_dp2, gbs_allocate_cuda_dp1 + END INTERFACE gbs_allocate_cuda + real(dp), pointer, save :: strmf_cpu(:,:,:),strmf_gpu(:,:,:), indata(:,:,:) + real(dp), pointer, save :: strmf1_cpu(:,:,:), strmf1_gpu(:,:,:) + real(dp), pointer, save :: grady_f(:,:,:), grady_diff(:,:,:), diff(:,:,:), f_x(:,:,:) + real(dp), pointer, save :: f_z(:,:,:), f_y(:,:,:), f_n(:,:,:), f_x_n(:,:,:) + real(dp), pointer, save :: f_v(:,:,:), f_x_v(:,:,:), f_xx_n(:,:,:),f_xx_v(:,:,:) + real(dp), pointer, save :: f_zz(:,:,:), f_yy(:,:,:), f_xx(:,:,:) + real(dp), pointer, save :: f_zy(:,:,:), f_yz(:,:,:), f_zx(:,:,:) + real(dp), pointer, save :: f_xz(:,:,:), f_xy(:,:,:), f_yx(:,:,:) + + real(dp), pointer, save :: gradpar_x_n_gpu(:,:), gradpar_x_v_gpu(:,:), gradpar_y_n_gpu(:,:), gradpar_y_v_gpu(:,:) + real(dp), pointer, save :: grad2par_yy_n_gpu(:,:), grad2par_xx_n_gpu(:,:), grad2par_x_n_gpu(:,:), grad2par_y_n_gpu(:,:) + real(dp), pointer, save :: grad2par_zy_n_gpu(:,:), grad2par_zx_n_gpu(:,:), grad2par_xy_n_gpu(:,:) + real(dp), pointer, save :: grad2par_yy_v_gpu(:,:), grad2par_xx_v_gpu(:,:), grad2par_x_v_gpu(:,:), grad2par_y_v_gpu(:,:) + real(dp), pointer, save :: grad2par_zy_v_gpu(:,:), grad2par_zx_v_gpu(:,:), grad2par_xy_v_gpu(:,:) + real(dp), pointer, save :: cur_x_n_gpu(:), cur_x_v_gpu(:), ccur_xx_n_gpu(:), ccur_xx_v_gpu(:) + real(dp), pointer, save :: cur_y_n_gpu(:,:), cur_y_v_gpu(:,:), ccur_yy_n_gpu(:,:), ccur_xy_n_gpu(:,:) + real(dp), pointer, save :: ccur_yy_v_gpu(:,:), ccur_xy_v_gpu(:,:) + real(dp), pointer, dimension(:,:,:), save :: A_x_n_gpu, A_x_v_gpu, A_y_v_gpu, B_x_v_gpu, B_y_v_gpu + real(dp), pointer, dimension(:,:,:), save :: B_x_n_gpu, B_y_n_gpu, A_y_n_gpu + real(dp), pointer, dimension(:,:), save :: gradcur_xx_v_gpu, gradcur_yy_v_gpu, gradcur_xy_v_gpu, gradcur_yx_v_gpu + real(dp), pointer, dimension(:,:), save :: gradcur_zx_v_gpu, gradcur_zy_v_gpu, gradcur_x_v_gpu, gradcur_y_v_gpu + real(dp), pointer, dimension(:,:), save :: curgrad_xx_n_gpu, curgrad_yy_n_gpu, curgrad_xy_n_gpu, curgrad_yx_n_gpu + real(dp), pointer, dimension(:,:), save :: curgrad_zx_n_gpu, curgrad_zy_n_gpu, curgrad_x_n_gpu, curgrad_y_n_gpu + + !real(dp), pointer, save :: gradpar_x_n_gpu(:,:,:), gradpar_x_v_gpu(:,:,:), gradpar_y_n_gpu(:,:,:), gradpar_y_v_gpu(:,:,:) + integer :: nx_cuda, ny_cuda, nz_cuda +contains + + SUBROUTINE gbs_allocate_cuda_dp3(a,is1,ie1,is2,ie2,is3,ie3) + use iso_c_binding + real(dp), DIMENSION(:,:,:), pointer, INTENT(INOUT) :: a + INTEGER, INTENT(IN) :: is1,ie1,is2,ie2,is3,ie3 + integer ndata + ndata=(ie3-is3+1)*(ie2-is2+1)*(ie1-is1+1) + call c_f_pointer(allocate_cuda_memory(ndata), a, \ + [(ie1-is1+1),(ie2-is2+1),(ie3-is3+1)]) + a(is1:,is2:,is3:) => a + a=0.0_dp + END SUBROUTINE gbs_allocate_cuda_dp3 + + SUBROUTINE gbs_allocate_cuda_dp2(a,is1,ie1,is2,ie2) + use iso_c_binding + real(dp), DIMENSION(:,:), pointer, INTENT(INOUT) :: a + INTEGER, INTENT(IN) :: is1,ie1,is2,ie2 + integer ndata + ndata=(ie2-is2+1)*(ie1-is1+1) + call c_f_pointer(allocate_cuda_memory(ndata), a, \ + [(ie1-is1+1),(ie2-is2+1)]) + a(is1:,is2:) => a + a=0.0_dp + END SUBROUTINE gbs_allocate_cuda_dp2 + + SUBROUTINE gbs_allocate_cuda_dp1(a,is1,ie1) + use iso_c_binding + real(dp), DIMENSION(:), pointer, INTENT(INOUT) :: a + INTEGER, INTENT(IN) :: is1,ie1 + integer ndata + ndata=(ie1-is1+1) + call c_f_pointer(allocate_cuda_memory(ndata), a, \ + [(ie1-is1+1)]) + a(is1:) => a + a=0.0_dp + END SUBROUTINE gbs_allocate_cuda_dp1 + + + subroutine gradz_n2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call gradz_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi) + end subroutine gradz_n2n_fd4cuda + + subroutine gradz_v2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call gradz_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi) + end subroutine gradz_v2n_fd4cuda + + subroutine grady_v2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call grady_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltayi) + end subroutine grady_v2n_fd4cuda + + subroutine interp_v2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call interp_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda) + end subroutine interp_v2n_fd4cuda + + subroutine gradx_n2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call gradx_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltaxi) + end subroutine gradx_n2n_fd4cuda + + + subroutine gradpar_v2n_fd4cuda(f_in, f_out, gradpar_z) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), INTENT(in) :: gradpar_z + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + + CALL gradpar_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, f_z, f_y, f_n, \ + f_x_n, deltazi, deltayi, deltaxi, gradpar_z, gradpar_x_n_gpu, gradpar_y_n_gpu) + end subroutine gradpar_v2n_fd4cuda + + +END MODULE cuda_interface diff --git a/exercises/gradients/cudalloc.cu b/exercises/gradients/cudalloc.cu new file mode 100644 index 0000000..1186170 --- /dev/null +++ b/exercises/gradients/cudalloc.cu @@ -0,0 +1,76 @@ +/*#include +//#include +#include +#include +struct cuda_struct{ + double *coef_der_cuda; +}; + +*/ +#include "cudalloc.h" +/*__global__ void set_coefficients(cuda_struct *structure){ + structure->coef_der_cuda[0] = 1.0/12.0; + structure->coef_der_cuda[1] = -2.0/3.0; + structure->coef_der_cuda[2] = 2.0/3.0; + structure->coef_der_cuda[3] = -1.0/12.0; +*/ +__global__ void set_coefficients(double *coef, double *coefstag, double *coef_int){ +coef[0] = 1.0/12.0; +coef[1] = -2.0/3.0; +coef[2] = 2.0/3.0; +coef[3] = -1.0/12.0; +coef_int[0] = -1.0/16.0; +coef_int[1] = 9.0/16.0; +coef_int[2] = 9.0/16.0; +coef_int[3] = -1.0/16.0; +coefstag[0] = 1.0/24.0; +coefstag[1] = -9.0/8.0; +coefstag[2] = 9.0/8.0; +coefstag[3] = -1.0/24.0; +} +void set_extent(int *out, int in){ +*out = in; +} +extern "C" void allocate_cuda_struct(cuda_struct *& structure){ + structure = new cuda_struct(); + return; +} + +//__global__ void print_coefficients(cuda_struct *structure){ +__global__ void print_coefficients(double *coef){ +} +extern "C" void fill_cuda_struct(cuda_struct *structure, int *izeg, int *izsg, int *iyeg, int *iysg, int *ixeg, int *ixsg){ +//extern "C" void fill_cuda_struct(cuda_struct *structure){ + + cudaMallocManaged( (void **)&(structure->coef_der_cuda), sizeof(double) * 4 ); + cudaMallocManaged( (void **)&(structure->coef_der1_stag), sizeof(double) * 4 ); + cudaMallocManaged( (void **)&(structure->coef_int_cuda), sizeof(double) * 4 ); + set_coefficients<<<1, 1>>>(structure->coef_der_cuda, structure->coef_der1_stag, structure->coef_int_cuda); + //set_coefficients<<<1, 1>>>(structure->coef_der_cuda); + //set_extent((structure->nz), (*izeg-*izsg+1)); + //set_extent(&(structure->ny), (*izeg-*izsg+1)); + //set_extent(&(structure->nx), (*izeg-*izsg+1)); + //set_extent(&(structure->nx), (*iyeg-*iysg+1)); + //&(structure->nz) = &k;//(*izeg-*izsg+1); + //&(structure->ny) = &k;//(*ixeg-*ixsg+1); + //&(structure->nx) = &k;//(*iyeg-*iysg+1); + //printf("AAAAAAA %d %d %d\n", structure->nz, structure->ny, structure->nx); + //set_extent<<<1, 1>>>(structure->nz, (*izeg-*izsg+1)); + //set_extent<<<1, 1>>>(structure->ny, (*ixeg-*ixsg+1)); + //set_extent<<<1, 1>>>(structure->nx, (*iyeg-*iysg+1)); + cudaDeviceSynchronize(); + + printf("cudalloc coef_int %f %f %f %f \n", structure->coef_int_cuda[0], structure->coef_int_cuda[1], structure->coef_int_cuda[2], structure->coef_int_cuda[3]); + return; +} + +extern "C" void *allocate_cuda_memory(int *n){ + double *a; + int N=*n; + cudaMallocManaged( (void **)&a, sizeof(double) * N ); + cudaDeviceSynchronize(); + return (void *) a; +} + + + diff --git a/exercises/gradients/cudalloc.h b/exercises/gradients/cudalloc.h new file mode 100644 index 0000000..0d950a9 --- /dev/null +++ b/exercises/gradients/cudalloc.h @@ -0,0 +1,11 @@ +#include +//#include +#include +#include + +struct cuda_struct{ + double *coef_der_cuda; + double *coef_int_cuda; + double *coef_der1_stag; + int *nz, *ny, *nx; +}; diff --git a/exercises/gradients/grad_test.cu b/exercises/gradients/grad_test.cu new file mode 100644 index 0000000..3b9eda0 --- /dev/null +++ b/exercises/gradients/grad_test.cu @@ -0,0 +1,256 @@ +#include +//#include +//#include "unistd.h" + + +__constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 }; +__constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 }; +__constant__ __device__ double coef_der2[5] = {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0}; +__constant__ __device__ double coef_der1_n2n[4] = { 1.0/12.0, -2.0/3.0, 2.0/3.0, -1.0/12.0}; +__constant__ __device__ double coef_der2_stag[4] = { 1.0/2., -1.0/2.0, -1.0/2., 1.0/2.0}; + +__device__ int xyindex(int i, int nz ) +{ + return i%nz; +} + + +__device__ int disp(int i, int j, int k, int nx, int ny) +{ + return i + j*nx + k*nx*ny; +} + +__device__ void gradz_n2n_fd4Device(double *f, double *fz, int nz, int ny, int nx, double fact) +{ + int index = blockDim.x*blockIdx.x+threadIdx.x; + int d1=index+disp(0,0,-2,nx,ny); + int d2=index+disp(0,0,-1,nx,ny); + int d3=index+disp(0,0,1,nx,ny); + int d4=index+disp(0,0,2,nx,ny); + if (d1>=0 && d4<(nx*ny*nz)){ + fz[index] = coef_der1_n2n[0]*f[d1] + coef_der1_n2n[1]*f[d2] + + coef_der1_n2n[2]*f[d3] + coef_der1_n2n[3]*f[d4]; + fz[index] *= fact; + } + +} + +__global__ void gradz_n2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){ + gradz_n2n_fd4Device(f, fz, nz, ny, nx, fact); +} + +extern "C" void gradz_n2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + gradz_n2n_fd4<<>>(f, f_z, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + + +__device__ void gradz_v2n_fd4Device(double *f, double *f_z_v2n, int nz, int ny, int nx, double fact) +{ + int index=blockDim.x*blockIdx.x+threadIdx.x; + int d11=index+disp(-1,0,-1,nx,ny); + int d12=index+disp(-1,0,0,nx,ny); + int d13=index+disp(-1,0,1,nx,ny); + int d14=index+disp(-1,0,2,nx,ny); + int d21=index+disp(0,0,-1,nx,ny); + int d22=index+disp(0,0,0,nx,ny); + int d23=index+disp(0,0,1,nx,ny); + int d24=index+disp(0,0,2,nx,ny); + int d31=index+disp(1,0,-1,nx,ny); + int d32=index+disp(1,0,0,nx,ny); + int d33=index+disp(1,0,1,nx,ny); + int d34=index+disp(1,0,2,nx,ny); + int d41=index+disp(2,0,-1,nx,ny); + int d42=index+disp(2,0,0,nx,ny); + int d43=index+disp(2,0,1,nx,ny); + int d44=index+disp(2,0,2,nx,ny); + + if (d11>=0 && d44<(nx*ny*nz)){ + f_z_v2n[index] = coef_int[0] * (coef_der1_stag[0]*f[d11] + coef_der1_stag[1]*f[d12] + + coef_der1_stag[2]*f[d13] + coef_der1_stag[3]*f[d14]) + + coef_int[1] * (coef_der1_stag[0]*f[d21] + coef_der1_stag[1]*f[d22] + + coef_der1_stag[2]*f[d23] + coef_der1_stag[3]*f[d24])+ + coef_int[2] * (coef_der1_stag[0]*f[d31] + coef_der1_stag[1]*f[d32] + + coef_der1_stag[2]*f[d33] + coef_der1_stag[3]*f[d34]) + + coef_int[3] * (coef_der1_stag[0]*f[d41] + coef_der1_stag[1]*f[d42] + + coef_der1_stag[2]*f[d43] + coef_der1_stag[3]*f[d44]); + f_z_v2n[index] *= fact; + } + +} + +__global__ void gradz_v2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){ + gradz_v2n_fd4Device(f, fz, nz, ny, nx, fact); +} + +extern "C" void gradz_v2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + gradz_v2n_fd4<<>>(f, f_z, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + +__device__ void grady_v2n_fd4Device(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact) +{ + int index = blockDim.x*blockIdx.x+threadIdx.x; + int d11=index+disp(-1,0,-1,nx,ny); + int d12=index+disp(0,0,-1,nx,ny); + int d13=index+disp(1,0,-1,nx,ny); + int d14=index+disp(2,0,-1,nx,ny); + int d21=index+disp(-1,0,0,nx,ny); + int d22=index+disp(0,0,0,nx,ny); + int d23=index+disp(1,0,0,nx,ny); + int d24=index+disp(2,0,0,nx,ny); + int d31=index+disp(-1,0,1,nx,ny); + int d32=index+disp(0,0,1,nx,ny); + int d33=index+disp(1,0,1,nx,ny); + int d34=index+disp(2,0,1,nx,ny); + int d41=index+disp(-1,0,2,nx,ny); + int d42=index+disp(0,0,2,nx,ny); + int d43=index+disp(1,0,2,nx,ny); + int d44=index+disp(2,0,2,nx,ny); + + if(d11>=0 && d44<(nx*ny*nz)){ + f_y_v2n[index] = coef_int[0] * (coef_der1_stag[0]*f[d11] + coef_der1_stag[1]*f[d12] + + coef_der1_stag[2]*f[d13] + coef_der1_stag[3]*f[d14]) + + coef_int[1] * (coef_der1_stag[0]*f[d21] + coef_der1_stag[1]*f[d22] + + coef_der1_stag[2]*f[d23] + coef_der1_stag[3]*f[d24])+ + coef_int[2] * (coef_der1_stag[0]*f[d31] + coef_der1_stag[1]*f[d32] + + coef_der1_stag[2]*f[d33] + coef_der1_stag[3]*f[d34]) + + coef_int[3] * (coef_der1_stag[0]*f[d41] + coef_der1_stag[1]*f[d42] + + coef_der1_stag[2]*f[d43] + coef_der1_stag[3]*f[d44]); + f_y_v2n[index] *= fact; + } + +} + +__global__ void grady_v2n_fd4(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact){ + grady_v2n_fd4Device(f, f_y_v2n, nz, ny, nx, fact); +} + +extern "C" void grady_v2n_fd4_cuda_( double *f, double *f_y_v2n, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + grady_v2n_fd4<<>>(f, f_y_v2n, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + +__device__ void interp_v2n_fd4Device(double *f, double *f_int_v2n, int nz, int ny, int nx) +{ + int index=blockDim.x*blockIdx.x+threadIdx.x; + int d11=index+disp(-1,0,-1,nx,ny); + int d12=index+disp(-1,0,0,nx,ny); + int d13=index+disp(-1,0,1,nx,ny); + int d14=index+disp(-1,0,2,nx,ny); + int d21=index+disp(0,0,-1,nx,ny); + int d22=index+disp(0,0,0,nx,ny); + int d23=index+disp(0,0,1,nx,ny); + int d24=index+disp(0,0,2,nx,ny); + int d31=index+disp(1,0,-1,nx,ny); + int d32=index+disp(1,0,0,nx,ny); + int d33=index+disp(1,0,1,nx,ny); + int d34=index+disp(1,0,2,nx,ny); + int d41=index+disp(2,0,-1,nx,ny); + int d42=index+disp(2,0,0,nx,ny); + int d43=index+disp(2,0,1,nx,ny); + int d44=index+disp(2,0,2,nx,ny); + + if (d11>=0 && d44<(nx*ny*nz)){ + f_int_v2n[index] = coef_int[0] * (coef_int[0]*f[d11] + coef_int[1]*f[d12] + + coef_int[2]*f[d13] + coef_int[3]*f[d14]) + + coef_int[1] * (coef_int[0]*f[d21] + coef_int[1]*f[d22] + + coef_int[2]*f[d23] + coef_int[3]*f[d24])+ + coef_int[2] * (coef_int[0]*f[d31] + coef_int[1]*f[d32] + + coef_int[2]*f[d33] + coef_int[3]*f[d34]) + + coef_int[3] * (coef_int[0]*f[d41] + coef_int[1]*f[d42] + + coef_int[2]*f[d43] + coef_int[3]*f[d44]); + } + +} + +__global__ void interp_v2n_fd4(double *f, double *f_int_v2n, int nz, int ny, int nx){ + interp_v2n_fd4Device(f, f_int_v2n, nz, ny, nx); +} + +extern "C" void interp_v2n_fd4_cuda_( double *f, double *f_int_v2n, int *nz, int *ny, int *nx){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + interp_v2n_fd4<<>>(f, f_int_v2n, *nz, *ny, *nx); + cudaDeviceSynchronize(); +} + +__device__ void gradx_n2n_fd4Device(double *f, double *fx, int nz, int ny, int nx, double fact) +{ + int d1=disp(0,1,0,nx,ny); + int d2=disp(0,2,0,nx,ny); + int index=blockDim.x*blockIdx.x+threadIdx.x; + if (index<(nx*ny*nz-d2) && index>=d2){ + fx[index] = coef_der1_n2n[0]*f[index-d2] + coef_der1_n2n[1]*f[index-d1] + coef_der1_n2n[2]*f[index+d1] + coef_der1_n2n[3]*f[index+d2]; + fx[index] *= fact; + } + +} + +__global__ void gradx_n2n_fd4(double *f, double *f_x_n2n, int nz, int ny, int nx, double fact){ + gradx_n2n_fd4Device(f, f_x_n2n, nz, ny, nx, fact); +} + +extern "C" void gradx_n2n_fd4_cuda_( double *f, double *f_x_n2n, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + gradx_n2n_fd4<<>>(f, f_x_n2n, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + +__global__ void gradpar_v2n_fd4(double *f, double *f_grad, int nz, int ny, int nx, double *f_z, double *f_y, double *f_n, double *f_x_n, double deltazi, double deltayi, double deltaxi, double gradpar_z, double *gradpar_x_n, double *gradpar_y_n) +{ + int index=blockDim.x*blockIdx.x+threadIdx.x; + int ind=xyindex(index, nx*ny); + + gradz_v2n_fd4Device(f, f_z, nz, ny, nx, deltazi); + grady_v2n_fd4Device(f, f_y, nz, ny, nx, deltayi); + interp_v2n_fd4Device(f, f_n, nz, ny, nx); + __syncthreads(); + if (index<(nx)*(ny)*(nz)){ + f_grad[index] = gradpar_z*f_z[index] + gradpar_y_n[ind]*f_y[index] ; + } + + gradx_n2n_fd4Device(f_n, f_x_n, nz, ny, nx, deltaxi); + __syncthreads(); + + if (index<(nx)*(ny)*(nz)){ + f_grad[index] += gradpar_x_n[ind]*f_x_n[index]; + } + +} + +__global__ void gradpar1_v2n_fd4(double *f, double *f_grad, int nz, int ny, int nx, double *f_z, double *f_y, double *f_n, double *f_x_n, double deltazi, double deltayi, double deltaxi, double gradpar_z, double *gradpar_x_n, double *gradpar_y_n) +{ + int index=blockDim.x*blockIdx.x+threadIdx.x; + int ind=xyindex(index, nx*ny); + gradx_n2n_fd4Device(f_n, f_x_n, nz, ny, nx, deltaxi); + __syncthreads(); + + if (index<(nx)*(ny)*(nz)){ + f_grad[index] += gradpar_x_n[ind]*f_x_n[index]; + } + + + +} + + +extern "C" void gradpar_v2n_fd4_cuda_(double *f, double *f_grad, int *nz, int *ny, int *nx, double *f_z, double *f_y, double *f_n, double *f_x_n, double *deltazi, double *deltayi, double *deltaxi, double *gradpar_z, double *gradpar_x_n, double *gradpar_y_n){ + int n = (*nx)*(*ny)*(*nz); + dim3 threadsPerBlock = 1024; + dim3 numBlocks = ((n + threadsPerBlock.x -1) / threadsPerBlock.x); + gradpar_v2n_fd4<<>>(f, f_grad, *nz, *ny, *nx, f_z, f_y, f_n, f_x_n, *deltazi, *deltayi, *deltaxi, *gradpar_z, gradpar_x_n, gradpar_y_n); + cudaDeviceSynchronize(); +} diff --git a/exercises/gradients/gradients_mod.F90 b/exercises/gradients/gradients_mod.F90 new file mode 100644 index 0000000..3f34974 --- /dev/null +++ b/exercises/gradients/gradients_mod.F90 @@ -0,0 +1,219 @@ +module gradients +use space_grid +use prec_const +real(dp), DIMENSION(1:4), public, PROTECTED :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag +real(dp), DIMENSION(1:5), public, PROTECTED :: coef_der2 +real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n +real(dp), PUBLIC :: gradpar_z +contains + +subroutine intialize_coefficients + coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /) + coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /) + coef_der2 = (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /) + coef_der1_n2n = (/ 1.0_dp/12.0_dp, -2.0_dp/3.0_dp, 2.0_dp/3.0_dp, -1.0_dp/12.0_dp /) + coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /) + +end subroutine intialize_coefficients + + +subroutine gradz_n2n_fd4(f , f_z) + + use prec_const + + IMPLICIT none + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z + integer :: iz + real(dp), DIMENSION(1:4) :: coef_der + + coef_der(:) = deltazi*coef_der1_n2n(:) + f_z(:,:,:) = nan_ + do iz=izs,ize + f_z(:, :, iz) = coef_der(1)*f(:, :, iz-2) & + + coef_der(2)*f(:, :, iz-1) & + + coef_der(3)*f(:, :, iz+1) & + + coef_der(4)*f(:, :, iz+2) + end do + + +end subroutine gradz_n2n_fd4 + + + + +subroutine gradz_v2n_fd4(f,f_z_v2n) + + use prec_const + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n + integer :: iy, iz + real(dp), DIMENSION(1:4) :: coef_der + coef_der(:) = deltazi*coef_der1_stag(:) + + f_z_v2n(:,:,:) = nan_ + do iy=iys,iye + do iz=izs,ize + + f_z_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + + coef_der(2)*f(iy-1, :, iz) & + + coef_der(3)*f(iy-1, :, iz +1) & + + coef_der(4)*f(iy-1, :, iz+2) )& + + + coef_int(2)*( coef_der(1)*f(iy, :, iz-1) & + + coef_der(2)*f(iy, :, iz) & + + coef_der(3)*f(iy, :, iz+1) & + + coef_der(4)*f(iy, :, iz+2) )& + + + coef_int(3)*( coef_der(1)*f(iy+1, :, iz-1) & + + coef_der(2)*f(iy+1, :, iz) & + + coef_der(3)*f(iy+1, :, iz+1) & + + coef_der(4)*f(iy+1, :, iz+2) )& + + + coef_int(4)*( coef_der(1)*f(iy+2, :, iz-1) & + + coef_der(2)*f(iy+2, :, iz) & + + coef_der(3)*f(iy+2, :, iz+1) & + + coef_der(4)*f(iy+2, :, iz+2) ) + + end do + end do + + + +end subroutine gradz_v2n_fd4 + +subroutine grady_v2n_fd4(f,f_y_v2n) + + use prec_const + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n + integer :: iy, iz + real(dp), DIMENSION(1:4) :: coef_der + + coef_der(:) = deltayi*coef_der1_stag(:) + + f_y_v2n(:,:,:) = nan_ + + do iy=iys,iye + do iz=izs,ize + + f_y_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + + coef_der(2)*f(iy, :, iz-1) & + + coef_der(3)*f(iy+1, :, iz-1) & + + coef_der(4)*f(iy+2, :, iz-1) )& + + + coef_int(2)*( coef_der(1)*f(iy-1, :, iz) & + + coef_der(2)*f(iy, :, iz) & + + coef_der(3)*f(iy+1, :, iz) & + + coef_der(4)*f(iy+2, :, iz) )& + + + coef_int(3)*( coef_der(1)*f(iy-1, :, iz+1) & + + coef_der(2)*f(iy, :, iz+1) & + + coef_der(3)*f(iy+1, :, iz+1) & + + coef_der(4)*f(iy+2, :, iz+1) )& + + + coef_int(4)*( coef_der(1)*f(iy-1, :, iz+2) & + + coef_der(2)*f(iy, :, iz+2) & + + coef_der(3)*f(iy+1, :, iz+2) & + + coef_der(4)*f(iy+2, :, iz+2) ) + + end do + end do + + +end subroutine grady_v2n_fd4 + +subroutine interp_v2n_fd4 ( f, f_int_v2n ) + + use prec_const + implicit none + real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f + real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n + integer :: iy, iz + + f_int_v2n(:,:,:) = nan_ + + do iy=iys,iye + do iz=izs,ize + + f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1) & + + coef_int(2)*f(iy-1, : , iz) & + + coef_int(3)*f(iy-1, :, iz +1) & + + coef_int(4)*f(iy-1, :, iz+2) )& + + + coef_int(2)*( coef_int(1)*f(iy, :, iz-1) & + + coef_int(2)*f(iy, :, iz) & + + coef_int(3)*f(iy, :, iz+1) & + + coef_int(4)*f(iy, :, iz+2) )& + + + coef_int(3)*( coef_int(1)*f(iy+1, :, iz-1) & + + coef_int(2)*f(iy+1, :, iz) & + + coef_int(3)*f(iy+1, :, iz+1) & + + coef_int(4)*f(iy+1, :, iz+2) )& + + + coef_int(4)*( coef_int(1)*f(iy+2, :, iz-1) & + + coef_int(2)*f(iy+2, :, iz) & + + coef_int(3)*f(iy+2, :, iz+1) & + + coef_int(4)*f(iy+2, :, iz+2) ) + + end do + end do + + +end subroutine interp_v2n_fd4 + +subroutine gradx_n2n_fd4(f,f_x) + use prec_const + + IMPLICIT none + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_x + integer :: ix + real(dp), DIMENSION(1:4) :: coef_der + coef_der(:)= deltaxi*coef_der1_n2n(:) + + f_x (:,:,: ) = nan_ + + do ix=ixs,ixe + f_x(:, ix, :) = coef_der(1)*f(:, ix-2, :) & + + coef_der(2)*f(:, ix-1, :) & + + coef_der(3)*f(:, ix+1, :) & + + coef_der(4)*f(:, ix+2, :) + + end do + + +end subroutine gradx_n2n_fd4 + +subroutine gradpar_v2n_fd4 ( f, f_grad) + use prec_const + + IMPLICIT none + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg) :: f_z, f_y, f_n, f_x_n !f_x_v + + integer :: ix, iy + + f_grad(:,:,:) = nan_ + + call gradz_v2n_fd4(f, f_z) + call grady_v2n_fd4(f, f_y) + call interp_v2n_fd4(f, f_n ) + call gradx_n2n_fd4(f_n, f_x_n) + + do ix = ixs,ixe + do iy = iys, iye + f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) & + + gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize) + end do + end do + +end subroutine gradpar_v2n_fd4 + + +end module gradients diff --git a/exercises/gradients/prec_const_mod.F90 b/exercises/gradients/prec_const_mod.F90 new file mode 100644 index 0000000..4cdf7f0 --- /dev/null +++ b/exercises/gradients/prec_const_mod.F90 @@ -0,0 +1,50 @@ +MODULE prec_const + use, intrinsic :: iso_fortran_env, only: REAL32, REAL64, & + stdin=>input_unit, & + stdout=>output_unit, & + stderr=>error_unit + ! + ! Precision for real and complex + ! + INTEGER, PARAMETER :: sp = REAL32 !Single precision, should not be used + INTEGER, PARAMETER :: dp = REAL64 !real(dp), enforced through the code + + INTEGER, private :: dp_r, dp_p !Range and Aprecision of doubles + INTEGER, private :: sp_r, sp_p !Range and precision of singles + + + INTEGER, private :: MPI_SP !Single precision for MPI + INTEGER, private :: MPI_DP !Double precision in MPI + + REAL(dp),public :: nan_ + ! + ! Some useful constants + ! + REAL(dp), PARAMETER :: PI=3.141592653589793238462643383279502884197_dp + REAL(dp), PARAMETER :: PIO2=1.57079632679489661923132169163975144209858_dp + REAL(dp), PARAMETER :: TWOPI=6.283185307179586476925286766559005768394_dp + REAL(dp), PARAMETER :: SQRT2=1.41421356237309504880168872420969807856967_dp + ! + CONTAINS + SUBROUTINE INIT_PREC_CONST + IMPLICIT NONE + integer :: ierr,me + + REAL(sp) :: a = 1_sp + REAL(dp) :: b = 1_dp +! logical :: commute = .true. + + !Get range and precision of ISO FORTRAN sizes + sp_r = range(a) + sp_p = precision(a) + + dp_r = range(b) + dp_p = precision(b) + + + + END SUBROUTINE INIT_PREC_CONST + + + +END MODULE prec_const diff --git a/exercises/gradients/space_grid_mod.F90 b/exercises/gradients/space_grid_mod.F90 new file mode 100644 index 0000000..28e84c5 --- /dev/null +++ b/exercises/gradients/space_grid_mod.F90 @@ -0,0 +1,118 @@ +MODULE space_grid + USE prec_const + IMPLICIT NONE + PRIVATE + ! + ! Grid module for spatial discretization + ! + ! GRID Namelist + ! + INTEGER, PUBLIC, PROTECTED :: nx=1 ! Number of total internal grip points in x + INTEGER, PUBLIC, PROTECTED :: ny=1 ! Number of total internal grip points in y + INTEGER, PUBLIC, PROTECTED :: nz=1 ! Number of total internal grip points in z + REAL(dp), PUBLIC, PROTECTED :: xmin=0._dp ! x coordinate for left boundary + REAL(dp), PUBLIC, PROTECTED :: xmax=10._dp ! x coordinate for right boundary + REAL(dp), PUBLIC, PROTECTED :: ymin=0._dp ! y coordinate for left boundary + REAL(dp), PUBLIC, PROTECTED :: ymax=10._dp ! y coordinate for right boundary + REAL(dp), PUBLIC, PROTECTED :: zmin=0._dp ! z coordinate for left boundary + REAL(dp), PUBLIC, PROTECTED :: zmax=1._dp ! z coordinate for right boundary + REAL(dp), PUBLIC, PROTECTED :: x_periodic_end = -10000_dp !Field line poloidal periodicity + REAL(dp), PUBLIC, PROTECTED :: Lx = 10._dp + ! + ! Logical switch to apply boundary conditions at the corners + ! with nl_corners_top -> .true. the corner bc's are evaluated + ! as vertical/poloidal bc's. + ! + LOGICAL, PUBLIC, PROTECTED :: nl_corners_top = .FALSE. + ! + ! + ! + ! Number of points in each grid + ! + INTEGER, PUBLIC, PROTECTED :: nxp, nyp, nzp + INTEGER, PUBLIC :: izsp, izep, nzpp + ! + ! Indices to start and end of local and global grids + ! s -> start, e-> end (local) + ! l -> left , r-> right (global) + ! + INTEGER, PUBLIC :: ixs, ixe, ixl, ixr + INTEGER, PUBLIC :: iys, iye, iyl, iyr, iyr_ngrid + INTEGER, PUBLIC :: izs, ize, izl, izr + ! + ! Number of ghost cells + ! + INTEGER, PUBLIC, PROTECTED :: nzgc=1 + INTEGER, PUBLIC, PROTECTED :: nygc=1 + INTEGER, PUBLIC, PROTECTED :: nxgc=1 + ! + ! Start and end ghost cells in local and global grids + ! + INTEGER, PUBLIC :: ixsg, ixeg, ixlg, ixrg + INTEGER, PUBLIC :: iysg, iyeg, iylg, iyrg + INTEGER, PUBLIC :: izsg, izeg, izlg, izrg + + !Array indices in x such that each allgatherv results in the full radial length + INTEGER, PUBLIC, PROTECTED :: ix1, ix2 + INTEGER, PUBLIC, PROTECTED :: ixl1,ixr2 + INTEGER, PUBLIC :: nx2d, nx1d + + !Array indices in x like ix1 ix2, but including all ghost cells + INTEGER, PUBLIC, PROTECTED :: ix1g, ix2g + + !Array indices in y such that each allgatherv results in the full radial length + INTEGER, PUBLIC, PROTECTED :: iy1, iy2 + INTEGER, PUBLIC, PROTECTED :: iyl1,iyr2 + INTEGER, PUBLIC :: ny2d, ny1d + + !Array indices in y like iy1 iy2, but including all ghost cells + INTEGER, PUBLIC, PROTECTED :: iy1g, iy2g + + !Field line poloidal periodicity + INTEGER, PUBLIC, PROTECTED :: ix_min !min index in the open (limited) field line region for each processor + + !Step size in each direction and inverses for finite differences + !Radial direction + REAL(dp), PUBLIC, PROTECTED :: deltax + REAL(dp), PUBLIC :: deltaxi + REAL(dp), PUBLIC, PROTECTED :: deltaxih + REAL(dp), PUBLIC, PROTECTED :: deltaxisq + + !Poloidal/vertical direction + REAL(dp), PUBLIC, PROTECTED :: deltay + REAL(dp), PUBLIC :: deltayi + REAL(dp), PUBLIC, PROTECTED :: deltayih + REAL(dp), PUBLIC, PROTECTED :: deltayisq + real(dp), PUBLIC, PROTECTED :: shiftgr_v = 0.5_dp + + !Toroidal direction + REAL(dp), PUBLIC, PROTECTED :: deltaz + real(dp), PUBLIC :: deltazi + real(dp), PUBLIC, PROTECTED :: deltazisq + + !Parallel direction (should be about the same as deltaz) + real(dp), PUBLIC, PROTECTED :: deltaai + real(dp), PUBLIC, PROTECTED :: deltaaih + real(dp), PUBLIC, PROTECTED :: deltaaisq + + !x,y,z grids + REAL(dp), DIMENSION(:), ALLOCATABLE, PUBLIC, PROTECTED :: xarray, yarray, zarray + + !Poloidal angle theta, its sine and cosine... warning, sin and cosine are defined with theta=0 at high-field-side + real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: cos_tht + real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: sin_tht + + !Variables related to field-aligned parallel gradient operator + !Should generalize to have iyshift=1 + INTEGER, PUBLIC, PROTECTED :: iyshift + INTEGER, PUBLIC, PROTECTED :: iyshift_d + INTEGER, PUBLIC, PROTECTED :: iyshift_u + INTEGER, PUBLIC, PROTECTED :: ny_top,ny_bot !Upwinding in n2n parallel gradients + INTEGER, PUBLIC, PROTECTED :: ny_topu,ny_botu !Upwinding in n2v parallel gradient with arbitrary iyshift + INTEGER, PUBLIC, PROTECTED :: ny_topd,ny_botd !Upwinding in v2n parallel gradient with arbitrary iyshift + INTEGER, PUBLIC, PROTECTED :: ny_zg + INTEGER, PUBLIC :: kl,ku,neq + + + +END MODULE space_grid diff --git a/exercises/gradients/test_gradients.F90 b/exercises/gradients/test_gradients.F90 new file mode 100644 index 0000000..7d86cdf --- /dev/null +++ b/exercises/gradients/test_gradients.F90 @@ -0,0 +1,108 @@ +subroutine initial_setup + use space_grid + use cuda_interface + use gradients + use iso_c_binding + implicit none + integer :: i,j,k + izsg=-1 + ixsg=-1 + iysg=-1 + izeg=3 + ixeg=1202 + iyeg=2002 + izs=1 + ixs=1 + iys=1 + ize=1 + ixe=1200 + iye=2000 + + deltazi=0.03 + deltayi=0.003 + deltayi=0.005 + call gbs_allocate_cuda(strmf_cpu,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(strmf_gpu,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(indata,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(gradpar_y_n,iysg,iyeg,ixsg,ixeg) + call gbs_allocate_cuda(gradpar_x_n,iysg,iyeg,ixsg,ixeg) + call gbs_allocate_cuda(gradpar_y_n_gpu,iysg,iyeg,ixsg,ixeg) + call gbs_allocate_cuda(gradpar_x_n_gpu,iysg,iyeg,ixsg,ixeg) + + call gbs_allocate_cuda(f_z,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_y,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_x,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_n,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_x_n,iysg,iyeg,ixsg,ixeg,izsg,izeg) + +end subroutine initial_setup + +subroutine fill_arrays + use space_grid + use cuda_interface + use gradients + use iso_c_binding + implicit none + integer :: i,j,k + call initial_setup + + call intialize_coefficients + do k=izsg,izeg + do i=ixsg,ixeg + do j=iysg,iyeg + indata(j, i, k) = dble(i*123.0+j*5.1+k*45.12) + end do + end do + end do + nx_cuda = ixeg-ixsg+1 + ny_cuda = iyeg-iysg+1 + nz_cuda = izeg-izsg+1 + do i=ixsg,ixeg + do j=iysg,iyeg + gradpar_x_n(j, i) = dble(i*123.0+j*5.1) + gradpar_y_n(j, i) = dble(i*12.0+j*15.1) + end do + end do + gradpar_x_n_gpu(:,:) = gradpar_x_n(:,:) + gradpar_y_n_gpu(:,:) = gradpar_y_n(:,:) + + gradpar_z = 0.5 + +end subroutine fill_arrays + +program main + use space_grid + use cuda_interface + use gradients + use iso_c_binding + implicit none + + call initial_setup + + call intialize_coefficients + + call fill_arrays + + call gradz_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call gradz_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'gradz_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call gradz_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call gradz_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'gradz_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call grady_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call grady_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'grady_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call interp_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call interp_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'interp_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call gradx_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call gradx_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'gradx_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + + CALL gradpar_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + CALL gradpar_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:), gradpar_z) + write(*,*) 'gradpar_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + + +end program main +