diff --git a/exercises/gradients/Makefile b/exercises/gradients/Makefile new file mode 100644 index 0000000..fa6f63c --- /dev/null +++ b/exercises/gradients/Makefile @@ -0,0 +1,28 @@ +F90=gfortran +NVCC=nvcc +LDFLAGS=-lc -lstdc++ -lcuda -lcudart -lcudadevrt +F90FLAGS= -ffree-line-length-0 -O3 -g + + +all: test_gradients.x + +test_gradients.x: cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o gradients_mod.o test_gradients.o + $(F90) $(LDFLAGS) cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o \ + gradients_mod.o test_gradients.o -o $@ +test_gradients.o: test_gradients.F90 + $(F90) $(F90FLAGS) -c test_gradients.F90 -o $@ +cuda_interface_mod.o: cuda_interface_mod.F90 + $(F90) $(F90FLAGS) -c cuda_interface_mod.F90 -o $@ +space_grid_mod.o: space_grid_mod.F90 + $(F90) $(F90FLAGS) -c space_grid_mod.F90 -o $@ +prec_const_mod.o: prec_const_mod.F90 + $(F90) $(F90FLAGS) -c prec_const_mod.F90 -o $@ +gradients_mod.o: gradients_mod.F90 + $(F90) $(F90FLAGS) -c gradients_mod.F90 -o $@ +grad_test.o: grad_test.cu + $(NVCC) $(NVCCFLAGS) -c grad_test.cu -o $@ +cudalloc.o: cudalloc.cu + $(NVCC) $(NVCCFLAGS) -c cudalloc.cu -o $@ + +clean: + rm *.x *.o diff --git a/exercises/gradients/cuda_interface_mod.F90 b/exercises/gradients/cuda_interface_mod.F90 new file mode 100644 index 0000000..f0decef --- /dev/null +++ b/exercises/gradients/cuda_interface_mod.F90 @@ -0,0 +1,128 @@ + +MODULE cuda_interface +use prec_const +INTERFACE + FUNCTION allocate_cuda_memory(n) BIND(C, NAME='allocate_cuda_memory') + use iso_c_binding + TYPE(C_PTR) :: allocate_cuda_memory + integer :: n + END FUNCTION allocate_cuda_memory + + +END INTERFACE + INTERFACE gbs_allocate_cuda + MODULE PROCEDURE gbs_allocate_cuda_dp3, gbs_allocate_cuda_dp2, gbs_allocate_cuda_dp1 + END INTERFACE gbs_allocate_cuda + real(dp), pointer, save :: strmf_cpu(:,:,:),strmf_gpu(:,:,:), indata(:,:,:) + real(dp), pointer, save :: strmf1_cpu(:,:,:), strmf1_gpu(:,:,:) + real(dp), pointer, save :: grady_f(:,:,:), grady_diff(:,:,:), diff(:,:,:), f_x(:,:,:) + real(dp), pointer, save :: f_z(:,:,:), f_y(:,:,:), f_n(:,:,:), f_x_n(:,:,:) + real(dp), pointer, save :: f_v(:,:,:), f_x_v(:,:,:), f_xx_n(:,:,:),f_xx_v(:,:,:) + real(dp), pointer, save :: f_zz(:,:,:), f_yy(:,:,:), f_xx(:,:,:) + real(dp), pointer, save :: f_zy(:,:,:), f_yz(:,:,:), f_zx(:,:,:) + real(dp), pointer, save :: f_xz(:,:,:), f_xy(:,:,:), f_yx(:,:,:) + + real(dp), pointer, save :: gradpar_x_n_gpu(:,:), gradpar_x_v_gpu(:,:), gradpar_y_n_gpu(:,:), gradpar_y_v_gpu(:,:) + real(dp), pointer, save :: grad2par_yy_n_gpu(:,:), grad2par_xx_n_gpu(:,:), grad2par_x_n_gpu(:,:), grad2par_y_n_gpu(:,:) + real(dp), pointer, save :: grad2par_zy_n_gpu(:,:), grad2par_zx_n_gpu(:,:), grad2par_xy_n_gpu(:,:) + real(dp), pointer, save :: grad2par_yy_v_gpu(:,:), grad2par_xx_v_gpu(:,:), grad2par_x_v_gpu(:,:), grad2par_y_v_gpu(:,:) + real(dp), pointer, save :: grad2par_zy_v_gpu(:,:), grad2par_zx_v_gpu(:,:), grad2par_xy_v_gpu(:,:) + real(dp), pointer, save :: cur_x_n_gpu(:), cur_x_v_gpu(:), ccur_xx_n_gpu(:), ccur_xx_v_gpu(:) + real(dp), pointer, save :: cur_y_n_gpu(:,:), cur_y_v_gpu(:,:), ccur_yy_n_gpu(:,:), ccur_xy_n_gpu(:,:) + real(dp), pointer, save :: ccur_yy_v_gpu(:,:), ccur_xy_v_gpu(:,:) + real(dp), pointer, dimension(:,:,:), save :: A_x_n_gpu, A_x_v_gpu, A_y_v_gpu, B_x_v_gpu, B_y_v_gpu + real(dp), pointer, dimension(:,:,:), save :: B_x_n_gpu, B_y_n_gpu, A_y_n_gpu + real(dp), pointer, dimension(:,:), save :: gradcur_xx_v_gpu, gradcur_yy_v_gpu, gradcur_xy_v_gpu, gradcur_yx_v_gpu + real(dp), pointer, dimension(:,:), save :: gradcur_zx_v_gpu, gradcur_zy_v_gpu, gradcur_x_v_gpu, gradcur_y_v_gpu + real(dp), pointer, dimension(:,:), save :: curgrad_xx_n_gpu, curgrad_yy_n_gpu, curgrad_xy_n_gpu, curgrad_yx_n_gpu + real(dp), pointer, dimension(:,:), save :: curgrad_zx_n_gpu, curgrad_zy_n_gpu, curgrad_x_n_gpu, curgrad_y_n_gpu + + !real(dp), pointer, save :: gradpar_x_n_gpu(:,:,:), gradpar_x_v_gpu(:,:,:), gradpar_y_n_gpu(:,:,:), gradpar_y_v_gpu(:,:,:) + integer :: nx_cuda, ny_cuda, nz_cuda +contains + + SUBROUTINE gbs_allocate_cuda_dp3(a,is1,ie1,is2,ie2,is3,ie3) + use iso_c_binding + real(dp), DIMENSION(:,:,:), pointer, INTENT(INOUT) :: a + INTEGER, INTENT(IN) :: is1,ie1,is2,ie2,is3,ie3 + integer ndata + ndata=(ie3-is3+1)*(ie2-is2+1)*(ie1-is1+1) + call c_f_pointer(allocate_cuda_memory(ndata), a, \ + [(ie1-is1+1),(ie2-is2+1),(ie3-is3+1)]) + a(is1:,is2:,is3:) => a + a=0.0_dp + END SUBROUTINE gbs_allocate_cuda_dp3 + + SUBROUTINE gbs_allocate_cuda_dp2(a,is1,ie1,is2,ie2) + use iso_c_binding + real(dp), DIMENSION(:,:), pointer, INTENT(INOUT) :: a + INTEGER, INTENT(IN) :: is1,ie1,is2,ie2 + integer ndata + ndata=(ie2-is2+1)*(ie1-is1+1) + call c_f_pointer(allocate_cuda_memory(ndata), a, \ + [(ie1-is1+1),(ie2-is2+1)]) + a(is1:,is2:) => a + a=0.0_dp + END SUBROUTINE gbs_allocate_cuda_dp2 + + SUBROUTINE gbs_allocate_cuda_dp1(a,is1,ie1) + use iso_c_binding + real(dp), DIMENSION(:), pointer, INTENT(INOUT) :: a + INTEGER, INTENT(IN) :: is1,ie1 + integer ndata + ndata=(ie1-is1+1) + call c_f_pointer(allocate_cuda_memory(ndata), a, \ + [(ie1-is1+1)]) + a(is1:) => a + a=0.0_dp + END SUBROUTINE gbs_allocate_cuda_dp1 + + + subroutine gradz_n2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call gradz_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi) + end subroutine gradz_n2n_fd4cuda + + subroutine gradz_v2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call gradz_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi) + end subroutine gradz_v2n_fd4cuda + + subroutine grady_v2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call grady_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltayi) + end subroutine grady_v2n_fd4cuda + + subroutine interp_v2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call interp_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda) + end subroutine interp_v2n_fd4cuda + + subroutine gradx_n2n_fd4cuda(f_in, f_out) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + call gradx_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltaxi) + end subroutine gradx_n2n_fd4cuda + + + subroutine gradpar_v2n_fd4cuda(f_in, f_out, gradpar_z) + use space_grid + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in + real(dp), INTENT(in) :: gradpar_z + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out + + CALL gradpar_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, f_z, f_y, f_n, \ + f_x_n, deltazi, deltayi, deltaxi, gradpar_z, gradpar_x_n_gpu, gradpar_y_n_gpu) + end subroutine gradpar_v2n_fd4cuda + + +END MODULE cuda_interface diff --git a/exercises/gradients/cudalloc.cu b/exercises/gradients/cudalloc.cu new file mode 100644 index 0000000..1186170 --- /dev/null +++ b/exercises/gradients/cudalloc.cu @@ -0,0 +1,76 @@ +/*#include +//#include +#include +#include +struct cuda_struct{ + double *coef_der_cuda; +}; + +*/ +#include "cudalloc.h" +/*__global__ void set_coefficients(cuda_struct *structure){ + structure->coef_der_cuda[0] = 1.0/12.0; + structure->coef_der_cuda[1] = -2.0/3.0; + structure->coef_der_cuda[2] = 2.0/3.0; + structure->coef_der_cuda[3] = -1.0/12.0; +*/ +__global__ void set_coefficients(double *coef, double *coefstag, double *coef_int){ +coef[0] = 1.0/12.0; +coef[1] = -2.0/3.0; +coef[2] = 2.0/3.0; +coef[3] = -1.0/12.0; +coef_int[0] = -1.0/16.0; +coef_int[1] = 9.0/16.0; +coef_int[2] = 9.0/16.0; +coef_int[3] = -1.0/16.0; +coefstag[0] = 1.0/24.0; +coefstag[1] = -9.0/8.0; +coefstag[2] = 9.0/8.0; +coefstag[3] = -1.0/24.0; +} +void set_extent(int *out, int in){ +*out = in; +} +extern "C" void allocate_cuda_struct(cuda_struct *& structure){ + structure = new cuda_struct(); + return; +} + +//__global__ void print_coefficients(cuda_struct *structure){ +__global__ void print_coefficients(double *coef){ +} +extern "C" void fill_cuda_struct(cuda_struct *structure, int *izeg, int *izsg, int *iyeg, int *iysg, int *ixeg, int *ixsg){ +//extern "C" void fill_cuda_struct(cuda_struct *structure){ + + cudaMallocManaged( (void **)&(structure->coef_der_cuda), sizeof(double) * 4 ); + cudaMallocManaged( (void **)&(structure->coef_der1_stag), sizeof(double) * 4 ); + cudaMallocManaged( (void **)&(structure->coef_int_cuda), sizeof(double) * 4 ); + set_coefficients<<<1, 1>>>(structure->coef_der_cuda, structure->coef_der1_stag, structure->coef_int_cuda); + //set_coefficients<<<1, 1>>>(structure->coef_der_cuda); + //set_extent((structure->nz), (*izeg-*izsg+1)); + //set_extent(&(structure->ny), (*izeg-*izsg+1)); + //set_extent(&(structure->nx), (*izeg-*izsg+1)); + //set_extent(&(structure->nx), (*iyeg-*iysg+1)); + //&(structure->nz) = &k;//(*izeg-*izsg+1); + //&(structure->ny) = &k;//(*ixeg-*ixsg+1); + //&(structure->nx) = &k;//(*iyeg-*iysg+1); + //printf("AAAAAAA %d %d %d\n", structure->nz, structure->ny, structure->nx); + //set_extent<<<1, 1>>>(structure->nz, (*izeg-*izsg+1)); + //set_extent<<<1, 1>>>(structure->ny, (*ixeg-*ixsg+1)); + //set_extent<<<1, 1>>>(structure->nx, (*iyeg-*iysg+1)); + cudaDeviceSynchronize(); + + printf("cudalloc coef_int %f %f %f %f \n", structure->coef_int_cuda[0], structure->coef_int_cuda[1], structure->coef_int_cuda[2], structure->coef_int_cuda[3]); + return; +} + +extern "C" void *allocate_cuda_memory(int *n){ + double *a; + int N=*n; + cudaMallocManaged( (void **)&a, sizeof(double) * N ); + cudaDeviceSynchronize(); + return (void *) a; +} + + + diff --git a/exercises/gradients/cudalloc.h b/exercises/gradients/cudalloc.h new file mode 100644 index 0000000..0d950a9 --- /dev/null +++ b/exercises/gradients/cudalloc.h @@ -0,0 +1,11 @@ +#include +//#include +#include +#include + +struct cuda_struct{ + double *coef_der_cuda; + double *coef_int_cuda; + double *coef_der1_stag; + int *nz, *ny, *nx; +}; diff --git a/exercises/gradients/grad_test.cu b/exercises/gradients/grad_test.cu new file mode 100644 index 0000000..948e342 --- /dev/null +++ b/exercises/gradients/grad_test.cu @@ -0,0 +1,135 @@ +#include +//#include +//#include "unistd.h" + + +__constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 }; +__constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 }; +__constant__ __device__ double coef_der2[5] = {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0}; +__constant__ __device__ double coef_der1_n2n[4] = { 1.0/12.0, -2.0/3.0, 2.0/3.0, -1.0/12.0}; +__constant__ __device__ double coef_der2_stag[4] = { 1.0/2., -1.0/2.0, -1.0/2., 1.0/2.0}; + +__device__ int xyindex(int i, int nz ) +{ + return i%nz; +} + + +__device__ int disp(int i, int j, int k, int nx, int ny) +{ + return i + j*nx + k*nx*ny; +} + +__device__ void gradz_n2n_fd4Device(double *f, double *fz, int nz, int ny, int nx, double fact) +{ + int index = blockDim.x*blockIdx.x+threadIdx.x; + int d1=index+disp(0,0,-2,nx,ny); + int d2=index+disp(0,0,-1,nx,ny); + int d3=index+disp(0,0,1,nx,ny); + int d4=index+disp(0,0,2,nx,ny); + if (d1>=0 && d4<(nx*ny*nz)){ + fz[index] = coef_der1_n2n[0]*f[d1] + coef_der1_n2n[1]*f[d2] + + coef_der1_n2n[2]*f[d3] + coef_der1_n2n[3]*f[d4]; + fz[index] *= fact; + } + +} + +__global__ void gradz_n2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){ + gradz_n2n_fd4Device(f, fz, nz, ny, nx, fact); +} + +extern "C" void gradz_n2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + gradz_n2n_fd4<<>>(f, f_z, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + + +__device__ void gradz_v2n_fd4Device(double *f, double *f_z_v2n, int nz, int ny, int nx, double fact) +{ + +} + +__global__ void gradz_v2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){ + gradz_v2n_fd4Device(f, fz, nz, ny, nx, fact); +} + +extern "C" void gradz_v2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + gradz_v2n_fd4<<>>(f, f_z, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + +__device__ void grady_v2n_fd4Device(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact) +{ + + +} + +__global__ void grady_v2n_fd4(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact){ + grady_v2n_fd4Device(f, f_y_v2n, nz, ny, nx, fact); +} + +extern "C" void grady_v2n_fd4_cuda_( double *f, double *f_y_v2n, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + grady_v2n_fd4<<>>(f, f_y_v2n, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + +__device__ void interp_v2n_fd4Device(double *f, double *f_int_v2n, int nz, int ny, int nx) +{ + + +} + +__global__ void interp_v2n_fd4(double *f, double *f_int_v2n, int nz, int ny, int nx){ + interp_v2n_fd4Device(f, f_int_v2n, nz, ny, nx); +} + +extern "C" void interp_v2n_fd4_cuda_( double *f, double *f_int_v2n, int *nz, int *ny, int *nx){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + interp_v2n_fd4<<>>(f, f_int_v2n, *nz, *ny, *nx); + cudaDeviceSynchronize(); +} + +__device__ void gradx_n2n_fd4Device(double *f, double *fx, int nz, int ny, int nx, double fact) +{ + +} + +__global__ void gradx_n2n_fd4(double *f, double *f_x_n2n, int nz, int ny, int nx, double fact){ + gradx_n2n_fd4Device(f, f_x_n2n, nz, ny, nx, fact); +} + +extern "C" void gradx_n2n_fd4_cuda_( double *f, double *f_x_n2n, int *nz, int *ny, int *nx, double *fact){ + int n=(*nx)*(*ny)*(*nz); + int threadsPerBlock = 256; + int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock); + gradx_n2n_fd4<<>>(f, f_x_n2n, *nz, *ny, *nx, *fact); + cudaDeviceSynchronize(); +} + +__global__ void gradpar_v2n_fd4(double *f, double *f_grad, int nz, int ny, int nx, double *f_z, double *f_y, double *f_n, double *f_x_n, double deltazi, double deltayi, double deltaxi, double gradpar_z, double *gradpar_x_n, double *gradpar_y_n) +{ + +} + + + + +extern "C" void gradpar_v2n_fd4_cuda_(double *f, double *f_grad, int *nz, int *ny, int *nx, double *f_z, double *f_y, double *f_n, double *f_x_n, double *deltazi, double *deltayi, double *deltaxi, double *gradpar_z, double *gradpar_x_n, double *gradpar_y_n){ + int n = (*nx)*(*ny)*(*nz); + dim3 threadsPerBlock = 1024; + dim3 numBlocks = ((n + threadsPerBlock.x -1) / threadsPerBlock.x); + gradpar_v2n_fd4<<>>(f, f_grad, *nz, *ny, *nx, f_z, f_y, f_n, f_x_n, *deltazi, *deltayi, *deltaxi, *gradpar_z, gradpar_x_n, gradpar_y_n); + cudaDeviceSynchronize(); +} diff --git a/exercises/gradients/gradients_mod.F90 b/exercises/gradients/gradients_mod.F90 new file mode 100644 index 0000000..3f34974 --- /dev/null +++ b/exercises/gradients/gradients_mod.F90 @@ -0,0 +1,219 @@ +module gradients +use space_grid +use prec_const +real(dp), DIMENSION(1:4), public, PROTECTED :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag +real(dp), DIMENSION(1:5), public, PROTECTED :: coef_der2 +real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n +real(dp), PUBLIC :: gradpar_z +contains + +subroutine intialize_coefficients + coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /) + coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /) + coef_der2 = (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /) + coef_der1_n2n = (/ 1.0_dp/12.0_dp, -2.0_dp/3.0_dp, 2.0_dp/3.0_dp, -1.0_dp/12.0_dp /) + coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /) + +end subroutine intialize_coefficients + + +subroutine gradz_n2n_fd4(f , f_z) + + use prec_const + + IMPLICIT none + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z + integer :: iz + real(dp), DIMENSION(1:4) :: coef_der + + coef_der(:) = deltazi*coef_der1_n2n(:) + f_z(:,:,:) = nan_ + do iz=izs,ize + f_z(:, :, iz) = coef_der(1)*f(:, :, iz-2) & + + coef_der(2)*f(:, :, iz-1) & + + coef_der(3)*f(:, :, iz+1) & + + coef_der(4)*f(:, :, iz+2) + end do + + +end subroutine gradz_n2n_fd4 + + + + +subroutine gradz_v2n_fd4(f,f_z_v2n) + + use prec_const + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n + integer :: iy, iz + real(dp), DIMENSION(1:4) :: coef_der + coef_der(:) = deltazi*coef_der1_stag(:) + + f_z_v2n(:,:,:) = nan_ + do iy=iys,iye + do iz=izs,ize + + f_z_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + + coef_der(2)*f(iy-1, :, iz) & + + coef_der(3)*f(iy-1, :, iz +1) & + + coef_der(4)*f(iy-1, :, iz+2) )& + + + coef_int(2)*( coef_der(1)*f(iy, :, iz-1) & + + coef_der(2)*f(iy, :, iz) & + + coef_der(3)*f(iy, :, iz+1) & + + coef_der(4)*f(iy, :, iz+2) )& + + + coef_int(3)*( coef_der(1)*f(iy+1, :, iz-1) & + + coef_der(2)*f(iy+1, :, iz) & + + coef_der(3)*f(iy+1, :, iz+1) & + + coef_der(4)*f(iy+1, :, iz+2) )& + + + coef_int(4)*( coef_der(1)*f(iy+2, :, iz-1) & + + coef_der(2)*f(iy+2, :, iz) & + + coef_der(3)*f(iy+2, :, iz+1) & + + coef_der(4)*f(iy+2, :, iz+2) ) + + end do + end do + + + +end subroutine gradz_v2n_fd4 + +subroutine grady_v2n_fd4(f,f_y_v2n) + + use prec_const + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n + integer :: iy, iz + real(dp), DIMENSION(1:4) :: coef_der + + coef_der(:) = deltayi*coef_der1_stag(:) + + f_y_v2n(:,:,:) = nan_ + + do iy=iys,iye + do iz=izs,ize + + f_y_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + + coef_der(2)*f(iy, :, iz-1) & + + coef_der(3)*f(iy+1, :, iz-1) & + + coef_der(4)*f(iy+2, :, iz-1) )& + + + coef_int(2)*( coef_der(1)*f(iy-1, :, iz) & + + coef_der(2)*f(iy, :, iz) & + + coef_der(3)*f(iy+1, :, iz) & + + coef_der(4)*f(iy+2, :, iz) )& + + + coef_int(3)*( coef_der(1)*f(iy-1, :, iz+1) & + + coef_der(2)*f(iy, :, iz+1) & + + coef_der(3)*f(iy+1, :, iz+1) & + + coef_der(4)*f(iy+2, :, iz+1) )& + + + coef_int(4)*( coef_der(1)*f(iy-1, :, iz+2) & + + coef_der(2)*f(iy, :, iz+2) & + + coef_der(3)*f(iy+1, :, iz+2) & + + coef_der(4)*f(iy+2, :, iz+2) ) + + end do + end do + + +end subroutine grady_v2n_fd4 + +subroutine interp_v2n_fd4 ( f, f_int_v2n ) + + use prec_const + implicit none + real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f + real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n + integer :: iy, iz + + f_int_v2n(:,:,:) = nan_ + + do iy=iys,iye + do iz=izs,ize + + f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1) & + + coef_int(2)*f(iy-1, : , iz) & + + coef_int(3)*f(iy-1, :, iz +1) & + + coef_int(4)*f(iy-1, :, iz+2) )& + + + coef_int(2)*( coef_int(1)*f(iy, :, iz-1) & + + coef_int(2)*f(iy, :, iz) & + + coef_int(3)*f(iy, :, iz+1) & + + coef_int(4)*f(iy, :, iz+2) )& + + + coef_int(3)*( coef_int(1)*f(iy+1, :, iz-1) & + + coef_int(2)*f(iy+1, :, iz) & + + coef_int(3)*f(iy+1, :, iz+1) & + + coef_int(4)*f(iy+1, :, iz+2) )& + + + coef_int(4)*( coef_int(1)*f(iy+2, :, iz-1) & + + coef_int(2)*f(iy+2, :, iz) & + + coef_int(3)*f(iy+2, :, iz+1) & + + coef_int(4)*f(iy+2, :, iz+2) ) + + end do + end do + + +end subroutine interp_v2n_fd4 + +subroutine gradx_n2n_fd4(f,f_x) + use prec_const + + IMPLICIT none + + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_x + integer :: ix + real(dp), DIMENSION(1:4) :: coef_der + coef_der(:)= deltaxi*coef_der1_n2n(:) + + f_x (:,:,: ) = nan_ + + do ix=ixs,ixe + f_x(:, ix, :) = coef_der(1)*f(:, ix-2, :) & + + coef_der(2)*f(:, ix-1, :) & + + coef_der(3)*f(:, ix+1, :) & + + coef_der(4)*f(:, ix+2, :) + + end do + + +end subroutine gradx_n2n_fd4 + +subroutine gradpar_v2n_fd4 ( f, f_grad) + use prec_const + + IMPLICIT none + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg) :: f_z, f_y, f_n, f_x_n !f_x_v + + integer :: ix, iy + + f_grad(:,:,:) = nan_ + + call gradz_v2n_fd4(f, f_z) + call grady_v2n_fd4(f, f_y) + call interp_v2n_fd4(f, f_n ) + call gradx_n2n_fd4(f_n, f_x_n) + + do ix = ixs,ixe + do iy = iys, iye + f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) & + + gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize) + end do + end do + +end subroutine gradpar_v2n_fd4 + + +end module gradients diff --git a/exercises/gradients/prec_const_mod.F90 b/exercises/gradients/prec_const_mod.F90 new file mode 100644 index 0000000..4cdf7f0 --- /dev/null +++ b/exercises/gradients/prec_const_mod.F90 @@ -0,0 +1,50 @@ +MODULE prec_const + use, intrinsic :: iso_fortran_env, only: REAL32, REAL64, & + stdin=>input_unit, & + stdout=>output_unit, & + stderr=>error_unit + ! + ! Precision for real and complex + ! + INTEGER, PARAMETER :: sp = REAL32 !Single precision, should not be used + INTEGER, PARAMETER :: dp = REAL64 !real(dp), enforced through the code + + INTEGER, private :: dp_r, dp_p !Range and Aprecision of doubles + INTEGER, private :: sp_r, sp_p !Range and precision of singles + + + INTEGER, private :: MPI_SP !Single precision for MPI + INTEGER, private :: MPI_DP !Double precision in MPI + + REAL(dp),public :: nan_ + ! + ! Some useful constants + ! + REAL(dp), PARAMETER :: PI=3.141592653589793238462643383279502884197_dp + REAL(dp), PARAMETER :: PIO2=1.57079632679489661923132169163975144209858_dp + REAL(dp), PARAMETER :: TWOPI=6.283185307179586476925286766559005768394_dp + REAL(dp), PARAMETER :: SQRT2=1.41421356237309504880168872420969807856967_dp + ! + CONTAINS + SUBROUTINE INIT_PREC_CONST + IMPLICIT NONE + integer :: ierr,me + + REAL(sp) :: a = 1_sp + REAL(dp) :: b = 1_dp +! logical :: commute = .true. + + !Get range and precision of ISO FORTRAN sizes + sp_r = range(a) + sp_p = precision(a) + + dp_r = range(b) + dp_p = precision(b) + + + + END SUBROUTINE INIT_PREC_CONST + + + +END MODULE prec_const diff --git a/exercises/gradients/space_grid_mod.F90 b/exercises/gradients/space_grid_mod.F90 new file mode 100644 index 0000000..28e84c5 --- /dev/null +++ b/exercises/gradients/space_grid_mod.F90 @@ -0,0 +1,118 @@ +MODULE space_grid + USE prec_const + IMPLICIT NONE + PRIVATE + ! + ! Grid module for spatial discretization + ! + ! GRID Namelist + ! + INTEGER, PUBLIC, PROTECTED :: nx=1 ! Number of total internal grip points in x + INTEGER, PUBLIC, PROTECTED :: ny=1 ! Number of total internal grip points in y + INTEGER, PUBLIC, PROTECTED :: nz=1 ! Number of total internal grip points in z + REAL(dp), PUBLIC, PROTECTED :: xmin=0._dp ! x coordinate for left boundary + REAL(dp), PUBLIC, PROTECTED :: xmax=10._dp ! x coordinate for right boundary + REAL(dp), PUBLIC, PROTECTED :: ymin=0._dp ! y coordinate for left boundary + REAL(dp), PUBLIC, PROTECTED :: ymax=10._dp ! y coordinate for right boundary + REAL(dp), PUBLIC, PROTECTED :: zmin=0._dp ! z coordinate for left boundary + REAL(dp), PUBLIC, PROTECTED :: zmax=1._dp ! z coordinate for right boundary + REAL(dp), PUBLIC, PROTECTED :: x_periodic_end = -10000_dp !Field line poloidal periodicity + REAL(dp), PUBLIC, PROTECTED :: Lx = 10._dp + ! + ! Logical switch to apply boundary conditions at the corners + ! with nl_corners_top -> .true. the corner bc's are evaluated + ! as vertical/poloidal bc's. + ! + LOGICAL, PUBLIC, PROTECTED :: nl_corners_top = .FALSE. + ! + ! + ! + ! Number of points in each grid + ! + INTEGER, PUBLIC, PROTECTED :: nxp, nyp, nzp + INTEGER, PUBLIC :: izsp, izep, nzpp + ! + ! Indices to start and end of local and global grids + ! s -> start, e-> end (local) + ! l -> left , r-> right (global) + ! + INTEGER, PUBLIC :: ixs, ixe, ixl, ixr + INTEGER, PUBLIC :: iys, iye, iyl, iyr, iyr_ngrid + INTEGER, PUBLIC :: izs, ize, izl, izr + ! + ! Number of ghost cells + ! + INTEGER, PUBLIC, PROTECTED :: nzgc=1 + INTEGER, PUBLIC, PROTECTED :: nygc=1 + INTEGER, PUBLIC, PROTECTED :: nxgc=1 + ! + ! Start and end ghost cells in local and global grids + ! + INTEGER, PUBLIC :: ixsg, ixeg, ixlg, ixrg + INTEGER, PUBLIC :: iysg, iyeg, iylg, iyrg + INTEGER, PUBLIC :: izsg, izeg, izlg, izrg + + !Array indices in x such that each allgatherv results in the full radial length + INTEGER, PUBLIC, PROTECTED :: ix1, ix2 + INTEGER, PUBLIC, PROTECTED :: ixl1,ixr2 + INTEGER, PUBLIC :: nx2d, nx1d + + !Array indices in x like ix1 ix2, but including all ghost cells + INTEGER, PUBLIC, PROTECTED :: ix1g, ix2g + + !Array indices in y such that each allgatherv results in the full radial length + INTEGER, PUBLIC, PROTECTED :: iy1, iy2 + INTEGER, PUBLIC, PROTECTED :: iyl1,iyr2 + INTEGER, PUBLIC :: ny2d, ny1d + + !Array indices in y like iy1 iy2, but including all ghost cells + INTEGER, PUBLIC, PROTECTED :: iy1g, iy2g + + !Field line poloidal periodicity + INTEGER, PUBLIC, PROTECTED :: ix_min !min index in the open (limited) field line region for each processor + + !Step size in each direction and inverses for finite differences + !Radial direction + REAL(dp), PUBLIC, PROTECTED :: deltax + REAL(dp), PUBLIC :: deltaxi + REAL(dp), PUBLIC, PROTECTED :: deltaxih + REAL(dp), PUBLIC, PROTECTED :: deltaxisq + + !Poloidal/vertical direction + REAL(dp), PUBLIC, PROTECTED :: deltay + REAL(dp), PUBLIC :: deltayi + REAL(dp), PUBLIC, PROTECTED :: deltayih + REAL(dp), PUBLIC, PROTECTED :: deltayisq + real(dp), PUBLIC, PROTECTED :: shiftgr_v = 0.5_dp + + !Toroidal direction + REAL(dp), PUBLIC, PROTECTED :: deltaz + real(dp), PUBLIC :: deltazi + real(dp), PUBLIC, PROTECTED :: deltazisq + + !Parallel direction (should be about the same as deltaz) + real(dp), PUBLIC, PROTECTED :: deltaai + real(dp), PUBLIC, PROTECTED :: deltaaih + real(dp), PUBLIC, PROTECTED :: deltaaisq + + !x,y,z grids + REAL(dp), DIMENSION(:), ALLOCATABLE, PUBLIC, PROTECTED :: xarray, yarray, zarray + + !Poloidal angle theta, its sine and cosine... warning, sin and cosine are defined with theta=0 at high-field-side + real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: cos_tht + real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: sin_tht + + !Variables related to field-aligned parallel gradient operator + !Should generalize to have iyshift=1 + INTEGER, PUBLIC, PROTECTED :: iyshift + INTEGER, PUBLIC, PROTECTED :: iyshift_d + INTEGER, PUBLIC, PROTECTED :: iyshift_u + INTEGER, PUBLIC, PROTECTED :: ny_top,ny_bot !Upwinding in n2n parallel gradients + INTEGER, PUBLIC, PROTECTED :: ny_topu,ny_botu !Upwinding in n2v parallel gradient with arbitrary iyshift + INTEGER, PUBLIC, PROTECTED :: ny_topd,ny_botd !Upwinding in v2n parallel gradient with arbitrary iyshift + INTEGER, PUBLIC, PROTECTED :: ny_zg + INTEGER, PUBLIC :: kl,ku,neq + + + +END MODULE space_grid diff --git a/exercises/gradients/test_gradients.F90 b/exercises/gradients/test_gradients.F90 new file mode 100644 index 0000000..7d86cdf --- /dev/null +++ b/exercises/gradients/test_gradients.F90 @@ -0,0 +1,108 @@ +subroutine initial_setup + use space_grid + use cuda_interface + use gradients + use iso_c_binding + implicit none + integer :: i,j,k + izsg=-1 + ixsg=-1 + iysg=-1 + izeg=3 + ixeg=1202 + iyeg=2002 + izs=1 + ixs=1 + iys=1 + ize=1 + ixe=1200 + iye=2000 + + deltazi=0.03 + deltayi=0.003 + deltayi=0.005 + call gbs_allocate_cuda(strmf_cpu,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(strmf_gpu,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(indata,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(gradpar_y_n,iysg,iyeg,ixsg,ixeg) + call gbs_allocate_cuda(gradpar_x_n,iysg,iyeg,ixsg,ixeg) + call gbs_allocate_cuda(gradpar_y_n_gpu,iysg,iyeg,ixsg,ixeg) + call gbs_allocate_cuda(gradpar_x_n_gpu,iysg,iyeg,ixsg,ixeg) + + call gbs_allocate_cuda(f_z,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_y,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_x,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_n,iysg,iyeg,ixsg,ixeg,izsg,izeg) + call gbs_allocate_cuda(f_x_n,iysg,iyeg,ixsg,ixeg,izsg,izeg) + +end subroutine initial_setup + +subroutine fill_arrays + use space_grid + use cuda_interface + use gradients + use iso_c_binding + implicit none + integer :: i,j,k + call initial_setup + + call intialize_coefficients + do k=izsg,izeg + do i=ixsg,ixeg + do j=iysg,iyeg + indata(j, i, k) = dble(i*123.0+j*5.1+k*45.12) + end do + end do + end do + nx_cuda = ixeg-ixsg+1 + ny_cuda = iyeg-iysg+1 + nz_cuda = izeg-izsg+1 + do i=ixsg,ixeg + do j=iysg,iyeg + gradpar_x_n(j, i) = dble(i*123.0+j*5.1) + gradpar_y_n(j, i) = dble(i*12.0+j*15.1) + end do + end do + gradpar_x_n_gpu(:,:) = gradpar_x_n(:,:) + gradpar_y_n_gpu(:,:) = gradpar_y_n(:,:) + + gradpar_z = 0.5 + +end subroutine fill_arrays + +program main + use space_grid + use cuda_interface + use gradients + use iso_c_binding + implicit none + + call initial_setup + + call intialize_coefficients + + call fill_arrays + + call gradz_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call gradz_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'gradz_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call gradz_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call gradz_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'gradz_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call grady_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call grady_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'grady_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call interp_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call interp_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'interp_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + call gradx_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + call gradx_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:)) + write(*,*) 'gradx_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + + CALL gradpar_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:)) + CALL gradpar_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:), gradpar_z) + write(*,*) 'gradpar_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize)) + + +end program main +