diff --git a/gradients_fd4.cu b/gradients_fd4.cu index 7853177..237a918 100644 --- a/gradients_fd4.cu +++ b/gradients_fd4.cu @@ -1,274 +1,276 @@ #include <stdio.h> //#include <stdlib.h> #include <cuda.h> #include <cuda_runtime.h> //#include "cudalloc.h" #include "unistd.h" #include "gradients_cuda.h" /* CUDA error macro */ #define CUDA_SAFE_CALL(call) do { \ cudaError_t err = call; \ if(cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ exit(EXIT_FAILURE); \ } } while (0) __constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 }; __constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 }; __constant__ __device__ double coef_der2[5] = {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0}; __constant__ __device__ double coef_der1_n2n[4] = { 1.0/12.0, -2.0/3.0, 2.0/3.0, -1.0/12.0}; __constant__ __device__ double coef_der2_stag[4] = { 1.0/2., -1.0/2.0, -1.0/2., 1.0/2.0}; __device__ int xyindex(int index, int nz ) { return index%nz; } __device__ int disp(int ix, int iy, int iz, int nx, int ny, int nz) { return iz + ix*nz + iy*nz*nx; } __device__ int dispc(int ix, int iy, int iz, int nx, int ny) { return iy + ix*ny + iz*ny*nx; } //GRAD_N2N __device__ void grad_n2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz, double fact, int *which_grad) //double *fact, int *which_grad) { int index = blockDim.x*blockIdx.x+threadIdx.x; int d1=dispc(which_grad[0],which_grad[1],which_grad[2],nx,ny); int d2=dispc(2*which_grad[0],2*which_grad[1],2*which_grad[2],nx,ny); if (index<(nx*ny*nz-d2) && index>=d2){ //f_out[index] = coef_der1_n2n[0]+f_in[index-d2] + coef_der1_n2n[1]+f_in[index-d1]+ // coef_der1_n2n[2]+f_in[index+d1] + coef_der1_n2n[3]+f_in[index+d2]; f_out[index] = fact*coef_der1_n2n[0]*f_in[index-d2] + fact*coef_der1_n2n[1]*f_in[index-d1]+ fact*coef_der1_n2n[2]*f_in[index+d1] + fact*coef_der1_n2n[3]*f_in[index+d2]; //f_out[index] = f_in[index-d2] + f_in[index-d1] + // f_in[index+d1] + f_in[index+d2]; // f_out[index] = index-d2;//f_in[index-d2]; //f_out[index] *= fact; //f_out[index] = fact[0]*f_in[index-d2] + fact[1]*f_in[index-d1] + // fact[2]*f_in[index+d1] + fact[3]*f_in[index+d2]; } } __global__ void grad_n2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz, double fact, int *which_grad) //double *fact, int *which_grad) { grad_n2n_fd4Device(f_in,f_out, nx, ny, nz, fact, which_grad); } extern "C" void grady_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){ int n=(*nx)*(*ny)*(*nz); which_grad[0] = 0; which_grad[1] = 1; which_grad[2] = 0; dim3 threadsPerBlock = 256; dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x); grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact, which_grad); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } extern "C" void gradx_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){ int n=(*nx)*(*ny)*(*nz); which_grad[0] = 1; which_grad[1] = 0; which_grad[1] = 0; which_grad[2] = 0; dim3 threadsPerBlock = 256; dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x); grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact, which_grad); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } extern "C" void gradz_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){ int n=(*nx)*(*ny)*(*nz); which_grad[0] = 0; which_grad[1] = 0; which_grad[2] = 1; dim3 threadsPerBlock = 256; dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x); grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact, which_grad); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } __global__ void gradz_v2n(double *f_in, double *f_out, int nx, int ny, int nz, double fact) { int index=blockDim.x*blockIdx.x+threadIdx.x; int d11=index+dispc(0,-1,-1,nx,ny); int d12=index+dispc(0,-1,0,nx,ny); int d13=index+dispc(0,-1, 1,nx,ny); int d14=index+dispc(0,-1, 2,nx,ny); int d21=index+dispc(0, 0,-1,nx,ny); int d22=index+dispc(0, 0,0,nx,ny); int d23=index+dispc(0, 0, 1,nx,ny); int d24=index+dispc(0, 0, 2,nx,ny); int d31=index+dispc(0, 1,-1,nx,ny); int d32=index+dispc(0, 1,0,nx,ny); int d33=index+dispc(0, 1, 1,nx,ny); int d34=index+dispc(0, 1, 2,nx,ny); int d41=index+dispc(0, 2,-1,nx,ny); int d42=index+dispc(0, 2,0,nx,ny); int d43=index+dispc(0, 2, 1,nx,ny); int d44=index+dispc(0, 2, 2,nx,ny); double coef_der[4]; for(int i=0; i<4; i++) coef_der[i]=coef_der1_stag[i]*fact; if (d11>=0 && d44<(nx*ny*nz)){ f_out[index] = coef_int[0] * (coef_der[0]*f_in[d11] + coef_der[1]*f_in[d12] + coef_der[2]*f_in[d13] + coef_der[3]*f_in[d14])+ coef_int[1] * (coef_der[0]*f_in[d21] + coef_der[1]*f_in[d22] + coef_der[2]*f_in[d23] + coef_der[3]*f_in[d24])+ coef_int[2] * (coef_der[0]*f_in[d31] + coef_der[1]*f_in[d32] + coef_der[2]*f_in[d33] + coef_der[3]*f_in[d34]) + coef_int[3] * (coef_der[0]*f_in[d41] + coef_der[1]*f_in[d42] + coef_der[2]*f_in[d43] + coef_der[3]*f_in[d44]); } } extern "C" void gradz_v2n_fd4_cuda_(double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){ int n = (*nx)*(*ny)*(*nz); dim3 threadsPerBlock = 256; dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x); gradz_v2n<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } __device__ void grady_v2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz, double fact) { int index=blockDim.x*blockIdx.x+threadIdx.x; int d11=index+dispc(0,-1,-1,nx,ny); int d12=index+dispc(0, 0,-1,nx,ny); int d13=index+dispc(0, 1,-1,nx,ny); int d14=index+dispc(0, 2,-1,nx,ny); int d21=index+dispc(0,-1, 0,nx,ny); int d22=index+dispc(0, 0, 0,nx,ny); int d23=index+dispc(0, 1, 0,nx,ny); int d24=index+dispc(0, 2, 0,nx,ny); int d31=index+dispc(0,-1, 1,nx,ny); int d32=index+dispc(0, 0, 1,nx,ny); int d33=index+dispc(0, 1, 1,nx,ny); int d34=index+dispc(0, 2, 1,nx,ny); int d41=index+dispc(0,-1, 2,nx,ny); int d42=index+dispc(0,0 , 2,nx,ny); int d43=index+dispc(0, 1, 2,nx,ny); int d44=index+dispc(0, 2, 2,nx,ny); double coef_der[4]; for(int i=0; i<4; i++) coef_der[i]=coef_der1_stag[i]*fact; if (d11>=0 && d44<(nx*ny*nz)){ f_out[index] = coef_int[0] * (coef_der[0]*f_in[d11] + coef_der[1]*f_in[d12] + coef_der[2]*f_in[d13] + coef_der[3]*f_in[d14])+ coef_int[1] * (coef_der[0]*f_in[d21] + coef_der[1]*f_in[d22] + coef_der[2]*f_in[d23] + coef_der[3]*f_in[d24])+ coef_int[2] * (coef_der[0]*f_in[d31] + coef_der[1]*f_in[d32] + coef_der[2]*f_in[d33] + coef_der[3]*f_in[d34]) + coef_int[3] * (coef_der[0]*f_in[d41] + coef_der[1]*f_in[d42] + coef_der[2]*f_in[d43] + coef_der[3]*f_in[d44]); } } __global__ void grady_v2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz, double fact){ grady_v2n_fd4Device(f_in, f_out, nx, ny, nz, fact); } extern "C" void grady_v2n_fd4_cuda_(double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){ int n = (*nx)*(*ny)*(*nz); dim3 threadsPerBlock = 256; dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x); grady_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } __device__ void interp_v2n_f4Device(double *f_in, double *f_out, int nx, int ny, int nz) { int index=blockDim.x*blockIdx.x+threadIdx.x; int d11=index+dispc(0,-1,-1,nx,ny); int d12=index+dispc(0,-1,0,nx,ny); int d13=index+dispc(0,-1,1,nx,ny); int d14=index+dispc(0,-1,2,nx,ny); int d21=index+dispc(0,0,-1,nx,ny); int d22=index+dispc(0,0,0,nx,ny); int d23=index+dispc(0,0,1,nx,ny); int d24=index+dispc(0,0,2,nx,ny); int d31=index+dispc(0,1,-1,nx,ny); int d32=index+dispc(0,1,0,nx,ny); int d33=index+dispc(0,1,1,nx,ny); int d34=index+dispc(0,1,2,nx,ny); int d41=index+dispc(0,2,-1,nx,ny); int d42=index+dispc(0,2,0,nx,ny); int d43=index+dispc(0,2,1,nx,ny); int d44=index+dispc(0,2,2,nx,ny); if (d11>=0 && d44<(nx*ny*nz)){ f_out[index] = coef_int[0] * (coef_int[0]*f_in[d11] + coef_int[1]*f_in[d12] + coef_int[2]*f_in[d13] + coef_int[3]*f_in[d14]) + coef_int[1] * (coef_int[0]*f_in[d21] + coef_int[1]*f_in[d22] + coef_int[2]*f_in[d23] + coef_int[3]*f_in[d24])+ coef_int[2] * (coef_int[0]*f_in[d31] + coef_int[1]*f_in[d32] + coef_int[2]*f_in[d33] + coef_int[3]*f_in[d34]) + coef_int[3] * (coef_int[0]*f_in[d41] + coef_int[1]*f_in[d42] + coef_int[2]*f_in[d43] + coef_int[3]*f_in[d44]); } } __global__ void interp_v2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz){ interp_v2n_f4Device(f_in, f_out, nx, ny, nz); } extern "C" void interp_v2n_fd4_cuda_(double *f_in, double *f_out, int *nx, int *ny, int *nz){ int n = (*nx)*(*ny)*(*nz); dim3 threadsPerBlock = 256; dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x); interp_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } __global__ void gradpar_fd4(double *f_z, double *f_y, double *gradpar_y, double *gradpar_x, double *f_x, double gradpar_z, double *f_out, int nx, int ny, int nz){ int ix = blockDim.x*blockIdx.x+threadIdx.x; int iy = blockDim.y*blockIdx.y+threadIdx.y; int iz = blockDim.z*blockIdx.z+threadIdx.z; int index = dispc(ix,iy,iz,nx,ny); int index2d = dispc(ix,iy,0,nx,ny); if(ix<nx &iy<ny && iz<nz) f_out[index] = gradpar_z*f_z[index] + gradpar_y[index2d]*f_y[index]+ gradpar_x[index2d]*f_x[index]; } extern "C" void gradpar_fd4_cuda_(double *f_z, double *f_y, double *gradpar_y, double *gradpar_x, double *f_x, double *gradpar_z, double *f_out, int *nx, int *ny, int *nz){ dim3 numBlocks; int threadsxy = 16; int threadsz = 2; dim3 numThreads = dim3(threadsxy, threadsxy, threadsz); numBlocks.x = ((*nx + threadsxy -1) / threadsxy); numBlocks.y = ((*ny + threadsxy -1) / threadsxy); numBlocks.z = ((*nz + threadsz -1) / threadsz); gradpar_fd4<<<numBlocks, numThreads>>>(f_z, f_y, gradpar_y, gradpar_x, f_x, *gradpar_z, f_out, *nx, *ny, *nz); CUDA_SAFE_CALL(cudaDeviceSynchronize()); } - +extern "C" void synchronize_cuda_device_(){ + cudaDeviceSynchronize(); +} extern "C" void init_which_grad_(){ cudaMallocManaged((void **)&(which_grad), sizeof(int)*(3)); } diff --git a/test_gbs_gradients.F90 b/test_gbs_gradients.F90 index 6b77cb6..3e20421 100644 --- a/test_gbs_gradients.F90 +++ b/test_gbs_gradients.F90 @@ -1,102 +1,103 @@ program main use space_grid use gradients use prec_const use iso_fortran_env #ifdef CUDA use gradients_cuda #endif implicit none ! INPUT: Number of cells in each direction integer :: nx=2000,ny=2000,nz=4 ! real(dp), dimension(:,:,:), allocatable :: fin,fout ! array in input and array in output (computed from different gradients) real(dp), dimension(:,:,:), allocatable :: fout_omp ! same but computed with openmp integer :: i,j,k integer(di) :: startc, endc ! For timing real(dp) :: rate ! For timing #ifdef CUDA real(dp), pointer :: fout_cuda(:,:,:) real(dp), pointer :: fin_cuda(:,:,:) #endif call system_clock(count_rate=rate) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Compute mesh !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call compute_mesh(nx,ny,nz) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Initialize coefficient for gradients (as in gbs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call intialize_coefficients ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! allocate arrays (as in gbs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call gbs_allocate(fin,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fout,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fout_omp,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(gradpar_y_n,iysg,iyeg,ixsg,ixeg) call gbs_allocate(gradpar_x_n,iysg,iyeg,ixsg,ixeg) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! initialize input and output arrays !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call init_array(fin) call init_array(fout) call init_array(fout_omp) call init_array2d(gradpar_x_n) call init_array2d(gradpar_y_n) !$omp target enter data map (to: fin, gradpar_x_n,gradpar_y_n) !$acc enter data copyin(fin, gradpar_x_n,gradpar_y_n) #ifdef CUDA !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Test gradients with CUDA C !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call gbs_allocate_cuda(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(gradpar_y_n_cuda,iysg,iyeg,ixsg,ixeg) call gbs_allocate_cuda(gradpar_x_n_cuda,iysg,iyeg,ixsg,ixeg) call init_array(fin_cuda) call init_array2d(gradpar_x_n_cuda) call init_array2d(gradpar_y_n_cuda) call init_which_grad_ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call init_array(fout_cuda) call init_array(fout) call system_clock(startc) call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms" call system_clock(startc) call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) + CALL synchronize_cuda_device_() call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) #endif !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Test gradients with OpenMP and OpenACC !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call system_clock(startc) call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms" call system_clock(startc) ! call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:)) call gradz_n2n_fd4_omp(fin(:,:,:),fout_cuda(:,:,:)) call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize)), & maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize)) print*, "END TESTS" end program main