! -----------------------------------------------------------------------------
! Reviewer preamble (text ahead of the first "diff --git" header).
!
! What this is:
!   A unified git diff touching two files, gradients_mod.F90 and
!   test_gbs_gradients.F90.  The patch text below is kept exactly as received.
!   NOTE(review): the physical line breaks of the patch appear to have been
!   collapsed; the hunk headers declare 472/458 and 103/103 lines, which cannot
!   be checked from this form - confirm against the original patch.
!
! gradients_mod.F90 (module gradients):
!   * intialize_coefficients fills the module tables coef_int, coef_der1_stag,
!     coef_der2, coef_der1_n2n and coef_der2_stag.
!   * gradz_n2n_fd4, gradz_v2n_fd4, grady_v2n_fd4, interp_v2n_fd4 and
!     gradx_n2n_fd4 apply four-coefficient stencils over the iys:iye / ixs:ixe /
!     izs:ize index ranges; gradpar_v2n_fd4 combines their results as
!     gradpar_z*f_z + gradpar_y_n*f_y + gradpar_x_n*f_x_n.
!   * Each routine has an *_omp twin that writes the same expression as explicit
!     triple loops annotated with "!$omp target ..." and "!$acc parallel loop"
!     directives.
!
! What the first hunk changes (gradz_n2n_fd4_omp only):
!   * the is_device_ptr clause names f instead of f_z;
!   * the loop body assigns f_z = 1.*f instead of f_z = 1.*f_z, and the
!     assignment to the local "toto" is removed (its declaration remains);
!   * a second, duplicate target loop nest over the same indices is deleted.
!   NOTE(review): the four-point stencil in this routine is still commented out,
!   so after the patch the routine copies f into f_z rather than differentiating
!   it - this looks like a debugging state; confirm before relying on its output.
!
! What the second hunk changes (program main in test_gbs_gradients.F90):
!   * the call to gradz_n2n_fd4_omp now passes fout_cuda as input and fout_omp
!     as output (previously fin as input and fout_cuda as output).
!   NOTE(review): fout_cuda is declared only between "#ifdef CUDA" and "#endif",
!   while this call sits after the "#endif" and the program uses implicit none -
!   presumably a build without CUDA defined will not compile; confirm.
! -----------------------------------------------------------------------------
diff --git a/gradients_mod.F90 b/gradients_mod.F90 index eeef4ad..e6936c9 100644 --- a/gradients_mod.F90 +++ b/gradients_mod.F90 @@ -1,472 +1,458 @@ !**************************************************** !************** FIRST DERIVATIVE ******************** !**************************************************** !----------------------------------------------------------------------------- module gradients use space_grid use prec_const real(dp), DIMENSION(1:4), public, PROTECTED :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag real(dp), DIMENSION(1:5), public, PROTECTED :: coef_der2 !real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n real(dp), dimension(:,:), allocatable :: gradpar_y_n,gradpar_x_n real(dp), PUBLIC :: gradpar_z=0.5 ! It contains the sign of the toroidal magnetic field contains subroutine intialize_coefficients coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /) coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /) coef_der2 = (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /) coef_der1_n2n = (/ 1.0_dp/12.0_dp, -2.0_dp/3.0_dp, 2.0_dp/3.0_dp, -1.0_dp/12.0_dp /) coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /) end subroutine intialize_coefficients !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! subroutine gradz_n2n_fd4(f , f_z) use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_n2n(:) ! 
f_z(:,:,:) = nan_ do iz=izs,ize f_z(:, :, iz) = coef_der(1)*f(:, :, iz-2) & + coef_der(2)*f(:, :, iz-1) & + coef_der(3)*f(:, :, iz+1) & + coef_der(4)*f(:, :, iz+2) end do end subroutine gradz_n2n_fd4 subroutine gradz_n2n_fd4_omp(f , f_z) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz,iy,ix real(dp) :: toto real(dp), dimension(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ - !$omp target is_device_ptr(f_z) + !$omp target is_device_ptr(f) !$omp teams distribute parallel do simd collapse(3) ! map(from: f_z) !$acc parallel loop collapse(3) copyout(f_z) do iz=izs,ize do ix=ixsg,ixeg do iy=iysg,iyeg - toto = f_z(iy, ix, iz) * 1.2_dp - f_z(iy, ix, iz) = 1.*f_z(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2) & + f_z(iy, ix, iz) = 1.*f(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2) & ! + coef_der(2)*f(iy, ix, iz-1) & ! + coef_der(3)*f(iy, ix, iz+1) & ! + coef_der(4)*f(iy, ix, iz+2) end do end do end do !$omp end teams distribute parallel do simd !$omp end target - !$omp target teams distribute parallel do simd collapse(3) map(from: f_z) - do iz=izs,ize - do ix=ixsg,ixeg - do iy=iysg,iyeg - toto = f_z(iy, ix, iz) * 1.2_dp - f_z(iy, ix, iz) = 1.*f_z(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2) & -! + coef_der(2)*f(iy, ix, iz-1) & -! + coef_der(3)*f(iy, ix, iz+1) & -! + coef_der(4)*f(iy, ix, iz+2) - end do - end do - end do - !$omp end target teams distribute parallel do simd end subroutine gradz_n2n_fd4_omp subroutine gradz_v2n_fd4(f,f_z_v2n) use prec_const real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n integer :: iy, iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_stag(:) ! 
f_z_v2n(:,:,:) = nan_ do iy=iys,iye do iz=izs,ize f_z_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + coef_der(2)*f(iy-1, :, iz) & + coef_der(3)*f(iy-1, :, iz +1) & + coef_der(4)*f(iy-1, :, iz+2) )& + coef_int(2)*( coef_der(1)*f(iy, :, iz-1) & + coef_der(2)*f(iy, :, iz) & + coef_der(3)*f(iy, :, iz+1) & + coef_der(4)*f(iy, :, iz+2) )& + coef_int(3)*( coef_der(1)*f(iy+1, :, iz-1) & + coef_der(2)*f(iy+1, :, iz) & + coef_der(3)*f(iy+1, :, iz+1) & + coef_der(4)*f(iy+1, :, iz+2) )& + coef_int(4)*( coef_der(1)*f(iy+2, :, iz-1) & + coef_der(2)*f(iy+2, :, iz) & + coef_der(3)*f(iy+2, :, iz+1) & + coef_der(4)*f(iy+2, :, iz+2) ) end do end do end subroutine gradz_v2n_fd4 subroutine gradz_v2n_fd4_omp(f,f_z_v2n) use prec_const real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n integer :: iy, iz, ix real(dp), dimension(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_stag(:) ! f_z_v2n(:,:,:) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_z_v2n) !$acc parallel loop collapse(3) copyout(f_z_v2n) do iz=izs,ize do ix=ixsg,ixeg do iy=iys,iye f_z_v2n(iy,ix,iz) = coef_int(1)*( coef_der(1)*f(iy-1, ix, iz-1) & + coef_der(2)*f(iy-1, ix, iz) & + coef_der(3)*f(iy-1, ix, iz +1) & + coef_der(4)*f(iy-1, ix, iz+2) )& + coef_int(2)*( coef_der(1)*f(iy, ix, iz-1) & + coef_der(2)*f(iy, ix, iz) & + coef_der(3)*f(iy, ix, iz+1) & + coef_der(4)*f(iy, ix, iz+2) )& + coef_int(3)*( coef_der(1)*f(iy+1, ix, iz-1) & + coef_der(2)*f(iy+1, ix, iz) & + coef_der(3)*f(iy+1, ix, iz+1) & + coef_der(4)*f(iy+1, ix, iz+2) )& + coef_int(4)*( coef_der(1)*f(iy+2, ix, iz-1) & + coef_der(2)*f(iy+2, ix, iz) & + coef_der(3)*f(iy+2, ix, iz+1) & + coef_der(4)*f(iy+2, ix, iz+2) ) end do end do end do !$omp end target teams distribute parallel do simd end subroutine gradz_v2n_fd4_omp subroutine grady_v2n_fd4(f,f_y_v2n) use prec_const real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) 
:: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n integer :: iy, iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltayi*coef_der1_stag(:) ! f_y_v2n(:,:,:) = nan_ do iy=iys,iye do iz=izs,ize f_y_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + coef_der(2)*f(iy, :, iz-1) & + coef_der(3)*f(iy+1, :, iz-1) & + coef_der(4)*f(iy+2, :, iz-1) )& + coef_int(2)*( coef_der(1)*f(iy-1, :, iz) & + coef_der(2)*f(iy, :, iz) & + coef_der(3)*f(iy+1, :, iz) & + coef_der(4)*f(iy+2, :, iz) )& + coef_int(3)*( coef_der(1)*f(iy-1, :, iz+1) & + coef_der(2)*f(iy, :, iz+1) & + coef_der(3)*f(iy+1, :, iz+1) & + coef_der(4)*f(iy+2, :, iz+1) )& + coef_int(4)*( coef_der(1)*f(iy-1, :, iz+2) & + coef_der(2)*f(iy, :, iz+2) & + coef_der(3)*f(iy+1, :, iz+2) & + coef_der(4)*f(iy+2, :, iz+2) ) end do end do end subroutine grady_v2n_fd4 subroutine grady_v2n_fd4_omp(f,f_y_v2n) use prec_const real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n integer :: iy, iz, ix real(dp), dimension(1:4) :: coef_der coef_der(:) = deltayi*coef_der1_stag(:) ! 
f_y_v2n(:,:,:) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_y_v2n) !$acc parallel loop collapse(3) copyout(f_y_v2n) do iz=izs,ize do ix=ixsg,ixeg do iy=iys,iye f_y_v2n(iy,ix,iz) = coef_int(1)*( coef_der(1)*f(iy-1, ix, iz-1) & + coef_der(2)*f(iy, ix, iz-1) & + coef_der(3)*f(iy+1, ix, iz-1) & + coef_der(4)*f(iy+2, ix, iz-1) )& + coef_int(2)*( coef_der(1)*f(iy-1, ix, iz) & + coef_der(2)*f(iy, ix, iz) & + coef_der(3)*f(iy+1, ix, iz) & + coef_der(4)*f(iy+2, ix, iz) )& + coef_int(3)*( coef_der(1)*f(iy-1, ix, iz+1) & + coef_der(2)*f(iy, ix, iz+1) & + coef_der(3)*f(iy+1, ix, iz+1) & + coef_der(4)*f(iy+2, ix, iz+1) )& + coef_int(4)*( coef_der(1)*f(iy-1, ix, iz+2) & + coef_der(2)*f(iy, ix, iz+2) & + coef_der(3)*f(iy+1, ix, iz+2) & + coef_der(4)*f(iy+2, ix, iz+2) ) enddo end do end do !$omp end target teams distribute parallel do simd end subroutine grady_v2n_fd4_omp subroutine interp_v2n_fd4 ( f, f_int_v2n ) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n integer :: iy, iz ! 
f_int_v2n(:,:,:) = nan_ do iy=iys,iye do iz=izs,ize f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1) & + coef_int(2)*f(iy-1, : , iz) & + coef_int(3)*f(iy-1, :, iz +1) & + coef_int(4)*f(iy-1, :, iz+2) )& + coef_int(2)*( coef_int(1)*f(iy, :, iz-1) & + coef_int(2)*f(iy, :, iz) & + coef_int(3)*f(iy, :, iz+1) & + coef_int(4)*f(iy, :, iz+2) )& + coef_int(3)*( coef_int(1)*f(iy+1, :, iz-1) & + coef_int(2)*f(iy+1, :, iz) & + coef_int(3)*f(iy+1, :, iz+1) & + coef_int(4)*f(iy+1, :, iz+2) )& + coef_int(4)*( coef_int(1)*f(iy+2, :, iz-1) & + coef_int(2)*f(iy+2, :, iz) & + coef_int(3)*f(iy+2, :, iz+1) & + coef_int(4)*f(iy+2, :, iz+2) ) end do end do end subroutine interp_v2n_fd4 subroutine interp_v2n_fd4_omp ( f , f_int_v2n ) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n integer :: iy, iz, ix ! f_int_v2n(:,:,:) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_int_v2n) !$acc parallel loop collapse(3) copyout(f_int_v2n) do iz=izs,ize do ix=ixsg,ixeg do iy=iys,iye f_int_v2n(iy,ix,iz) = coef_int(1)*(coef_int(1)*f(iy-1, ix, iz-1) & + coef_int(2)*f(iy-1, ix , iz) & + coef_int(3)*f(iy-1, ix, iz +1) & + coef_int(4)*f(iy-1, ix, iz+2) )& + coef_int(2)*( coef_int(1)*f(iy, ix, iz-1) & + coef_int(2)*f(iy, ix, iz) & + coef_int(3)*f(iy, ix, iz+1) & + coef_int(4)*f(iy, ix, iz+2) )& + coef_int(3)*( coef_int(1)*f(iy+1, ix, iz-1) & + coef_int(2)*f(iy+1, ix, iz) & + coef_int(3)*f(iy+1, ix, iz+1) & + coef_int(4)*f(iy+1, ix, iz+2) )& + coef_int(4)*( coef_int(1)*f(iy+2, ix, iz-1) & + coef_int(2)*f(iy+2, ix, iz) & + coef_int(3)*f(iy+2, ix, iz+1) & + coef_int(4)*f(iy+2, ix, iz+2) ) end do end do end do !$omp end target teams distribute parallel do simd end subroutine interp_v2n_fd4_omp subroutine gradx_n2n_fd4(f,f_x) use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), 
DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_x integer :: ix real(dp), DIMENSION(1:4) :: coef_der coef_der(:)= deltaxi*coef_der1_n2n(:) ! f_x (:,:,: ) = nan_ do ix=ixs,ixe f_x(:, ix, :) = coef_der(1)*f(:, ix-2, :) & + coef_der(2)*f(:, ix-1, :) & + coef_der(3)*f(:, ix+1, :) & + coef_der(4)*f(:, ix+2, :) end do end subroutine gradx_n2n_fd4 subroutine gradx_n2n_fd4_omp(f,f_x) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_x integer :: ix,iy,iz real(dp), dimension(1:4) :: coef_der coef_der(:)= deltaxi*coef_der1_n2n(:) ! f_x (:,:,: ) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_x) !$acc parallel loop collapse(3) copyout(f_x) do iz=izsg,izeg do ix=ixs,ixe do iy=iysg,iyeg f_x(iy, ix, iz) = coef_der(1)*f(iy, ix-2, iz) & + coef_der(2)*f(iy, ix-1, iz) & + coef_der(3)*f(iy, ix+1, iz) & + coef_der(4)*f(iy, ix+2, iz) end do end do end do !$omp end target teams distribute parallel do simd end subroutine gradx_n2n_fd4_omp subroutine gradpar_v2n_fd4 ( f, f_grad) !flu and frd are useless here for periodic BC use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg) :: f_z, f_y, f_n, f_x_n !f_x_v integer :: ix, iy ! f_grad(:,:,:) = nan_ call gradz_v2n_fd4(f, f_z) call grady_v2n_fd4(f, f_y) ! its important to do the int first call interp_v2n_fd4(f, f_n ) call gradx_n2n_fd4(f_n, f_x_n) !call gradx_n2n_fd4(f, f_x_n) do ix = ixs,ixe do iy = iys, iye f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) & + gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize) end do end do end subroutine gradpar_v2n_fd4 ! 
parallel gradient for finite differences 4rth order from v grid to n grid subroutine gradpar_v2n_fd4_omp ( f, f_grad) !flu and frd are useless here for periodic BC use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg) :: f_z, f_y, f_n, f_x_n !f_x_v integer :: ix, iy,iz !$acc enter data create(f_z,f_y,f_n,f_x_n) !$omp target enter data map(alloc:f_z,f_y,f_n,f_x_n) ! f_grad(:,:,:) = nan_ call gradz_v2n_fd4_omp(f, f_z) call grady_v2n_fd4_omp(f, f_y) ! its important to do the int first call interp_v2n_fd4_omp(f, f_n ) call gradx_n2n_fd4_omp(f_n, f_x_n) !$omp target teams distribute parallel do simd collapse(3) map(from: f_grad) !$acc parallel loop collapse(3) copyout(f_grad) do iz = izs,ize do ix = ixs,ixe do iy = iys, iye f_grad(iy,ix,iz) = gradpar_z*f_z(iy,ix,iz) + gradpar_y_n(iy,ix)*f_y(iy,ix,iz) & + gradpar_x_n(iy,ix)*f_x_n(iy,ix,iz) end do end do end do !$omp end target teams distribute parallel do simd !$omp target exit data map(delete:f_z,f_y,f_n,f_x_n) !$acc exit data delete(f_z,f_y,f_n,f_x_n) end subroutine gradpar_v2n_fd4_omp end module gradients diff --git a/test_gbs_gradients.F90 b/test_gbs_gradients.F90 index 3e20421..a06d90e 100644 --- a/test_gbs_gradients.F90 +++ b/test_gbs_gradients.F90 @@ -1,103 +1,103 @@ program main use space_grid use gradients use prec_const use iso_fortran_env #ifdef CUDA use gradients_cuda #endif implicit none ! INPUT: Number of cells in each direction integer :: nx=2000,ny=2000,nz=4 ! real(dp), dimension(:,:,:), allocatable :: fin,fout ! array in input and array in output (computed from different gradients) real(dp), dimension(:,:,:), allocatable :: fout_omp ! same but computed with openmp integer :: i,j,k integer(di) :: startc, endc ! For timing real(dp) :: rate ! 
For timing #ifdef CUDA real(dp), pointer :: fout_cuda(:,:,:) real(dp), pointer :: fin_cuda(:,:,:) #endif call system_clock(count_rate=rate) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Compute mesh !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call compute_mesh(nx,ny,nz) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Initialize coefficient for gradients (as in gbs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call intialize_coefficients ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! allocate arrays (as in gbs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call gbs_allocate(fin,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fout,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fout_omp,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(gradpar_y_n,iysg,iyeg,ixsg,ixeg) call gbs_allocate(gradpar_x_n,iysg,iyeg,ixsg,ixeg) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! initialize input and output arrays !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call init_array(fin) call init_array(fout) call init_array(fout_omp) call init_array2d(gradpar_x_n) call init_array2d(gradpar_y_n) !$omp target enter data map (to: fin, gradpar_x_n,gradpar_y_n) !$acc enter data copyin(fin, gradpar_x_n,gradpar_y_n) #ifdef CUDA !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Test gradients with CUDA C !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call gbs_allocate_cuda(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(gradpar_y_n_cuda,iysg,iyeg,ixsg,ixeg) call gbs_allocate_cuda(gradpar_x_n_cuda,iysg,iyeg,ixsg,ixeg) call init_array(fin_cuda) call init_array2d(gradpar_x_n_cuda) call init_array2d(gradpar_y_n_cuda) call init_which_grad_ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
call init_array(fout_cuda) call init_array(fout) call system_clock(startc) call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms" call system_clock(startc) call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) CALL synchronize_cuda_device_() call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) #endif !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Test gradients with OpenMP and OpenACC !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call system_clock(startc) call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms" call system_clock(startc) ! call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:)) - call gradz_n2n_fd4_omp(fin(:,:,:),fout_cuda(:,:,:)) + call gradz_n2n_fd4_omp(fout_cuda(:,:,:),fout_omp(:,:,:)) call system_clock(endc) write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms" write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize)), & maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize)) print*, "END TESTS" end program main