diff --git a/gradients_cuda_mod.F90 b/gradients_cuda_mod.F90 index 354098a..0c346fb 100644 --- a/gradients_cuda_mod.F90 +++ b/gradients_cuda_mod.F90 @@ -1,57 +1,66 @@ module gradients_cuda use space_grid use gradients use prec_const use iso_c_binding real(dp), pointer :: gradpar_y_n_cuda(:,:),gradpar_x_n_cuda(:,:) contains subroutine gradx_n2n_cuda(f_in, f_out) real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out call gradx_n2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, & izeg-izsg+1, deltaxi) end subroutine gradx_n2n_cuda subroutine gradz_n2n_cuda(f_in, f_out) real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out call gradz_n2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, & izeg-izsg+1, deltazi) end subroutine gradz_n2n_cuda subroutine gradz_v2n_cuda(f_in, f_out) real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out CALL gradz_v2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, & izeg-izsg+1, deltazi) end subroutine gradz_v2n_cuda subroutine grady_v2n_cuda(f_in, f_out) real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out CALL grady_v2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, & izeg-izsg+1, deltayi) end subroutine grady_v2n_cuda subroutine interp_v2n_fd4cuda(f_in, f_out) real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out CALL interp_v2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, & izeg-izsg+1) end subroutine interp_v2n_fd4cuda subroutine gradpar_v2n_cuda(f_in, f_out) real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(out) :: f_out real(dp), dimension(:,:,:), pointer :: f_1,f_2,f_3,f_4 call gbs_allocate_cuda(f_1,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(f_2,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(f_3,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(f_4,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gradz_v2n_cuda(f_in, f_1) !call gradz_v2n_fd4(f, f_z) call grady_v2n_cuda(f_in, f_2) !call grady_v2n_fd4(f, f_y) call interp_v2n_fd4cuda(f_in, f_3) !call interp_v2n_fd4(f, flu, frd, f_n ) call gradx_n2n_cuda(f_3, f_4) !call gradx_n2n_fd4(f_n, f_x_n) call gradpar_fd4_cuda_(f_1, f_2, gradpar_y_n_cuda, gradpar_x_n_cuda, f_4,& gradpar_z, f_out, ixeg-ixsg+1, iyeg-iysg+1, izeg-izsg+1) end subroutine gradpar_v2n_cuda + subroutine wrapper_for_use_ptr(fin_cuda,fout_cuda) + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: fin_cuda + real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: fout_cuda + !$omp target data use_device_ptr(fin_cuda,fout_cuda) + call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) + !$omp end target data + CALL synchronize_cuda_device_() + end subroutine wrapper_for_use_ptr + end module gradients_cuda diff --git a/gradients_mod.F90 b/gradients_mod.F90 index c595a4e..78bde37 100644 --- a/gradients_mod.F90 +++ b/gradients_mod.F90 @@ -1,575 +1,575 @@ !**************************************************** !************** FIRST DERIVATIVE ******************** !**************************************************** !----------------------------------------------------------------------------- module gradients use space_grid use prec_const real(dp), DIMENSION(1:4), public, PUBLIC :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag,coef_der_global real(dp), DIMENSION(1:5), public, PUBLIC :: coef_der2 !real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n real(dp), dimension(:,:), allocatable :: gradpar_y_n,gradpar_x_n real(dp), PUBLIC :: gradpar_z=0.5 ! It contains the sign of the toroidal magnetic field real(dp), pointer :: fout_global(:,:,:) ! same but computed with openmp contains subroutine intialize_coefficients coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /) coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /) coef_der2 = (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /) coef_der1_n2n = (/ 1.0_dp/12.0_dp, -2.0_dp/3.0_dp, 2.0_dp/3.0_dp, -1.0_dp/12.0_dp /) coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /) end subroutine intialize_coefficients !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! subroutine gradz_n2n_fd4(f , f_z) use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ do iz=izs,ize f_z(:, :, iz) = coef_der(1)*f(:, :, iz-2) & + coef_der(2)*f(:, :, iz-1) & + coef_der(3)*f(:, :, iz+1) & + coef_der(4)*f(:, :, iz+2) end do end subroutine gradz_n2n_fd4 subroutine gradz_n2n_fd42(f , f_z) use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ do iz=izs,ize f_z(:, :, iz) = 2.3*f(:,:, iz-2) & + 2.4*f(:,:, iz-1) & + 2.6*f(:,:, iz+1) & + 2.7*f(:,:, iz+2) end do end subroutine gradz_n2n_fd42 subroutine gradz_n2n_fd4_omp2(f , f_z) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz,iy,ix real(dp) :: toto real(dp), dimension(1:4) :: coef_der ! coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ - !$omp target + !$omp target is_device_ptr(f,f_z) !$omp teams distribute parallel do simd collapse(3) ! map(from: f_z) do iz=izs,ize do ix=ixsg,ixeg do iy=iysg,iyeg f_z(iy, ix, iz) = coef_der_global(1)*f(iy, ix, iz-2) & + coef_der_global(2)*f(iy, ix, iz-1) & + coef_der_global(3)*f(iy, ix, iz+1) & + coef_der_global(4)*f(iy, ix, iz+2) end do end do end do !$omp end teams distribute parallel do simd !$omp end target end subroutine gradz_n2n_fd4_omp2 subroutine gradz_n2n_fd4_omp(f , f_z) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz,iy,ix real(dp) :: toto real(dp), dimension(1:4) :: coef_der ! !$omp declare target coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ - !$omp target map(from: f_z) is_device_ptr(f) ! + !$omp target is_device_ptr(f,f_z) ! !$omp teams distribute parallel do simd collapse(3) !$acc parallel loop collapse(3) copyout(f_z) do iz=izs,ize do ix=ixsg,ixeg do iy=iysg,iyeg f_z(iy, ix, iz) = 1.*f(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2) & ! + coef_der(2)*f(iy, ix, iz-1) & ! + coef_der(3)*f(iy, ix, iz+1) & ! + coef_der(4)*f(iy, ix, iz+2) end do end do end do !$omp end teams distribute parallel do simd !$omp end target end subroutine gradz_n2n_fd4_omp subroutine gradz_n2n_fd4_omp_without_ptr(f , f_z) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz,iy,ix real(dp) :: toto real(dp), dimension(1:4) :: coef_der ! !$omp declare target coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ !$omp target map(from: f_z) ! is_device_ptr(f) ! !$omp teams distribute parallel do simd collapse(3) !$acc parallel loop collapse(3) copyout(f_z) do iz=izs,ize do ix=ixsg,ixeg do iy=iysg,iyeg f_z(iy, ix, iz) = 1.*f(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2) & ! + coef_der(2)*f(iy, ix, iz-1) & ! + coef_der(3)*f(iy, ix, iz+1) & ! + coef_der(4)*f(iy, ix, iz+2) end do end do end do !$omp end teams distribute parallel do simd !$omp end target end subroutine gradz_n2n_fd4_omp_without_ptr subroutine gradz_n2n_fd4_omp_global(f_z) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z integer :: iz,iy,ix real(dp) :: toto real(dp), dimension(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_n2n(:) ! f_z(:,:,:) = nan_ !$omp target map(from: f_z) ! is_device_ptr(fout_global) ! !$omp teams distribute parallel do simd collapse(3) !$acc parallel loop collapse(3) copyout(f_z) do iz=izs,ize do ix=ixsg,ixeg do iy=iysg,iyeg f_z(iy, ix, iz) = 1.*fout_global(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2) & ! + coef_der(2)*f(iy, ix, iz-1) & ! + coef_der(3)*f(iy, ix, iz+1) & ! + coef_der(4)*f(iy, ix, iz+2) end do end do end do !$omp end teams distribute parallel do simd !$omp end target end subroutine gradz_n2n_fd4_omp_global subroutine gradz_v2n_fd4(f,f_z_v2n) use prec_const real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n integer :: iy, iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_stag(:) ! f_z_v2n(:,:,:) = nan_ do iy=iys,iye do iz=izs,ize f_z_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + coef_der(2)*f(iy-1, :, iz) & + coef_der(3)*f(iy-1, :, iz +1) & + coef_der(4)*f(iy-1, :, iz+2) )& + coef_int(2)*( coef_der(1)*f(iy, :, iz-1) & + coef_der(2)*f(iy, :, iz) & + coef_der(3)*f(iy, :, iz+1) & + coef_der(4)*f(iy, :, iz+2) )& + coef_int(3)*( coef_der(1)*f(iy+1, :, iz-1) & + coef_der(2)*f(iy+1, :, iz) & + coef_der(3)*f(iy+1, :, iz+1) & + coef_der(4)*f(iy+1, :, iz+2) )& + coef_int(4)*( coef_der(1)*f(iy+2, :, iz-1) & + coef_der(2)*f(iy+2, :, iz) & + coef_der(3)*f(iy+2, :, iz+1) & + coef_der(4)*f(iy+2, :, iz+2) ) end do end do end subroutine gradz_v2n_fd4 subroutine gradz_v2n_fd4_omp(f,f_z_v2n) use prec_const real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n integer :: iy, iz, ix real(dp), dimension(1:4) :: coef_der coef_der(:) = deltazi*coef_der1_stag(:) ! f_z_v2n(:,:,:) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_z_v2n) !$acc parallel loop collapse(3) copyout(f_z_v2n) do iz=izs,ize do ix=ixsg,ixeg do iy=iys,iye f_z_v2n(iy,ix,iz) = coef_int(1)*( coef_der(1)*f(iy-1, ix, iz-1) & + coef_der(2)*f(iy-1, ix, iz) & + coef_der(3)*f(iy-1, ix, iz +1) & + coef_der(4)*f(iy-1, ix, iz+2) )& + coef_int(2)*( coef_der(1)*f(iy, ix, iz-1) & + coef_der(2)*f(iy, ix, iz) & + coef_der(3)*f(iy, ix, iz+1) & + coef_der(4)*f(iy, ix, iz+2) )& + coef_int(3)*( coef_der(1)*f(iy+1, ix, iz-1) & + coef_der(2)*f(iy+1, ix, iz) & + coef_der(3)*f(iy+1, ix, iz+1) & + coef_der(4)*f(iy+1, ix, iz+2) )& + coef_int(4)*( coef_der(1)*f(iy+2, ix, iz-1) & + coef_der(2)*f(iy+2, ix, iz) & + coef_der(3)*f(iy+2, ix, iz+1) & + coef_der(4)*f(iy+2, ix, iz+2) ) end do end do end do !$omp end target teams distribute parallel do simd end subroutine gradz_v2n_fd4_omp subroutine grady_v2n_fd4(f,f_y_v2n) use prec_const real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n integer :: iy, iz real(dp), DIMENSION(1:4) :: coef_der coef_der(:) = deltayi*coef_der1_stag(:) ! f_y_v2n(:,:,:) = nan_ do iy=iys,iye do iz=izs,ize f_y_v2n(iy,:,iz) = coef_int(1)*( coef_der(1)*f(iy-1, :, iz-1) & + coef_der(2)*f(iy, :, iz-1) & + coef_der(3)*f(iy+1, :, iz-1) & + coef_der(4)*f(iy+2, :, iz-1) )& + coef_int(2)*( coef_der(1)*f(iy-1, :, iz) & + coef_der(2)*f(iy, :, iz) & + coef_der(3)*f(iy+1, :, iz) & + coef_der(4)*f(iy+2, :, iz) )& + coef_int(3)*( coef_der(1)*f(iy-1, :, iz+1) & + coef_der(2)*f(iy, :, iz+1) & + coef_der(3)*f(iy+1, :, iz+1) & + coef_der(4)*f(iy+2, :, iz+1) )& + coef_int(4)*( coef_der(1)*f(iy-1, :, iz+2) & + coef_der(2)*f(iy, :, iz+2) & + coef_der(3)*f(iy+1, :, iz+2) & + coef_der(4)*f(iy+2, :, iz+2) ) end do end do end subroutine grady_v2n_fd4 subroutine grady_v2n_fd4_omp(f,f_y_v2n) use prec_const real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n integer :: iy, iz, ix real(dp), dimension(1:4) :: coef_der coef_der(:) = deltayi*coef_der1_stag(:) ! f_y_v2n(:,:,:) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_y_v2n) !$acc parallel loop collapse(3) copyout(f_y_v2n) do iz=izs,ize do ix=ixsg,ixeg do iy=iys,iye f_y_v2n(iy,ix,iz) = coef_int(1)*( coef_der(1)*f(iy-1, ix, iz-1) & + coef_der(2)*f(iy, ix, iz-1) & + coef_der(3)*f(iy+1, ix, iz-1) & + coef_der(4)*f(iy+2, ix, iz-1) )& + coef_int(2)*( coef_der(1)*f(iy-1, ix, iz) & + coef_der(2)*f(iy, ix, iz) & + coef_der(3)*f(iy+1, ix, iz) & + coef_der(4)*f(iy+2, ix, iz) )& + coef_int(3)*( coef_der(1)*f(iy-1, ix, iz+1) & + coef_der(2)*f(iy, ix, iz+1) & + coef_der(3)*f(iy+1, ix, iz+1) & + coef_der(4)*f(iy+2, ix, iz+1) )& + coef_int(4)*( coef_der(1)*f(iy-1, ix, iz+2) & + coef_der(2)*f(iy, ix, iz+2) & + coef_der(3)*f(iy+1, ix, iz+2) & + coef_der(4)*f(iy+2, ix, iz+2) ) enddo end do end do !$omp end target teams distribute parallel do simd end subroutine grady_v2n_fd4_omp subroutine interp_v2n_fd4 ( f, f_int_v2n ) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n integer :: iy, iz ! f_int_v2n(:,:,:) = nan_ do iy=iys,iye do iz=izs,ize f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1) & + coef_int(2)*f(iy-1, : , iz) & + coef_int(3)*f(iy-1, :, iz +1) & + coef_int(4)*f(iy-1, :, iz+2) )& + coef_int(2)*( coef_int(1)*f(iy, :, iz-1) & + coef_int(2)*f(iy, :, iz) & + coef_int(3)*f(iy, :, iz+1) & + coef_int(4)*f(iy, :, iz+2) )& + coef_int(3)*( coef_int(1)*f(iy+1, :, iz-1) & + coef_int(2)*f(iy+1, :, iz) & + coef_int(3)*f(iy+1, :, iz+1) & + coef_int(4)*f(iy+1, :, iz+2) )& + coef_int(4)*( coef_int(1)*f(iy+2, :, iz-1) & + coef_int(2)*f(iy+2, :, iz) & + coef_int(3)*f(iy+2, :, iz+1) & + coef_int(4)*f(iy+2, :, iz+2) ) end do end do end subroutine interp_v2n_fd4 subroutine interp_v2n_fd4_omp ( f , f_int_v2n ) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n integer :: iy, iz, ix ! f_int_v2n(:,:,:) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_int_v2n) !$acc parallel loop collapse(3) copyout(f_int_v2n) do iz=izs,ize do ix=ixsg,ixeg do iy=iys,iye f_int_v2n(iy,ix,iz) = coef_int(1)*(coef_int(1)*f(iy-1, ix, iz-1) & + coef_int(2)*f(iy-1, ix , iz) & + coef_int(3)*f(iy-1, ix, iz +1) & + coef_int(4)*f(iy-1, ix, iz+2) )& + coef_int(2)*( coef_int(1)*f(iy, ix, iz-1) & + coef_int(2)*f(iy, ix, iz) & + coef_int(3)*f(iy, ix, iz+1) & + coef_int(4)*f(iy, ix, iz+2) )& + coef_int(3)*( coef_int(1)*f(iy+1, ix, iz-1) & + coef_int(2)*f(iy+1, ix, iz) & + coef_int(3)*f(iy+1, ix, iz+1) & + coef_int(4)*f(iy+1, ix, iz+2) )& + coef_int(4)*( coef_int(1)*f(iy+2, ix, iz-1) & + coef_int(2)*f(iy+2, ix, iz) & + coef_int(3)*f(iy+2, ix, iz+1) & + coef_int(4)*f(iy+2, ix, iz+2) ) end do end do end do !$omp end target teams distribute parallel do simd end subroutine interp_v2n_fd4_omp subroutine gradx_n2n_fd4(f,f_x) use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_x integer :: ix real(dp), DIMENSION(1:4) :: coef_der coef_der(:)= deltaxi*coef_der1_n2n(:) ! f_x (:,:,: ) = nan_ do ix=ixs,ixe f_x(:, ix, :) = coef_der(1)*f(:, ix-2, :) & + coef_der(2)*f(:, ix-1, :) & + coef_der(3)*f(:, ix+1, :) & + coef_der(4)*f(:, ix+2, :) end do end subroutine gradx_n2n_fd4 subroutine gradx_n2n_fd4_omp(f,f_x) use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_x integer :: ix,iy,iz real(dp), dimension(1:4) :: coef_der coef_der(:)= deltaxi*coef_der1_n2n(:) ! f_x (:,:,: ) = nan_ !$omp target teams distribute parallel do simd collapse(3) map(from: f_x) !$acc parallel loop collapse(3) copyout(f_x) do iz=izsg,izeg do ix=ixs,ixe do iy=iysg,iyeg f_x(iy, ix, iz) = coef_der(1)*f(iy, ix-2, iz) & + coef_der(2)*f(iy, ix-1, iz) & + coef_der(3)*f(iy, ix+1, iz) & + coef_der(4)*f(iy, ix+2, iz) end do end do end do !$omp end target teams distribute parallel do simd end subroutine gradx_n2n_fd4_omp subroutine gradpar_v2n_fd4 ( f, f_grad) !flu and frd are useless here for periodic BC use prec_const IMPLICIT none real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg) :: f_z, f_y, f_n, f_x_n !f_x_v integer :: ix, iy ! f_grad(:,:,:) = nan_ call gradz_v2n_fd4(f, f_z) call grady_v2n_fd4(f, f_y) ! its important to do the int first call interp_v2n_fd4(f, f_n ) call gradx_n2n_fd4(f_n, f_x_n) !call gradx_n2n_fd4(f, f_x_n) do ix = ixs,ixe do iy = iys, iye f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) & + gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize) end do end do end subroutine gradpar_v2n_fd4 ! parallel gradient for finite differences 4rth order from v grid to n grid subroutine gradpar_v2n_fd4_omp ( f, f_grad) !flu and frd are useless here for periodic BC use prec_const implicit none real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg) :: f_z, f_y, f_n, f_x_n !f_x_v integer :: ix, iy,iz !$acc enter data create(f_z,f_y,f_n,f_x_n) !$omp target enter data map(alloc:f_z,f_y,f_n,f_x_n) ! f_grad(:,:,:) = nan_ call gradz_v2n_fd4_omp(f, f_z) call grady_v2n_fd4_omp(f, f_y) ! its important to do the int first call interp_v2n_fd4_omp(f, f_n ) call gradx_n2n_fd4_omp(f_n, f_x_n) !$omp target teams distribute parallel do simd collapse(3) map(from: f_grad) !$acc parallel loop collapse(3) copyout(f_grad) do iz = izs,ize do ix = ixs,ixe do iy = iys, iye f_grad(iy,ix,iz) = gradpar_z*f_z(iy,ix,iz) + gradpar_y_n(iy,ix)*f_y(iy,ix,iz) & + gradpar_x_n(iy,ix)*f_x_n(iy,ix,iz) end do end do end do !$omp end target teams distribute parallel do simd !$omp target exit data map(delete:f_z,f_y,f_n,f_x_n) !$acc exit data delete(f_z,f_y,f_n,f_x_n) end subroutine gradpar_v2n_fd4_omp end module gradients diff --git a/test_gbs_gradients.F90 b/test_gbs_gradients.F90 index 8a7f1ff..c9c9cf2 100644 --- a/test_gbs_gradients.F90 +++ b/test_gbs_gradients.F90 @@ -1,163 +1,175 @@ program main use space_grid use gradients use prec_const use iso_fortran_env use omp_lib #ifdef CUDA use gradients_cuda #endif implicit none ! INPUT: Number of cells in each direction - integer :: nx=300,ny=600,nz=8 + integer :: nx=1000,ny=1800,nz=4 ! real(dp), dimension(:,:,:), allocatable :: fin,fout ! array in input and array in output (computed from different gradients) real(dp), dimension(:,:,:), allocatable :: fout_omp ! same but computed with openmp integer :: i,j,k real(dp) :: startc, endc ! For timing real(dp) :: rate,startomp,endomp ! For timing #ifdef CUDA real(dp), pointer :: fout_cuda(:,:,:) real(dp), pointer :: fin_cuda(:,:,:) #else real(dp), dimension(:,:,:), allocatable :: fin_cuda,fout_cuda ! array in input and array in output (computed from different gradients) #endif !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Compute mesh !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call compute_mesh(nx,ny,nz) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Initialize coefficient for gradients (as in gbs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call intialize_coefficients ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! allocate arrays (as in gbs) !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call gbs_allocate(fin,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fout,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fout_omp,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(gradpar_y_n,iysg,iyeg,ixsg,ixeg) call gbs_allocate(gradpar_x_n,iysg,iyeg,ixsg,ixeg) ! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! initialize input and output arrays !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call init_array(fin) call init_array(fout) call init_array(fout_omp) call init_array2d(gradpar_x_n) call init_array2d(gradpar_y_n) !$omp target enter data map (to: fin, gradpar_x_n,gradpar_y_n) !$acc enter data copyin(fin, gradpar_x_n,gradpar_y_n) #ifdef CUDA !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Test gradients with CUDA C !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call gbs_allocate_cuda(fout_global,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate_cuda(gradpar_y_n_cuda,iysg,iyeg,ixsg,ixeg) +!!! !$omp target enter data map(to:fin_cuda,fout_cuda) call gbs_allocate_cuda(gradpar_x_n_cuda,iysg,iyeg,ixsg,ixeg) call init_array(fin_cuda) call init_array2d(gradpar_x_n_cuda) call init_array2d(gradpar_y_n_cuda) call init_which_grad_ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! call init_array(fout_global) call init_array(fout_cuda) startc=omp_get_wtime() ! !$omp target enter data map (to:fout_cuda) ! !$omp target data map (fout_cuda) use_device_ptr(fout_cuda) endc=omp_get_wtime() write(*,*) "timing transfer fout_cuda: ", (endc-startc)*1000, "ms" call init_array(fout) startc=omp_get_wtime() call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) endc=omp_get_wtime() write(*,*) "timing for gradz_n2n_fd4 CPU: ", (endc-startc)*1000, "ms" startc=omp_get_wtime() call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) CALL synchronize_cuda_device_() endc=omp_get_wtime() write(*,*) "timing for calling 1st time CUDA: ", (endc-startc)*1000, "ms" write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) #else call gbs_allocate(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call gbs_allocate(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg) call init_array(fin_cuda) call init_array(fout_cuda) #endif !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! Test gradients with OpenMP and OpenACC !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! startc=omp_get_wtime() call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) endc=omp_get_wtime() write(*,*) "timing for gradz_n2n_fd4 CPU: ", (endc-startc)*1000, "ms" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! CUDA OPENMP comparison !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! print*, "! CUDA OPENMP comparison " call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:)) - ! !$omp target enter data map(to:fin_cuda,fout_cuda) coef_der_global(:) = deltazi*coef_der1_n2n(:) !$omp target enter data map(to:coef_der_global) +! !$omp target enter data map(to:fin_cuda,fout_cuda) #ifdef CUDA startomp=omp_get_wtime() call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) CALL synchronize_cuda_device_() endomp=omp_get_wtime() write(*,*) "timing for calling 1st time CUDA: ", (endomp-startomp)*1000, "ms" ! write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & ! maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) startomp=omp_get_wtime() call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) CALL synchronize_cuda_device_() endomp=omp_get_wtime() write(*,*) "timing for calling 2nd time CUDA: ", (endomp-startomp)*1000, "ms" ! write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & ! maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) #endif startomp=omp_get_wtime() ! call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:)) - call gradz_n2n_fd4_omp2(fin_cuda(:,:,:),fout_cuda(:,:,:)) + call gradz_n2n_fd4_omp(fout_cuda(:,:,:),fin_cuda(:,:,:)) endomp=omp_get_wtime() -! !$omp target exit data map(from:fout_cuda) - write(*,*) "timing for calling 1st time OPENMP with all the data transfer D2H+H2D: ", (endomp-startomp)*1000, "ms" +! !$omp target exit data map(from:fin_cuda) + write(*,*) "timing for calling 1st time OPENMP: ", (endomp-startomp)*1000, "ms" write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & - maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) + maxval(fout(iys:iye,ixs:ixe,izs:ize) - fin_cuda(iys:iye,ixs:ixe,izs:ize)) + + +!#ifdef CUDA +! startomp=omp_get_wtime() +! !$omp target data use_device_ptr(fin_cuda,fout_cuda) +! call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:)) +! !$omp end target data +! CALL synchronize_cuda_device_() +!! call wrapper_for_use_ptr(fin_cuda,fout_cuda) +! endomp=omp_get_wtime() +! write(*,*) "timing for calling intermediate time CUDA: ", (endomp-startomp)*1000, "ms" +!#endif - !$omp target enter data map(to:fin_cuda,fout_cuda) startomp=omp_get_wtime() ! call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:)) - call gradz_n2n_fd4_omp2(fin_cuda(:,:,:),fout_cuda(:,:,:)) + call gradz_n2n_fd4_omp(fin_cuda(:,:,:),fout_cuda(:,:,:)) endomp=omp_get_wtime() - !$omp target exit data map(from:fout_cuda) - write(*,*) "timing for calling 2nd time OPENMP with any transfer: ", (endomp-startomp)*1000, "ms" +! !$omp target exit data map(from:fout_cuda) + write(*,*) "timing for calling 2nd time OPENMP: ", (endomp-startomp)*1000, "ms" write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', & ! norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), & maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)) print*, "END TESTS" end program main