diff --git a/gradients_cuda_mod.F90 b/gradients_cuda_mod.F90
index 354098a..0c346fb 100644
--- a/gradients_cuda_mod.F90
+++ b/gradients_cuda_mod.F90
@@ -1,57 +1,66 @@
 module gradients_cuda
   use space_grid
   use gradients
   use prec_const
   use iso_c_binding
   real(dp), pointer :: gradpar_y_n_cuda(:,:),gradpar_x_n_cuda(:,:)
 contains
   subroutine gradx_n2n_cuda(f_in, f_out)
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
     call gradx_n2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, &
          izeg-izsg+1, deltaxi)
   end subroutine gradx_n2n_cuda
   subroutine gradz_n2n_cuda(f_in, f_out)
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
     call gradz_n2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, &
          izeg-izsg+1, deltazi)
   end subroutine gradz_n2n_cuda
   subroutine gradz_v2n_cuda(f_in, f_out)
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
     CALL gradz_v2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, &
          izeg-izsg+1, deltazi)
   end subroutine gradz_v2n_cuda
   subroutine grady_v2n_cuda(f_in, f_out)
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
     CALL grady_v2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, &
          izeg-izsg+1, deltayi)
   end subroutine grady_v2n_cuda
 
   subroutine interp_v2n_fd4cuda(f_in, f_out)
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
     CALL interp_v2n_fd4_cuda_(f_in, f_out, ixeg-ixsg+1, iyeg-iysg+1, &
          izeg-izsg+1)
   end subroutine interp_v2n_fd4cuda
 
   subroutine gradpar_v2n_cuda(f_in, f_out)
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in
     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(out) :: f_out
     real(dp), dimension(:,:,:), pointer :: f_1,f_2,f_3,f_4
     call gbs_allocate_cuda(f_1,iysg,iyeg,ixsg,ixeg,izsg,izeg)
     call gbs_allocate_cuda(f_2,iysg,iyeg,ixsg,ixeg,izsg,izeg)
     call gbs_allocate_cuda(f_3,iysg,iyeg,ixsg,ixeg,izsg,izeg)
     call gbs_allocate_cuda(f_4,iysg,iyeg,ixsg,ixeg,izsg,izeg)
     call gradz_v2n_cuda(f_in, f_1)       !call gradz_v2n_fd4(f, f_z)                                                                                                               
     call grady_v2n_cuda(f_in, f_2)       !call grady_v2n_fd4(f, f_y)                                                                                                               
     call interp_v2n_fd4cuda(f_in, f_3)   !call interp_v2n_fd4(f, flu, frd, f_n )                                                                                                   
     call gradx_n2n_cuda(f_3, f_4)        !call gradx_n2n_fd4(f_n,  f_x_n)                                                                                                          
     call gradpar_fd4_cuda_(f_1, f_2, gradpar_y_n_cuda, gradpar_x_n_cuda, f_4,&
          gradpar_z, f_out, ixeg-ixsg+1, iyeg-iysg+1, izeg-izsg+1)
   end subroutine gradpar_v2n_cuda
 
+  subroutine wrapper_for_use_ptr(fin_cuda,fout_cuda)
+    real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: fin_cuda      
+    real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: fout_cuda
+    !$omp target data use_device_ptr(fin_cuda,fout_cuda)
+    call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
+    !$omp end target data
+    CALL synchronize_cuda_device_()  
+  end subroutine wrapper_for_use_ptr
+
 
   end module gradients_cuda
diff --git a/gradients_mod.F90 b/gradients_mod.F90
index c595a4e..78bde37 100644
--- a/gradients_mod.F90
+++ b/gradients_mod.F90
@@ -1,575 +1,575 @@
 !****************************************************
 !************** FIRST DERIVATIVE ********************
 !****************************************************
 
 !-----------------------------------------------------------------------------
 
 module gradients
 use space_grid
 use prec_const
 real(dp), DIMENSION(1:4), public, PUBLIC :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag,coef_der_global
 real(dp), DIMENSION(1:5), public, PUBLIC :: coef_der2
 !real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n
 real(dp), dimension(:,:), allocatable :: gradpar_y_n,gradpar_x_n
 real(dp), PUBLIC :: gradpar_z=0.5 ! It contains the sign of the toroidal magnetic field
 real(dp), pointer :: fout_global(:,:,:) ! same but computed with openmp     
 contains
 
 subroutine intialize_coefficients
     coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /)
     coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /)
     coef_der2 =  (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /)
     coef_der1_n2n = (/  1.0_dp/12.0_dp,  -2.0_dp/3.0_dp,  2.0_dp/3.0_dp,  -1.0_dp/12.0_dp  /)
     coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /)
 
 end subroutine intialize_coefficients
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine gradz_n2n_fd4(f , f_z)
 
   use prec_const
 
   IMPLICIT none
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz
   real(dp), DIMENSION(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_n2n(:)
 !  f_z(:,:,:) = nan_
   do iz=izs,ize
      f_z(:, :, iz) =    coef_der(1)*f(:, :, iz-2)  &
           + coef_der(2)*f(:, :, iz-1)  &
           + coef_der(3)*f(:, :, iz+1)  &
           + coef_der(4)*f(:, :, iz+2)  
   end do
 
 
 end subroutine gradz_n2n_fd4
 
 subroutine gradz_n2n_fd42(f , f_z)
 
   use prec_const
 
   IMPLICIT none
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz
   real(dp), DIMENSION(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_n2n(:)
 !  f_z(:,:,:) = nan_
   do iz=izs,ize
      f_z(:, :, iz) =    2.3*f(:,:, iz-2)  &
                 + 2.4*f(:,:, iz-1)  &
                 + 2.6*f(:,:, iz+1)  &
                 + 2.7*f(:,:, iz+2)
   end do
 
 end subroutine gradz_n2n_fd42
 
 subroutine gradz_n2n_fd4_omp2(f , f_z)
 
   use prec_const
 
   implicit none
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz,iy,ix
   real(dp) :: toto
   real(dp), dimension(1:4) :: coef_der 
 
 !  coef_der(:) = deltazi*coef_der1_n2n(:)
   !  f_z(:,:,:) = nan_
-  !$omp target
+  !$omp target is_device_ptr(f,f_z)
   !$omp teams distribute parallel do simd collapse(3) ! map(from: f_z)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iysg,iyeg
            f_z(iy, ix, iz) = coef_der_global(1)*f(iy, ix, iz-2)  &
                 + coef_der_global(2)*f(iy, ix, iz-1)  &
                 + coef_der_global(3)*f(iy, ix, iz+1)  &
                 + coef_der_global(4)*f(iy, ix, iz+2)  
         end do
      end do
   end do
   !$omp end teams distribute parallel do simd
   !$omp end target
 
 end subroutine gradz_n2n_fd4_omp2
 
 subroutine gradz_n2n_fd4_omp(f , f_z)
 
   use prec_const
 
   implicit none
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz,iy,ix
   real(dp) :: toto
   real(dp), dimension(1:4) :: coef_der 
 !  !$omp declare target
   coef_der(:) = deltazi*coef_der1_n2n(:)
   !  f_z(:,:,:) = nan_
-  !$omp target map(from: f_z) is_device_ptr(f) !  
+  !$omp target is_device_ptr(f,f_z) !  
   !$omp teams distribute parallel do simd collapse(3)
   !$acc parallel loop collapse(3) copyout(f_z) 
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iysg,iyeg
            f_z(iy, ix, iz) = 1.*f(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2)  &
 !                + coef_der(2)*f(iy, ix, iz-1)  &
 !                + coef_der(3)*f(iy, ix, iz+1)  &
 !                + coef_der(4)*f(iy, ix, iz+2)  
         end do
      end do
   end do
   !$omp end teams distribute parallel do simd
   !$omp end target
 
 
 end subroutine gradz_n2n_fd4_omp
 
 subroutine gradz_n2n_fd4_omp_without_ptr(f , f_z)
 
   use prec_const
 
   implicit none
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz,iy,ix
   real(dp) :: toto
   real(dp), dimension(1:4) :: coef_der 
 !  !$omp declare target
   coef_der(:) = deltazi*coef_der1_n2n(:)
   !  f_z(:,:,:) = nan_
   !$omp target map(from: f_z) ! is_device_ptr(f) !  
   !$omp teams distribute parallel do simd collapse(3)
   !$acc parallel loop collapse(3) copyout(f_z) 
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iysg,iyeg
            f_z(iy, ix, iz) = 1.*f(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2)  &
 !                + coef_der(2)*f(iy, ix, iz-1)  &
 !                + coef_der(3)*f(iy, ix, iz+1)  &
 !                + coef_der(4)*f(iy, ix, iz+2)  
         end do
      end do
   end do
   !$omp end teams distribute parallel do simd
   !$omp end target
 
 
 end subroutine gradz_n2n_fd4_omp_without_ptr
 
 subroutine gradz_n2n_fd4_omp_global(f_z)
 
   use prec_const
 
   implicit none
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz,iy,ix
   real(dp) :: toto
   real(dp), dimension(1:4) :: coef_der 
 
   coef_der(:) = deltazi*coef_der1_n2n(:)
   !  f_z(:,:,:) = nan_
   !$omp target map(from: f_z) ! is_device_ptr(fout_global) !  
   !$omp teams distribute parallel do simd collapse(3)
   !$acc parallel loop collapse(3) copyout(f_z) 
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iysg,iyeg
            f_z(iy, ix, iz) = 1.*fout_global(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2)  &
 !                + coef_der(2)*f(iy, ix, iz-1)  &
 !                + coef_der(3)*f(iy, ix, iz+1)  &
 !                + coef_der(4)*f(iy, ix, iz+2)  
         end do
      end do
   end do
   !$omp end teams distribute parallel do simd
   !$omp end target
 
 
 end subroutine gradz_n2n_fd4_omp_global
 
 
 subroutine gradz_v2n_fd4(f,f_z_v2n)                 
 
   use prec_const
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n
   integer :: iy, iz
   real(dp), DIMENSION(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_stag(:)
 
 !  f_z_v2n(:,:,:) = nan_
   do iy=iys,iye
      do iz=izs,ize
 
         f_z_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
              + coef_der(2)*f(iy-1, :, iz)     &
              + coef_der(3)*f(iy-1, :, iz +1)  &
              + coef_der(4)*f(iy-1, :, iz+2)  )&
              
              +  coef_int(2)*(   coef_der(1)*f(iy, :, iz-1)     &
              + coef_der(2)*f(iy, :, iz)       &
              + coef_der(3)*f(iy, :, iz+1)     &
              + coef_der(4)*f(iy, :, iz+2)    )&
              
              +  coef_int(3)*(   coef_der(1)*f(iy+1, :, iz-1)    &
              + coef_der(2)*f(iy+1, :, iz)      &
              + coef_der(3)*f(iy+1, :, iz+1)    &
              + coef_der(4)*f(iy+1, :, iz+2)   )&
              
              + coef_int(4)*(   coef_der(1)*f(iy+2, :, iz-1)    &
              + coef_der(2)*f(iy+2, :, iz)      &
              + coef_der(3)*f(iy+2, :, iz+1)    &
              + coef_der(4)*f(iy+2, :, iz+2)   )
 
      end do
   end do
 
 end subroutine gradz_v2n_fd4
 
 
 subroutine gradz_v2n_fd4_omp(f,f_z_v2n)                 
 
   use prec_const
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n
   integer :: iy, iz, ix
   real(dp), dimension(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_stag(:)
 
   !  f_z_v2n(:,:,:) = nan_
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_z_v2n)
   !$acc parallel loop collapse(3) copyout(f_z_v2n)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iys,iye
 
            f_z_v2n(iy,ix,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, ix, iz-1)   &
                 + coef_der(2)*f(iy-1, ix, iz)     &
                 + coef_der(3)*f(iy-1, ix, iz +1)  &
                 + coef_der(4)*f(iy-1, ix, iz+2)  )&
                 
                 +  coef_int(2)*(   coef_der(1)*f(iy, ix, iz-1)     &
                 + coef_der(2)*f(iy, ix, iz)       &
                 + coef_der(3)*f(iy, ix, iz+1)     &
                 + coef_der(4)*f(iy, ix, iz+2)    )&
                 
                 +  coef_int(3)*(   coef_der(1)*f(iy+1, ix, iz-1)    &
                 + coef_der(2)*f(iy+1, ix, iz)      &
                 + coef_der(3)*f(iy+1, ix, iz+1)    &
                 + coef_der(4)*f(iy+1, ix, iz+2)   )&
                 
                 + coef_int(4)*(   coef_der(1)*f(iy+2, ix, iz-1)    &
                 + coef_der(2)*f(iy+2, ix, iz)      &
                 + coef_der(3)*f(iy+2, ix, iz+1)    &
                 + coef_der(4)*f(iy+2, ix, iz+2)   )
 	end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine gradz_v2n_fd4_omp
 
 
 
 subroutine grady_v2n_fd4(f,f_y_v2n)
 
   use prec_const
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n
   integer :: iy, iz
   real(dp), DIMENSION(1:4) :: coef_der 
 
   coef_der(:) = deltayi*coef_der1_stag(:)
 
 !  f_y_v2n(:,:,:) = nan_
 
   do iy=iys,iye
      do iz=izs,ize
 
         f_y_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
              + coef_der(2)*f(iy, :, iz-1)     &
              + coef_der(3)*f(iy+1, :, iz-1)  &
              + coef_der(4)*f(iy+2, :, iz-1)  )&
              
              + coef_int(2)*(   coef_der(1)*f(iy-1, :, iz)     &
              + coef_der(2)*f(iy, :, iz)       &
              + coef_der(3)*f(iy+1, :, iz)     &
              + coef_der(4)*f(iy+2, :, iz)    )&
              
              + coef_int(3)*(   coef_der(1)*f(iy-1, :, iz+1)    &
              + coef_der(2)*f(iy, :, iz+1)      &
              + coef_der(3)*f(iy+1, :, iz+1)    &
              + coef_der(4)*f(iy+2, :, iz+1)   )&
              
              + coef_int(4)*(   coef_der(1)*f(iy-1, :, iz+2)    &
              + coef_der(2)*f(iy, :, iz+2)      &
              + coef_der(3)*f(iy+1, :, iz+2)    &
              + coef_der(4)*f(iy+2, :, iz+2)   )
 
      end do
   end do
 
 
 end subroutine grady_v2n_fd4
 
 subroutine grady_v2n_fd4_omp(f,f_y_v2n)
 
   use prec_const
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n
   integer :: iy, iz, ix
   real(dp), dimension(1:4) :: coef_der 
 
   coef_der(:) = deltayi*coef_der1_stag(:)
 
   !  f_y_v2n(:,:,:) = nan_
 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_y_v2n)
   !$acc parallel loop collapse(3) copyout(f_y_v2n)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iys,iye
 
            f_y_v2n(iy,ix,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, ix, iz-1)   &
                 + coef_der(2)*f(iy, ix, iz-1)     &
                 + coef_der(3)*f(iy+1, ix, iz-1)  &
                 + coef_der(4)*f(iy+2, ix, iz-1)  )&
                 
                 + coef_int(2)*(   coef_der(1)*f(iy-1, ix, iz)     &
                 + coef_der(2)*f(iy, ix, iz)       &
                 + coef_der(3)*f(iy+1, ix, iz)     &
                 + coef_der(4)*f(iy+2, ix, iz)    )&
                 
                 + coef_int(3)*(   coef_der(1)*f(iy-1, ix, iz+1)    &
                 + coef_der(2)*f(iy, ix, iz+1)      &
                 + coef_der(3)*f(iy+1, ix, iz+1)    &
                 + coef_der(4)*f(iy+2, ix, iz+1)   )&
                 
                 + coef_int(4)*(   coef_der(1)*f(iy-1, ix, iz+2)    &
                 + coef_der(2)*f(iy, ix, iz+2)      &
                 + coef_der(3)*f(iy+1, ix, iz+2)    &
                 + coef_der(4)*f(iy+2, ix, iz+2)   )
 	enddo
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine grady_v2n_fd4_omp
 
 subroutine interp_v2n_fd4 ( f,  f_int_v2n )
 
   use prec_const
   implicit none
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n
   integer :: iy, iz
 
 !  f_int_v2n(:,:,:) = nan_
   do iy=iys,iye
      do iz=izs,ize
 
         f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1)   &
              + coef_int(2)*f(iy-1, : , iz)     &
              + coef_int(3)*f(iy-1, :, iz +1)  &
              + coef_int(4)*f(iy-1, :, iz+2)  )&
              
              +  coef_int(2)*(   coef_int(1)*f(iy, :, iz-1)     &
              + coef_int(2)*f(iy, :, iz)       &
              + coef_int(3)*f(iy, :, iz+1)     &
              + coef_int(4)*f(iy, :, iz+2)    )&
              
              +  coef_int(3)*(   coef_int(1)*f(iy+1, :, iz-1)    &
              + coef_int(2)*f(iy+1, :, iz)      &
              + coef_int(3)*f(iy+1, :, iz+1)    &
              + coef_int(4)*f(iy+1, :, iz+2)   )&
              
              + coef_int(4)*(   coef_int(1)*f(iy+2, :, iz-1)    &
              + coef_int(2)*f(iy+2, :, iz)      &
              + coef_int(3)*f(iy+2, :, iz+1)    &
              + coef_int(4)*f(iy+2, :, iz+2)   )
 
      end do
   end do
 
 
 end subroutine interp_v2n_fd4
 
 
 subroutine interp_v2n_fd4_omp ( f , f_int_v2n )
 
   use prec_const
   implicit none
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n
   integer :: iy, iz, ix
 
   !  f_int_v2n(:,:,:) = nan_
 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_int_v2n)
   !$acc parallel loop collapse(3) copyout(f_int_v2n)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iys,iye
            f_int_v2n(iy,ix,iz) = coef_int(1)*(coef_int(1)*f(iy-1, ix, iz-1)   &
                 + coef_int(2)*f(iy-1, ix , iz)     &
                 + coef_int(3)*f(iy-1, ix, iz +1)  &
                 + coef_int(4)*f(iy-1, ix, iz+2)  )&
                 
                 +  coef_int(2)*(   coef_int(1)*f(iy, ix, iz-1)     &
                 + coef_int(2)*f(iy, ix, iz)       &
                 + coef_int(3)*f(iy, ix, iz+1)     &
                 + coef_int(4)*f(iy, ix, iz+2)    )&
                 
                 +  coef_int(3)*(   coef_int(1)*f(iy+1, ix, iz-1)    &
                 + coef_int(2)*f(iy+1, ix, iz)      &
                 + coef_int(3)*f(iy+1, ix, iz+1)    &
                 + coef_int(4)*f(iy+1, ix, iz+2)   )&
                 
                 + coef_int(4)*(   coef_int(1)*f(iy+2, ix, iz-1)    &
                 + coef_int(2)*f(iy+2, ix, iz)      &
                 + coef_int(3)*f(iy+2, ix, iz+1)    &
                 + coef_int(4)*f(iy+2, ix, iz+2)   )
         end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine interp_v2n_fd4_omp
 
 
 subroutine gradx_n2n_fd4(f,f_x)
   use prec_const
 
   IMPLICIT none
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out)  :: f_x
   integer :: ix
   real(dp), DIMENSION(1:4) :: coef_der
   coef_der(:)= deltaxi*coef_der1_n2n(:)                                                          
 
 !  f_x (:,:,: ) =  nan_
 
   do ix=ixs,ixe
      f_x(:, ix, :) =   coef_der(1)*f(:, ix-2, :)  &
           + coef_der(2)*f(:, ix-1, :)  &
           + coef_der(3)*f(:, ix+1, :)    &
           + coef_der(4)*f(:, ix+2, :)  
 
   end do
 
 
 end subroutine gradx_n2n_fd4
 
 subroutine gradx_n2n_fd4_omp(f,f_x)
   use prec_const
 
   implicit none
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out)  :: f_x
   integer :: ix,iy,iz
   real(dp), dimension(1:4) :: coef_der
   coef_der(:)= deltaxi*coef_der1_n2n(:)                                                          
 
   !  f_x (:,:,: ) =  nan_
 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_x)
   !$acc parallel loop collapse(3) copyout(f_x)
   do iz=izsg,izeg
      do ix=ixs,ixe
         do iy=iysg,iyeg
            f_x(iy, ix, iz) =   coef_der(1)*f(iy, ix-2, iz)  &
                 + coef_der(2)*f(iy, ix-1, iz)  &
                 + coef_der(3)*f(iy, ix+1, iz)    &
                 + coef_der(4)*f(iy, ix+2, iz)  
         end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine gradx_n2n_fd4_omp
 
 
 
 
 subroutine gradpar_v2n_fd4 ( f,  f_grad) 
   !flu and frd are useless here for periodic BC 
   use prec_const
 
   IMPLICIT none
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg)      :: f_z, f_y, f_n, f_x_n  !f_x_v
 
   integer :: ix, iy
 
 !  f_grad(:,:,:) = nan_
   call gradz_v2n_fd4(f, f_z)
   call grady_v2n_fd4(f, f_y)   
   ! its  important to do the int first
   call interp_v2n_fd4(f,  f_n ) 
   call gradx_n2n_fd4(f_n,  f_x_n) 
   !call gradx_n2n_fd4(f,  f_x_n) 
 
   do ix = ixs,ixe   
      do iy = iys, iye
         f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) &
              +                          gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize)
      end do
   end do
 
 end subroutine gradpar_v2n_fd4
 
 ! parallel gradient for finite differences 4rth order from v  grid to n grid
 subroutine gradpar_v2n_fd4_omp ( f, f_grad) 
   !flu and frd are useless here for periodic BC 
   use prec_const
 
   implicit none
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg)      :: f_z, f_y, f_n, f_x_n  !f_x_v
 
   integer :: ix, iy,iz
 
   !$acc enter data create(f_z,f_y,f_n,f_x_n)  
   !$omp target enter data map(alloc:f_z,f_y,f_n,f_x_n)
   !  f_grad(:,:,:) = nan_
 
   call gradz_v2n_fd4_omp(f, f_z)
   call grady_v2n_fd4_omp(f, f_y)   
   ! its  important to do the int first
   call interp_v2n_fd4_omp(f, f_n ) 
   call gradx_n2n_fd4_omp(f_n,  f_x_n) 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_grad)
   !$acc parallel loop collapse(3) copyout(f_grad)
   do iz = izs,ize
      do ix = ixs,ixe   
         do iy = iys, iye
            f_grad(iy,ix,iz) = gradpar_z*f_z(iy,ix,iz) + gradpar_y_n(iy,ix)*f_y(iy,ix,iz) &
                 +                          gradpar_x_n(iy,ix)*f_x_n(iy,ix,iz)
         end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
   !$omp target exit data map(delete:f_z,f_y,f_n,f_x_n)
   !$acc exit data delete(f_z,f_y,f_n,f_x_n)  
 end subroutine gradpar_v2n_fd4_omp
 
 
 end module gradients
diff --git a/test_gbs_gradients.F90 b/test_gbs_gradients.F90
index 8a7f1ff..c9c9cf2 100644
--- a/test_gbs_gradients.F90
+++ b/test_gbs_gradients.F90
@@ -1,163 +1,175 @@
 program main
   use space_grid
   use gradients
   use prec_const
   use iso_fortran_env
   use omp_lib
 #ifdef CUDA
 use gradients_cuda
 #endif
   implicit none
 
   ! INPUT: Number of cells in each direction
-  integer :: nx=300,ny=600,nz=8
+  integer :: nx=1000,ny=1800,nz=4
   !
   real(dp), dimension(:,:,:), allocatable :: fin,fout ! array in input and array in output (computed from different gradients)     
   real(dp), dimension(:,:,:), allocatable :: fout_omp ! same but computed with openmp     
   integer :: i,j,k
 
   real(dp) :: startc, endc ! For timing
   real(dp) :: rate,startomp,endomp ! For timing
  
 #ifdef CUDA
  real(dp), pointer :: fout_cuda(:,:,:)
  real(dp), pointer :: fin_cuda(:,:,:)
 #else
   real(dp), dimension(:,:,:), allocatable :: fin_cuda,fout_cuda ! array in input and array in output (computed from different gradients)
 #endif
 
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Compute mesh  
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call compute_mesh(nx,ny,nz)
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Initialize coefficient for gradients (as in gbs) 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call intialize_coefficients
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! allocate arrays (as in gbs)  
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call gbs_allocate(fin,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fout,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fout_omp,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(gradpar_y_n,iysg,iyeg,ixsg,ixeg)
   call gbs_allocate(gradpar_x_n,iysg,iyeg,ixsg,ixeg)
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! initialize input and output arrays   
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call init_array(fin)
   call init_array(fout)
   call init_array(fout_omp)
   call init_array2d(gradpar_x_n)
   call init_array2d(gradpar_y_n)
   !$omp target enter data map (to: fin, gradpar_x_n,gradpar_y_n)
   !$acc enter data copyin(fin, gradpar_x_n,gradpar_y_n)
 #ifdef CUDA
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Test gradients with CUDA C
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   call gbs_allocate_cuda(fout_global,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(gradpar_y_n_cuda,iysg,iyeg,ixsg,ixeg)
+!!!  !$omp target enter data map(to:fin_cuda,fout_cuda)
   call gbs_allocate_cuda(gradpar_x_n_cuda,iysg,iyeg,ixsg,ixeg)
   call init_array(fin_cuda)
   call init_array2d(gradpar_x_n_cuda)
   call init_array2d(gradpar_y_n_cuda)
   call init_which_grad_
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call init_array(fout_global)
   call init_array(fout_cuda)
   startc=omp_get_wtime()
 !  !$omp target enter data map (to:fout_cuda)     
 !  !$omp target data map (fout_cuda) use_device_ptr(fout_cuda)     
   endc=omp_get_wtime()
   write(*,*) "timing transfer fout_cuda: ", (endc-startc)*1000, "ms"
   call init_array(fout)
   startc=omp_get_wtime()
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
   endc=omp_get_wtime()
   write(*,*) "timing for gradz_n2n_fd4 CPU: ", (endc-startc)*1000, "ms"
   startc=omp_get_wtime()
   call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
   CALL synchronize_cuda_device_()
   endc=omp_get_wtime()
   write(*,*) "timing for calling 1st time CUDA: ", (endc-startc)*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', &
 !      norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), &
       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
 #else
   call gbs_allocate(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call init_array(fin_cuda)
   call init_array(fout_cuda)
   
 
 #endif
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Test gradients with OpenMP and OpenACC
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   startc=omp_get_wtime()
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
   endc=omp_get_wtime()
   write(*,*) "timing for gradz_n2n_fd4 CPU: ", (endc-startc)*1000, "ms"
 
 
 
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! CUDA OPENMP comparison
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   print*, "! CUDA OPENMP comparison  "
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
-  !  !$omp target enter data map(to:fin_cuda,fout_cuda)
   coef_der_global(:) = deltazi*coef_der1_n2n(:)
   !$omp target enter data map(to:coef_der_global)
+!  !$omp target enter data map(to:fin_cuda,fout_cuda)
 
 #ifdef CUDA
   startomp=omp_get_wtime()
   call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
   CALL synchronize_cuda_device_()
   endomp=omp_get_wtime()
   write(*,*) "timing for calling 1st time CUDA: ", (endomp-startomp)*1000, "ms"
 !  write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', &
 !       norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), &
 !       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
   startomp=omp_get_wtime()
   call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
   CALL synchronize_cuda_device_()
   endomp=omp_get_wtime()
   write(*,*) "timing for calling 2nd time CUDA: ", (endomp-startomp)*1000, "ms"
 !  write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', &
 !       norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), &
 !       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
 #endif
   
   startomp=omp_get_wtime()
   !  call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:))
-  call gradz_n2n_fd4_omp2(fin_cuda(:,:,:),fout_cuda(:,:,:))
+  call gradz_n2n_fd4_omp(fout_cuda(:,:,:),fin_cuda(:,:,:))
   endomp=omp_get_wtime()
-!  !$omp target exit data map(from:fout_cuda)
-  write(*,*) "timing for calling 1st time OPENMP with all the data transfer D2H+H2D: ", (endomp-startomp)*1000, "ms"
+!  !$omp target exit data map(from:fin_cuda)
+  write(*,*) "timing for calling 1st time OPENMP: ", (endomp-startomp)*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', &
 !       norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), &
-       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
+       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fin_cuda(iys:iye,ixs:ixe,izs:ize))
+       
+
+!#ifdef CUDA
+!  startomp=omp_get_wtime()
+!  !$omp target data use_device_ptr(fin_cuda,fout_cuda)
+!  call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
+!  !$omp end target data
+!  CALL synchronize_cuda_device_()
+!!  call wrapper_for_use_ptr(fin_cuda,fout_cuda)  
+!  endomp=omp_get_wtime()
+!  write(*,*) "timing for calling intermediate time CUDA: ", (endomp-startomp)*1000, "ms"
+!#endif
 
-    !$omp target enter data map(to:fin_cuda,fout_cuda)     
   startomp=omp_get_wtime()
   !  call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:))
-  call gradz_n2n_fd4_omp2(fin_cuda(:,:,:),fout_cuda(:,:,:))
+  call gradz_n2n_fd4_omp(fin_cuda(:,:,:),fout_cuda(:,:,:))
   endomp=omp_get_wtime()
-    !$omp target exit data map(from:fout_cuda)     
-  write(*,*) "timing for calling 2nd time OPENMP with any transfer: ", (endomp-startomp)*1000, "ms"
+!    !$omp target exit data map(from:fout_cuda)     
+  write(*,*) "timing for calling 2nd time OPENMP: ", (endomp-startomp)*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', &
 !       norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), &
        maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
 
   print*, "END TESTS"
 end program main