diff --git a/gradients_mod.F90 b/gradients_mod.F90
index eeef4ad..e6936c9 100644
--- a/gradients_mod.F90
+++ b/gradients_mod.F90
@@ -1,472 +1,458 @@
 !****************************************************
 !************** FIRST DERIVATIVE ********************
 !****************************************************
 
 !-----------------------------------------------------------------------------
 
 module gradients
 use space_grid
 use prec_const
 real(dp), DIMENSION(1:4), public, PROTECTED :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag
 real(dp), DIMENSION(1:5), public, PROTECTED :: coef_der2
 !real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n
 real(dp), dimension(:,:), allocatable :: gradpar_y_n,gradpar_x_n
 real(dp), PUBLIC :: gradpar_z=0.5 ! It contains the sign of the toroidal magnetic field
 contains
 
 subroutine intialize_coefficients
     coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /)
     coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /)
     coef_der2 =  (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /)
     coef_der1_n2n = (/  1.0_dp/12.0_dp,  -2.0_dp/3.0_dp,  2.0_dp/3.0_dp,  -1.0_dp/12.0_dp  /)
     coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /)
 
 end subroutine intialize_coefficients
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine gradz_n2n_fd4(f , f_z)
 
   use prec_const
 
   IMPLICIT none
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz
   real(dp), DIMENSION(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_n2n(:)
 !  f_z(:,:,:) = nan_
   do iz=izs,ize
      f_z(:, :, iz) =    coef_der(1)*f(:, :, iz-2)  &
           + coef_der(2)*f(:, :, iz-1)  &
           + coef_der(3)*f(:, :, iz+1)  &
           + coef_der(4)*f(:, :, iz+2)  
   end do
 
 
 end subroutine gradz_n2n_fd4
 
 
 subroutine gradz_n2n_fd4_omp(f , f_z)
-
+  ! Offload variant of gradz_n2n_fd4 (work in progress): for iz in izs:ize it
+  ! copies f into f_z; the coef_der z stencil is still commented out below.
   use prec_const
-
   implicit none
-
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
   integer :: iz,iy,ix
-  real(dp) :: toto
   real(dp), dimension(1:4) :: coef_der 
 
   coef_der(:) = deltazi*coef_der1_n2n(:)
   !  f_z(:,:,:) = nan_
-  !$omp target is_device_ptr(f_z)
-  !$omp teams distribute parallel do simd collapse(3) ! map(from: f_z)
+  ! f is used as a device address; f_z is intent(out), so map it "from" only
+  ! (map belongs on target, not on teams distribute).
+  ! NOTE(review): OpenMP 5.1 prefers has_device_addr for non-C_PTR items.
+  !$omp target is_device_ptr(f) map(from: f_z)
+  !$omp teams distribute parallel do simd collapse(3)
   !$acc parallel loop collapse(3) copyout(f_z) 
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iysg,iyeg
-           toto = f_z(iy, ix, iz) * 1.2_dp
-           f_z(iy, ix, iz) = 1.*f_z(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2)  &
+           f_z(iy, ix, iz) = 1.0_dp*f(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2)  &
 !                + coef_der(2)*f(iy, ix, iz-1)  &
 !                + coef_der(3)*f(iy, ix, iz+1)  &
 !                + coef_der(4)*f(iy, ix, iz+2)  
         end do
      end do
   end do
   !$omp end teams distribute parallel do simd
   !$omp end target
 
-  !$omp target teams distribute parallel do simd collapse(3) map(from: f_z)
-  do iz=izs,ize
-     do ix=ixsg,ixeg
-        do iy=iysg,iyeg
-           toto = f_z(iy, ix, iz) * 1.2_dp
-           f_z(iy, ix, iz) = 1.*f_z(iy, ix, iz) ! coef_der(1)*f(iy, ix, iz-2)  &
-!                + coef_der(2)*f(iy, ix, iz-1)  &
-!                + coef_der(3)*f(iy, ix, iz+1)  &
-!                + coef_der(4)*f(iy, ix, iz+2)  
-        end do
-     end do
-  end do
-  !$omp end target teams distribute parallel do simd
-
 end subroutine gradz_n2n_fd4_omp
 
 
 subroutine gradz_v2n_fd4(f,f_z_v2n)                 
 
   use prec_const
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n
   integer :: iy, iz
   real(dp), DIMENSION(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_stag(:)
 
 !  f_z_v2n(:,:,:) = nan_
   do iy=iys,iye
      do iz=izs,ize
 
         f_z_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
              + coef_der(2)*f(iy-1, :, iz)     &
              + coef_der(3)*f(iy-1, :, iz +1)  &
              + coef_der(4)*f(iy-1, :, iz+2)  )&
              
              +  coef_int(2)*(   coef_der(1)*f(iy, :, iz-1)     &
              + coef_der(2)*f(iy, :, iz)       &
              + coef_der(3)*f(iy, :, iz+1)     &
              + coef_der(4)*f(iy, :, iz+2)    )&
              
              +  coef_int(3)*(   coef_der(1)*f(iy+1, :, iz-1)    &
              + coef_der(2)*f(iy+1, :, iz)      &
              + coef_der(3)*f(iy+1, :, iz+1)    &
              + coef_der(4)*f(iy+1, :, iz+2)   )&
              
              + coef_int(4)*(   coef_der(1)*f(iy+2, :, iz-1)    &
              + coef_der(2)*f(iy+2, :, iz)      &
              + coef_der(3)*f(iy+2, :, iz+1)    &
              + coef_der(4)*f(iy+2, :, iz+2)   )
 
      end do
   end do
 
 end subroutine gradz_v2n_fd4
 
 
 subroutine gradz_v2n_fd4_omp(f,f_z_v2n)                 
 
   use prec_const
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n
   integer :: iy, iz, ix
   real(dp), dimension(1:4) :: coef_der 
   coef_der(:) = deltazi*coef_der1_stag(:)
 
   !  f_z_v2n(:,:,:) = nan_
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_z_v2n)
   !$acc parallel loop collapse(3) copyout(f_z_v2n)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iys,iye
 
            f_z_v2n(iy,ix,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, ix, iz-1)   &
                 + coef_der(2)*f(iy-1, ix, iz)     &
                 + coef_der(3)*f(iy-1, ix, iz +1)  &
                 + coef_der(4)*f(iy-1, ix, iz+2)  )&
                 
                 +  coef_int(2)*(   coef_der(1)*f(iy, ix, iz-1)     &
                 + coef_der(2)*f(iy, ix, iz)       &
                 + coef_der(3)*f(iy, ix, iz+1)     &
                 + coef_der(4)*f(iy, ix, iz+2)    )&
                 
                 +  coef_int(3)*(   coef_der(1)*f(iy+1, ix, iz-1)    &
                 + coef_der(2)*f(iy+1, ix, iz)      &
                 + coef_der(3)*f(iy+1, ix, iz+1)    &
                 + coef_der(4)*f(iy+1, ix, iz+2)   )&
                 
                 + coef_int(4)*(   coef_der(1)*f(iy+2, ix, iz-1)    &
                 + coef_der(2)*f(iy+2, ix, iz)      &
                 + coef_der(3)*f(iy+2, ix, iz+1)    &
                 + coef_der(4)*f(iy+2, ix, iz+2)   )
 	end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine gradz_v2n_fd4_omp
 
 
 
 subroutine grady_v2n_fd4(f,f_y_v2n)
 
   use prec_const
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n
   integer :: iy, iz
   real(dp), DIMENSION(1:4) :: coef_der 
 
   coef_der(:) = deltayi*coef_der1_stag(:)
 
 !  f_y_v2n(:,:,:) = nan_
 
   do iy=iys,iye
      do iz=izs,ize
 
         f_y_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
              + coef_der(2)*f(iy, :, iz-1)     &
              + coef_der(3)*f(iy+1, :, iz-1)  &
              + coef_der(4)*f(iy+2, :, iz-1)  )&
              
              + coef_int(2)*(   coef_der(1)*f(iy-1, :, iz)     &
              + coef_der(2)*f(iy, :, iz)       &
              + coef_der(3)*f(iy+1, :, iz)     &
              + coef_der(4)*f(iy+2, :, iz)    )&
              
              + coef_int(3)*(   coef_der(1)*f(iy-1, :, iz+1)    &
              + coef_der(2)*f(iy, :, iz+1)      &
              + coef_der(3)*f(iy+1, :, iz+1)    &
              + coef_der(4)*f(iy+2, :, iz+1)   )&
              
              + coef_int(4)*(   coef_der(1)*f(iy-1, :, iz+2)    &
              + coef_der(2)*f(iy, :, iz+2)      &
              + coef_der(3)*f(iy+1, :, iz+2)    &
              + coef_der(4)*f(iy+2, :, iz+2)   )
 
      end do
   end do
 
 
 end subroutine grady_v2n_fd4
 
 subroutine grady_v2n_fd4_omp(f,f_y_v2n)
 
   use prec_const
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n
   integer :: iy, iz, ix
   real(dp), dimension(1:4) :: coef_der 
 
   coef_der(:) = deltayi*coef_der1_stag(:)
 
   !  f_y_v2n(:,:,:) = nan_
 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_y_v2n)
   !$acc parallel loop collapse(3) copyout(f_y_v2n)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iys,iye
 
            f_y_v2n(iy,ix,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, ix, iz-1)   &
                 + coef_der(2)*f(iy, ix, iz-1)     &
                 + coef_der(3)*f(iy+1, ix, iz-1)  &
                 + coef_der(4)*f(iy+2, ix, iz-1)  )&
                 
                 + coef_int(2)*(   coef_der(1)*f(iy-1, ix, iz)     &
                 + coef_der(2)*f(iy, ix, iz)       &
                 + coef_der(3)*f(iy+1, ix, iz)     &
                 + coef_der(4)*f(iy+2, ix, iz)    )&
                 
                 + coef_int(3)*(   coef_der(1)*f(iy-1, ix, iz+1)    &
                 + coef_der(2)*f(iy, ix, iz+1)      &
                 + coef_der(3)*f(iy+1, ix, iz+1)    &
                 + coef_der(4)*f(iy+2, ix, iz+1)   )&
                 
                 + coef_int(4)*(   coef_der(1)*f(iy-1, ix, iz+2)    &
                 + coef_der(2)*f(iy, ix, iz+2)      &
                 + coef_der(3)*f(iy+1, ix, iz+2)    &
                 + coef_der(4)*f(iy+2, ix, iz+2)   )
 	enddo
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine grady_v2n_fd4_omp
 
 subroutine interp_v2n_fd4 ( f,  f_int_v2n )
 
   use prec_const
   implicit none
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n
   integer :: iy, iz
 
 !  f_int_v2n(:,:,:) = nan_
   do iy=iys,iye
      do iz=izs,ize
 
         f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1)   &
              + coef_int(2)*f(iy-1, : , iz)     &
              + coef_int(3)*f(iy-1, :, iz +1)  &
              + coef_int(4)*f(iy-1, :, iz+2)  )&
              
              +  coef_int(2)*(   coef_int(1)*f(iy, :, iz-1)     &
              + coef_int(2)*f(iy, :, iz)       &
              + coef_int(3)*f(iy, :, iz+1)     &
              + coef_int(4)*f(iy, :, iz+2)    )&
              
              +  coef_int(3)*(   coef_int(1)*f(iy+1, :, iz-1)    &
              + coef_int(2)*f(iy+1, :, iz)      &
              + coef_int(3)*f(iy+1, :, iz+1)    &
              + coef_int(4)*f(iy+1, :, iz+2)   )&
              
              + coef_int(4)*(   coef_int(1)*f(iy+2, :, iz-1)    &
              + coef_int(2)*f(iy+2, :, iz)      &
              + coef_int(3)*f(iy+2, :, iz+1)    &
              + coef_int(4)*f(iy+2, :, iz+2)   )
 
      end do
   end do
 
 
 end subroutine interp_v2n_fd4
 
 
 subroutine interp_v2n_fd4_omp ( f , f_int_v2n )
 
   use prec_const
   implicit none
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n
   integer :: iy, iz, ix
 
   !  f_int_v2n(:,:,:) = nan_
 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_int_v2n)
   !$acc parallel loop collapse(3) copyout(f_int_v2n)
   do iz=izs,ize
      do ix=ixsg,ixeg
         do iy=iys,iye
            f_int_v2n(iy,ix,iz) = coef_int(1)*(coef_int(1)*f(iy-1, ix, iz-1)   &
                 + coef_int(2)*f(iy-1, ix , iz)     &
                 + coef_int(3)*f(iy-1, ix, iz +1)  &
                 + coef_int(4)*f(iy-1, ix, iz+2)  )&
                 
                 +  coef_int(2)*(   coef_int(1)*f(iy, ix, iz-1)     &
                 + coef_int(2)*f(iy, ix, iz)       &
                 + coef_int(3)*f(iy, ix, iz+1)     &
                 + coef_int(4)*f(iy, ix, iz+2)    )&
                 
                 +  coef_int(3)*(   coef_int(1)*f(iy+1, ix, iz-1)    &
                 + coef_int(2)*f(iy+1, ix, iz)      &
                 + coef_int(3)*f(iy+1, ix, iz+1)    &
                 + coef_int(4)*f(iy+1, ix, iz+2)   )&
                 
                 + coef_int(4)*(   coef_int(1)*f(iy+2, ix, iz-1)    &
                 + coef_int(2)*f(iy+2, ix, iz)      &
                 + coef_int(3)*f(iy+2, ix, iz+1)    &
                 + coef_int(4)*f(iy+2, ix, iz+2)   )
         end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine interp_v2n_fd4_omp
 
 
 subroutine gradx_n2n_fd4(f,f_x)
   use prec_const
 
   IMPLICIT none
 
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out)  :: f_x
   integer :: ix
   real(dp), DIMENSION(1:4) :: coef_der
   coef_der(:)= deltaxi*coef_der1_n2n(:)                                                          
 
 !  f_x (:,:,: ) =  nan_
 
   do ix=ixs,ixe
      f_x(:, ix, :) =   coef_der(1)*f(:, ix-2, :)  &
           + coef_der(2)*f(:, ix-1, :)  &
           + coef_der(3)*f(:, ix+1, :)    &
           + coef_der(4)*f(:, ix+2, :)  
 
   end do
 
 
 end subroutine gradx_n2n_fd4
 
 subroutine gradx_n2n_fd4_omp(f,f_x)
   use prec_const
 
   implicit none
 
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out)  :: f_x
   integer :: ix,iy,iz
   real(dp), dimension(1:4) :: coef_der
   coef_der(:)= deltaxi*coef_der1_n2n(:)                                                          
 
   !  f_x (:,:,: ) =  nan_
 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_x)
   !$acc parallel loop collapse(3) copyout(f_x)
   do iz=izsg,izeg
      do ix=ixs,ixe
         do iy=iysg,iyeg
            f_x(iy, ix, iz) =   coef_der(1)*f(iy, ix-2, iz)  &
                 + coef_der(2)*f(iy, ix-1, iz)  &
                 + coef_der(3)*f(iy, ix+1, iz)    &
                 + coef_der(4)*f(iy, ix+2, iz)  
         end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
 
 end subroutine gradx_n2n_fd4_omp
 
 
 
 
 subroutine gradpar_v2n_fd4 ( f,  f_grad) 
   !flu and frd are useless here for periodic BC 
   use prec_const
 
   IMPLICIT none
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad
   real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg)      :: f_z, f_y, f_n, f_x_n  !f_x_v
 
   integer :: ix, iy
 
 !  f_grad(:,:,:) = nan_
   call gradz_v2n_fd4(f, f_z)
   call grady_v2n_fd4(f, f_y)   
   ! its  important to do the int first
   call interp_v2n_fd4(f,  f_n ) 
   call gradx_n2n_fd4(f_n,  f_x_n) 
   !call gradx_n2n_fd4(f,  f_x_n) 
 
   do ix = ixs,ixe   
      do iy = iys, iye
         f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) &
              +                          gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize)
      end do
   end do
 
 end subroutine gradpar_v2n_fd4
 
 ! parallel gradient for finite differences 4rth order from v  grid to n grid
 subroutine gradpar_v2n_fd4_omp ( f, f_grad) 
   !flu and frd are useless here for periodic BC 
   use prec_const
 
   implicit none
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad
   real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg)      :: f_z, f_y, f_n, f_x_n  !f_x_v
 
   integer :: ix, iy,iz
 
   !$acc enter data create(f_z,f_y,f_n,f_x_n)  
   !$omp target enter data map(alloc:f_z,f_y,f_n,f_x_n)
   !  f_grad(:,:,:) = nan_
 
   call gradz_v2n_fd4_omp(f, f_z)
   call grady_v2n_fd4_omp(f, f_y)   
   ! its  important to do the int first
   call interp_v2n_fd4_omp(f, f_n ) 
   call gradx_n2n_fd4_omp(f_n,  f_x_n) 
   !$omp target teams distribute parallel do simd collapse(3) map(from: f_grad)
   !$acc parallel loop collapse(3) copyout(f_grad)
   do iz = izs,ize
      do ix = ixs,ixe   
         do iy = iys, iye
            f_grad(iy,ix,iz) = gradpar_z*f_z(iy,ix,iz) + gradpar_y_n(iy,ix)*f_y(iy,ix,iz) &
                 +                          gradpar_x_n(iy,ix)*f_x_n(iy,ix,iz)
         end do
      end do
   end do
   !$omp end target teams distribute parallel do simd
   !$omp target exit data map(delete:f_z,f_y,f_n,f_x_n)
   !$acc exit data delete(f_z,f_y,f_n,f_x_n)  
 end subroutine gradpar_v2n_fd4_omp
 
 
 end module gradients
diff --git a/test_gbs_gradients.F90 b/test_gbs_gradients.F90
index 3e20421..a06d90e 100644
--- a/test_gbs_gradients.F90
+++ b/test_gbs_gradients.F90
@@ -1,103 +1,110 @@
 program main
   use space_grid
   use gradients
   use prec_const
   use iso_fortran_env
 #ifdef CUDA
 use gradients_cuda
 #endif
   implicit none
 
   ! INPUT: Number of cells in each direction
   integer :: nx=2000,ny=2000,nz=4
   !
   real(dp), dimension(:,:,:), allocatable :: fin,fout ! array in input and array in output (computed from different gradients)     
   real(dp), dimension(:,:,:), allocatable :: fout_omp ! same but computed with openmp     
   integer :: i,j,k
 
   integer(di) :: startc, endc ! For timing
   real(dp) :: rate ! For timing
  
 #ifdef CUDA
  real(dp), pointer :: fout_cuda(:,:,:)
  real(dp), pointer :: fin_cuda(:,:,:)
 #endif
 
  call system_clock(count_rate=rate)
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Compute mesh  
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call compute_mesh(nx,ny,nz)
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Initialize coefficient for gradients (as in gbs) 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call intialize_coefficients
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! allocate arrays (as in gbs)  
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call gbs_allocate(fin,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fout,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fout_omp,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(gradpar_y_n,iysg,iyeg,ixsg,ixeg)
   call gbs_allocate(gradpar_x_n,iysg,iyeg,ixsg,ixeg)
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! initialize input and output arrays   
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call init_array(fin)
   call init_array(fout)
   call init_array(fout_omp)
   call init_array2d(gradpar_x_n)
   call init_array2d(gradpar_y_n)
   !$omp target enter data map (to: fin, gradpar_x_n,gradpar_y_n)
   !$acc enter data copyin(fin, gradpar_x_n,gradpar_y_n)
 #ifdef CUDA
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Test gradients with CUDA C
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   call gbs_allocate_cuda(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(gradpar_y_n_cuda,iysg,iyeg,ixsg,ixeg)
   call gbs_allocate_cuda(gradpar_x_n_cuda,iysg,iyeg,ixsg,ixeg)
   call init_array(fin_cuda)
   call init_array2d(gradpar_x_n_cuda)
   call init_array2d(gradpar_y_n_cuda)
   call init_which_grad_
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call init_array(fout_cuda)
   call init_array(fout)
   call system_clock(startc)
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms"
   call system_clock(startc)
   call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
   CALL synchronize_cuda_device_()
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 with CUDA L2_norm and maxval:', &
 !      norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize)), &
       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
 #endif
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Test gradients with OpenMP and OpenACC
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   call system_clock(startc)
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms"
   call system_clock(startc)
 !  call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:))
-  call gradz_n2n_fd4_omp(fin(:,:,:),fout_cuda(:,:,:))
+#ifdef CUDA
+  ! Interop check: pass the CUDA result array to the OpenMP kernel as its input.
+  call gradz_n2n_fd4_omp(fout_cuda(:,:,:),fout_omp(:,:,:))
+#else
+  ! fout_cuda is declared only under CUDA, so use the host input array here.
+  ! NOTE(review): the kernel marks f is_device_ptr; confirm fin is valid there.
+  call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:))
+#endif
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 L2_norm and maxval:', &
 !      norm2(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize)), &
       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize))
 
   print*, "END TESTS"
 end program main