diff --git a/gradients_fd4.cu b/gradients_fd4.cu
index 7853177..237a918 100644
--- a/gradients_fd4.cu
+++ b/gradients_fd4.cu
@@ -1,274 +1,276 @@
 #include <stdio.h>
 //#include <stdlib.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 //#include "cudalloc.h"
 #include "unistd.h"
 #include "gradients_cuda.h"
 
 /* CUDA error macro */
 #define CUDA_SAFE_CALL(call) do {                                 \
   cudaError_t err = call;                                         \
   if(cudaSuccess != err) {                                        \
     fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
             __FILE__, __LINE__, cudaGetErrorString( err) );       \
     exit(EXIT_FAILURE);                                           \
   } } while (0)
 
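 /* Finite-difference stencil weights (device constants): coef_int interpolates
    to staggered points; coef_der1_stag and coef_der1_n2n are the staggered and
    node-to-node first-derivative weights; coef_der2 and coef_der2_stag are the
    corresponding second-derivative weights. */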
 __constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 };
 __constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 };
 __constant__ __device__ double coef_der2[5] =  {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0};
 __constant__ __device__ double coef_der1_n2n[4] =  {  1.0/12.0,  -2.0/3.0,  2.0/3.0,  -1.0/12.0};
 __constant__ __device__ double coef_der2_stag[4] = { 1.0/2.0, -1.0/2.0, -1.0/2.0, 1.0/2.0};
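 
 /* Linear index helpers: disp() uses z-fastest ordering (iz + ix*nz + iy*nz*nx);
    dispc() uses y-fastest ordering (iy + ix*ny + iz*ny*nx), matching the
    (iy,ix,iz) layout of the Fortran arrays in the test program. */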
 __device__ int xyindex(int index,  int nz )
 {
    return index%nz;
 }
 
 __device__ int disp(int ix, int iy, int iz, int nx, int ny, int nz)
 {
    return iz + ix*nz + iy*nz*nx;
 }
 
 __device__ int dispc(int ix, int iy, int iz, int nx, int ny)
 {
    return iy + ix*ny + iz*ny*nx;
 }
 
 //GRAD_N2N
 
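 /* Node-to-node 4th-order first derivative along the direction selected by the
    unit vector which_grad; d1 and d2 are the one- and two-point strides in the
    y-fastest layout, and fact is the caller-supplied scale factor. */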
 __device__ void grad_n2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz,
         double fact, int *which_grad)
 {
 
    int index = blockDim.x*blockIdx.x+threadIdx.x;
    int d1=dispc(which_grad[0],which_grad[1],which_grad[2],nx,ny);
    int d2=dispc(2*which_grad[0],2*which_grad[1],2*which_grad[2],nx,ny);
    if (index<(nx*ny*nz-d2) && index>=d2){
       f_out[index] = fact*coef_der1_n2n[0]*f_in[index-d2] + fact*coef_der1_n2n[1]*f_in[index-d1]+
           fact*coef_der1_n2n[2]*f_in[index+d1] + fact*coef_der1_n2n[3]*f_in[index+d2];
    }
 
 }
 
 __global__ void grad_n2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz, 
         double fact, int *which_grad)
 {
    grad_n2n_fd4Device(f_in,f_out, nx, ny, nz, fact, which_grad);
 }
 
 extern "C" void grady_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    int n=(*nx)*(*ny)*(*nz);
    which_grad[0] = 0;
    which_grad[1] = 1;
    which_grad[2] = 0;
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);
    grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out,  *nx, *ny, *nz, *fact, which_grad);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 extern "C" void gradx_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    int n=(*nx)*(*ny)*(*nz);
    which_grad[0] = 1;
    which_grad[1] = 0;
    which_grad[2] = 0;
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);
    grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out,  *nx, *ny, *nz, *fact, which_grad);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 extern "C" void gradz_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    int n=(*nx)*(*ny)*(*nz);
    which_grad[0] = 0;
    which_grad[1] = 0;
    which_grad[2] = 1;
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);
    grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out,  *nx, *ny, *nz, *fact, which_grad);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 
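 /* Staggered (v-to-n) z-derivative: coef_der1_stag is applied along z while
    coef_int interpolates along y. */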
 __global__ void gradz_v2n(double *f_in, double *f_out, int nx, int ny, int nz, double fact)
 {
    int index=blockDim.x*blockIdx.x+threadIdx.x;
    int d11=index+dispc(0,-1,-1,nx,ny);
    int d12=index+dispc(0,-1,0,nx,ny);
    int d13=index+dispc(0,-1, 1,nx,ny);
    int d14=index+dispc(0,-1, 2,nx,ny);
    int d21=index+dispc(0, 0,-1,nx,ny);
    int d22=index+dispc(0, 0,0,nx,ny);
    int d23=index+dispc(0, 0, 1,nx,ny);
    int d24=index+dispc(0, 0, 2,nx,ny);
    int d31=index+dispc(0, 1,-1,nx,ny);
    int d32=index+dispc(0, 1,0,nx,ny);
    int d33=index+dispc(0, 1, 1,nx,ny);
    int d34=index+dispc(0, 1, 2,nx,ny);
    int d41=index+dispc(0, 2,-1,nx,ny);
    int d42=index+dispc(0, 2,0,nx,ny);
    int d43=index+dispc(0, 2, 1,nx,ny);
    int d44=index+dispc(0, 2, 2,nx,ny);
    double coef_der[4];
    for(int i=0; i<4; i++)
        coef_der[i]=coef_der1_stag[i]*fact;
 
    if (d11>=0 && d44<(nx*ny*nz)){
       f_out[index] = coef_int[0]  * (coef_der[0]*f_in[d11] + coef_der[1]*f_in[d12] +
                      coef_der[2]*f_in[d13] + coef_der[3]*f_in[d14])+ 
                      coef_int[1] * (coef_der[0]*f_in[d21] + coef_der[1]*f_in[d22] +
                      coef_der[2]*f_in[d23] + coef_der[3]*f_in[d24])+ 
                      coef_int[2]  * (coef_der[0]*f_in[d31] + coef_der[1]*f_in[d32] +
                      coef_der[2]*f_in[d33] + coef_der[3]*f_in[d34]) +
                      coef_int[3] *  (coef_der[0]*f_in[d41] + coef_der[1]*f_in[d42] +
                      coef_der[2]*f_in[d43] + coef_der[3]*f_in[d44]);
    }
 
 }
 
 
 extern "C" void gradz_v2n_fd4_cuda_(double *f_in, double *f_out,  int *nx, int *ny, int *nz, double *fact){
    int n = (*nx)*(*ny)*(*nz);
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);
    gradz_v2n<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 
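 /* Staggered (v-to-n) y-derivative: coef_der1_stag is applied along y while
    coef_int interpolates along z. */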
 __device__ void grady_v2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz, double fact)
 {
    int index=blockDim.x*blockIdx.x+threadIdx.x;
    int d11=index+dispc(0,-1,-1,nx,ny);
    int d12=index+dispc(0, 0,-1,nx,ny);
    int d13=index+dispc(0, 1,-1,nx,ny);
    int d14=index+dispc(0, 2,-1,nx,ny);
    int d21=index+dispc(0,-1, 0,nx,ny);
    int d22=index+dispc(0, 0, 0,nx,ny);
    int d23=index+dispc(0, 1, 0,nx,ny);
    int d24=index+dispc(0, 2, 0,nx,ny);
    int d31=index+dispc(0,-1, 1,nx,ny);
    int d32=index+dispc(0, 0, 1,nx,ny);
    int d33=index+dispc(0, 1, 1,nx,ny);
    int d34=index+dispc(0, 2, 1,nx,ny);
    int d41=index+dispc(0,-1, 2,nx,ny);
    int d42=index+dispc(0,0 , 2,nx,ny);
    int d43=index+dispc(0, 1, 2,nx,ny);
    int d44=index+dispc(0, 2, 2,nx,ny);
    double coef_der[4];
    for(int i=0; i<4; i++)
        coef_der[i]=coef_der1_stag[i]*fact;
 
    if (d11>=0 && d44<(nx*ny*nz)){
       f_out[index] = coef_int[0]  * (coef_der[0]*f_in[d11] + coef_der[1]*f_in[d12] +
                      coef_der[2]*f_in[d13] + coef_der[3]*f_in[d14])+ 
                      coef_int[1] * (coef_der[0]*f_in[d21] + coef_der[1]*f_in[d22] +
                      coef_der[2]*f_in[d23] + coef_der[3]*f_in[d24])+ 
                      coef_int[2]  * (coef_der[0]*f_in[d31] + coef_der[1]*f_in[d32] +
                      coef_der[2]*f_in[d33] + coef_der[3]*f_in[d34]) +
                      coef_int[3] *  (coef_der[0]*f_in[d41] + coef_der[1]*f_in[d42] +
                      coef_der[2]*f_in[d43] + coef_der[3]*f_in[d44]);
    }
 
 }
 
 __global__ void grady_v2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz, double fact){
     grady_v2n_fd4Device(f_in, f_out, nx, ny, nz, fact);
 }
 extern "C" void grady_v2n_fd4_cuda_(double *f_in, double *f_out,  int *nx, int *ny, int *nz, double *fact){
    int n = (*nx)*(*ny)*(*nz);
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);
    grady_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 
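 /* 4th-order v-to-n interpolation in the y-z plane (tensor product of coef_int). */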
 __device__ void interp_v2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz)
 {
    int index=blockDim.x*blockIdx.x+threadIdx.x;
    int d11=index+dispc(0,-1,-1,nx,ny);
    int d12=index+dispc(0,-1,0,nx,ny);
    int d13=index+dispc(0,-1,1,nx,ny);
    int d14=index+dispc(0,-1,2,nx,ny);
    int d21=index+dispc(0,0,-1,nx,ny);
    int d22=index+dispc(0,0,0,nx,ny);
    int d23=index+dispc(0,0,1,nx,ny);
    int d24=index+dispc(0,0,2,nx,ny);
    int d31=index+dispc(0,1,-1,nx,ny);
    int d32=index+dispc(0,1,0,nx,ny);
    int d33=index+dispc(0,1,1,nx,ny);
    int d34=index+dispc(0,1,2,nx,ny);
    int d41=index+dispc(0,2,-1,nx,ny);
    int d42=index+dispc(0,2,0,nx,ny);
    int d43=index+dispc(0,2,1,nx,ny);
    int d44=index+dispc(0,2,2,nx,ny);
 
    if (d11>=0 && d44<(nx*ny*nz)){
       f_out[index] = coef_int[0]  * (coef_int[0]*f_in[d11] + coef_int[1]*f_in[d12] +
                      coef_int[2]*f_in[d13] + coef_int[3]*f_in[d14]) + 
                      coef_int[1] * (coef_int[0]*f_in[d21] + coef_int[1]*f_in[d22] +
                      coef_int[2]*f_in[d23] + coef_int[3]*f_in[d24])+ 
                      coef_int[2]  * (coef_int[0]*f_in[d31] + coef_int[1]*f_in[d32] +
                      coef_int[2]*f_in[d33] + coef_int[3]*f_in[d34]) +
                      coef_int[3] *  (coef_int[0]*f_in[d41] + coef_int[1]*f_in[d42] +
                      coef_int[2]*f_in[d43] + coef_int[3]*f_in[d44]);
    }
 
 }
 
 __global__ void interp_v2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz){
     interp_v2n_fd4Device(f_in, f_out, nx, ny, nz);
 }
 
 extern "C" void interp_v2n_fd4_cuda_(double *f_in, double *f_out,  int *nx, int *ny, int *nz){
     int n = (*nx)*(*ny)*(*nz);
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);
    interp_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 
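 /* Parallel gradient: combines the precomputed z, y and x derivatives with the
    2D metric coefficients gradpar_y and gradpar_x and the scalar gradpar_z. */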
 __global__ void gradpar_fd4(double *f_z, double *f_y, double *gradpar_y, 
         double *gradpar_x, double *f_x,  double gradpar_z, double *f_out,
         int nx, int ny, int nz){
     int ix = blockDim.x*blockIdx.x+threadIdx.x;
     int iy = blockDim.y*blockIdx.y+threadIdx.y;
     int iz = blockDim.z*blockIdx.z+threadIdx.z;
     int index = dispc(ix,iy,iz,nx,ny);
     int index2d = dispc(ix,iy,0,nx,ny);
     
     if(ix<nx && iy<ny && iz<nz)
         f_out[index] = gradpar_z*f_z[index] + gradpar_y[index2d]*f_y[index]+
               gradpar_x[index2d]*f_x[index];
 }
 
 
 extern "C" void gradpar_fd4_cuda_(double *f_z, double *f_y, double *gradpar_y, 
         double *gradpar_x, double *f_x,  double *gradpar_z, double *f_out,
         int *nx, int *ny, int *nz){
    dim3 numBlocks;
    int threadsxy = 16;
    int threadsz = 2;
    dim3 numThreads = dim3(threadsxy, threadsxy, threadsz);
    numBlocks.x = ((*nx + threadsxy -1) / threadsxy);
    numBlocks.y = ((*ny + threadsxy -1) / threadsxy);
    numBlocks.z = ((*nz + threadsz -1) / threadsz);
    gradpar_fd4<<<numBlocks, numThreads>>>(f_z, f_y, gradpar_y, 
            gradpar_x, f_x,  *gradpar_z, f_out, *nx,  *ny,  *nz);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
 }
 
-
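+/* Fortran-callable barrier: blocks the host until all queued device work has
+   completed, so the host-side timers in the test measure finished kernels. */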
+extern "C" void synchronize_cuda_device_(){
+   cudaDeviceSynchronize();
+}
 
 
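 /* Allocates the 3-element managed which_grad direction selector used by the
    n2n gradient wrappers above. */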
 extern "C" void init_which_grad_(){
     cudaMallocManaged((void **)&(which_grad), sizeof(int)*(3));
 }
diff --git a/test_gbs_gradients.F90 b/test_gbs_gradients.F90
index 6b77cb6..3e20421 100644
--- a/test_gbs_gradients.F90
+++ b/test_gbs_gradients.F90
@@ -1,102 +1,103 @@
 program main
   use space_grid
   use gradients
   use prec_const
   use iso_fortran_env
 #ifdef CUDA
   use gradients_cuda
 #endif
   implicit none
 
   ! INPUT: Number of cells in each direction
   integer :: nx=2000,ny=2000,nz=4
   !
   real(dp), dimension(:,:,:), allocatable :: fin,fout ! array in input and array in output (computed from different gradients)     
   real(dp), dimension(:,:,:), allocatable :: fout_omp ! same but computed with openmp     
   integer :: i,j,k
 
   integer(di) :: startc, endc ! For timing
   real(dp) :: rate ! For timing
  
 #ifdef CUDA
  real(dp), pointer :: fout_cuda(:,:,:)
  real(dp), pointer :: fin_cuda(:,:,:)
 #endif
 
  call system_clock(count_rate=rate)
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Compute mesh  
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call compute_mesh(nx,ny,nz)
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Initialize coefficient for gradients (as in gbs) 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call intialize_coefficients
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! allocate arrays (as in gbs)  
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call gbs_allocate(fin,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fout,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(fout_omp,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate(gradpar_y_n,iysg,iyeg,ixsg,ixeg)
   call gbs_allocate(gradpar_x_n,iysg,iyeg,ixsg,ixeg)
 !
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! initialize input and output arrays   
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call init_array(fin)
   call init_array(fout)
   call init_array(fout_omp)
   call init_array2d(gradpar_x_n)
   call init_array2d(gradpar_y_n)
   !$omp target enter data map (to: fin, gradpar_x_n,gradpar_y_n)
   !$acc enter data copyin(fin, gradpar_x_n,gradpar_y_n)
 #ifdef CUDA
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Test gradients with CUDA C
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   call gbs_allocate_cuda(fout_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(fin_cuda,iysg,iyeg,ixsg,ixeg,izsg,izeg)
   call gbs_allocate_cuda(gradpar_y_n_cuda,iysg,iyeg,ixsg,ixeg)
   call gbs_allocate_cuda(gradpar_x_n_cuda,iysg,iyeg,ixsg,ixeg)
   call init_array(fin_cuda)
   call init_array2d(gradpar_x_n_cuda)
   call init_array2d(gradpar_y_n_cuda)
   call init_which_grad_
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   call init_array(fout_cuda)
   call init_array(fout)
   call system_clock(startc)
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms"
   call system_clock(startc)
   call gradz_n2n_cuda(fin_cuda(:,:,:),fout_cuda(:,:,:))
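+  ! wait for the device kernel to finish before stopping the timer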
+  call synchronize_cuda_device_()
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 with CUDA, maxval of difference:', &
       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_cuda(iys:iye,ixs:ixe,izs:ize))
 #endif
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Test gradients with OpenMP and OpenACC
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
   call system_clock(startc)
   call gradz_n2n_fd4(fin(:,:,:),fout(:,:,:))
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 CPU: ", real(endc-startc, dp)/rate*1000, "ms"
   call system_clock(startc)
   call gradz_n2n_fd4_omp(fin(:,:,:),fout_omp(:,:,:))
   call system_clock(endc)
   write(*,*) "timing for gradz_n2n_fd4 DEVICE: ", real(endc-startc, dp)/rate*1000, "ms"
   write(*,*) 'gradz_n2n_fd4 with OpenMP, maxval of difference:', &
       maxval(fout(iys:iye,ixs:ixe,izs:ize) - fout_omp(iys:iye,ixs:ixe,izs:ize))
 
   print*, "END TESTS"
 end program main