diff --git a/exercises/gradients/Makefile b/exercises/gradients/Makefile
new file mode 100644
index 0000000..fa6f63c
--- /dev/null
+++ b/exercises/gradients/Makefile
@@ -0,0 +1,28 @@
+F90=gfortran
+NVCC=nvcc
+LDFLAGS=-lc -lstdc++ -lcuda -lcudart -lcudadevrt
+F90FLAGS= -ffree-line-length-0 -O3  -g
+
+
+all:  test_gradients.x
+          
+test_gradients.x: cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o gradients_mod.o test_gradients.o 
+	$(F90) $(LDFLAGS) cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o \
+                          gradients_mod.o test_gradients.o -o $@
+test_gradients.o: test_gradients.F90
+	$(F90) $(F90FLAGS) -c test_gradients.F90 -o $@
+cuda_interface_mod.o: cuda_interface_mod.F90
+	$(F90) $(F90FLAGS) -c cuda_interface_mod.F90 -o $@
+space_grid_mod.o: space_grid_mod.F90
+	$(F90) $(F90FLAGS) -c space_grid_mod.F90 -o $@
+prec_const_mod.o: prec_const_mod.F90
+	$(F90) $(F90FLAGS) -c prec_const_mod.F90 -o $@
+gradients_mod.o: gradients_mod.F90
+	$(F90) $(F90FLAGS) -c gradients_mod.F90 -o $@
+grad_test.o: grad_test.cu
+	$(NVCC) $(NVCCFLAGS) -c grad_test.cu -o $@
+cudalloc.o: cudalloc.cu
+	$(NVCC) $(NVCCFLAGS) -c cudalloc.cu -o $@
+
+clean:
+	rm *.x *.o
diff --git a/exercises/gradients/cuda_interface_mod.F90 b/exercises/gradients/cuda_interface_mod.F90
new file mode 100644
index 0000000..f0decef
--- /dev/null
+++ b/exercises/gradients/cuda_interface_mod.F90
@@ -0,0 +1,128 @@
+
+MODULE cuda_interface
+use prec_const
+INTERFACE
+  FUNCTION allocate_cuda_memory(n) BIND(C, NAME='allocate_cuda_memory')
+    use iso_c_binding
+    TYPE(C_PTR) :: allocate_cuda_memory
+    integer :: n
+  END FUNCTION allocate_cuda_memory
+
+
+END INTERFACE
+  INTERFACE gbs_allocate_cuda
+     MODULE PROCEDURE gbs_allocate_cuda_dp3, gbs_allocate_cuda_dp2, gbs_allocate_cuda_dp1
+  END INTERFACE gbs_allocate_cuda
+  real(dp), pointer, save :: strmf_cpu(:,:,:),strmf_gpu(:,:,:), indata(:,:,:)
+  real(dp), pointer, save :: strmf1_cpu(:,:,:), strmf1_gpu(:,:,:)
+  real(dp), pointer, save :: grady_f(:,:,:), grady_diff(:,:,:), diff(:,:,:), f_x(:,:,:)
+  real(dp), pointer, save :: f_z(:,:,:), f_y(:,:,:), f_n(:,:,:), f_x_n(:,:,:)
+  real(dp), pointer, save :: f_v(:,:,:), f_x_v(:,:,:), f_xx_n(:,:,:),f_xx_v(:,:,:)
+  real(dp), pointer, save :: f_zz(:,:,:), f_yy(:,:,:), f_xx(:,:,:)
+  real(dp), pointer, save :: f_zy(:,:,:), f_yz(:,:,:), f_zx(:,:,:)
+  real(dp), pointer, save :: f_xz(:,:,:), f_xy(:,:,:), f_yx(:,:,:)
+
+  real(dp), pointer, save :: gradpar_x_n_gpu(:,:), gradpar_x_v_gpu(:,:), gradpar_y_n_gpu(:,:), gradpar_y_v_gpu(:,:)
+  real(dp), pointer, save :: grad2par_yy_n_gpu(:,:), grad2par_xx_n_gpu(:,:), grad2par_x_n_gpu(:,:), grad2par_y_n_gpu(:,:)
+  real(dp), pointer, save :: grad2par_zy_n_gpu(:,:), grad2par_zx_n_gpu(:,:), grad2par_xy_n_gpu(:,:)
+  real(dp), pointer, save :: grad2par_yy_v_gpu(:,:), grad2par_xx_v_gpu(:,:), grad2par_x_v_gpu(:,:), grad2par_y_v_gpu(:,:)
+  real(dp), pointer, save :: grad2par_zy_v_gpu(:,:), grad2par_zx_v_gpu(:,:), grad2par_xy_v_gpu(:,:)
+  real(dp), pointer, save :: cur_x_n_gpu(:), cur_x_v_gpu(:), ccur_xx_n_gpu(:),  ccur_xx_v_gpu(:)
+  real(dp), pointer, save :: cur_y_n_gpu(:,:), cur_y_v_gpu(:,:), ccur_yy_n_gpu(:,:), ccur_xy_n_gpu(:,:) 
+  real(dp), pointer, save :: ccur_yy_v_gpu(:,:), ccur_xy_v_gpu(:,:) 
+  real(dp), pointer, dimension(:,:,:), save ::  A_x_n_gpu, A_x_v_gpu, A_y_v_gpu, B_x_v_gpu, B_y_v_gpu
+  real(dp), pointer, dimension(:,:,:), save ::  B_x_n_gpu, B_y_n_gpu, A_y_n_gpu   
+  real(dp), pointer, dimension(:,:), save ::  gradcur_xx_v_gpu, gradcur_yy_v_gpu, gradcur_xy_v_gpu, gradcur_yx_v_gpu
+  real(dp), pointer, dimension(:,:), save ::  gradcur_zx_v_gpu, gradcur_zy_v_gpu, gradcur_x_v_gpu, gradcur_y_v_gpu
+  real(dp), pointer, dimension(:,:), save ::  curgrad_xx_n_gpu, curgrad_yy_n_gpu, curgrad_xy_n_gpu, curgrad_yx_n_gpu
+  real(dp), pointer, dimension(:,:), save ::  curgrad_zx_n_gpu, curgrad_zy_n_gpu, curgrad_x_n_gpu, curgrad_y_n_gpu
+
+  !real(dp), pointer, save :: gradpar_x_n_gpu(:,:,:), gradpar_x_v_gpu(:,:,:), gradpar_y_n_gpu(:,:,:), gradpar_y_v_gpu(:,:,:)
+  integer :: nx_cuda, ny_cuda, nz_cuda
+contains
+
+  SUBROUTINE gbs_allocate_cuda_dp3(a,is1,ie1,is2,ie2,is3,ie3)
+    use iso_c_binding
+    real(dp), DIMENSION(:,:,:), pointer, INTENT(INOUT) :: a
+    INTEGER, INTENT(IN) :: is1,ie1,is2,ie2,is3,ie3
+    integer ndata
+    ndata=(ie3-is3+1)*(ie2-is2+1)*(ie1-is1+1) 
+    call c_f_pointer(allocate_cuda_memory(ndata), a, \
+                    [(ie1-is1+1),(ie2-is2+1),(ie3-is3+1)])
+    a(is1:,is2:,is3:) => a
+    a=0.0_dp
+  END SUBROUTINE gbs_allocate_cuda_dp3
+
+  SUBROUTINE gbs_allocate_cuda_dp2(a,is1,ie1,is2,ie2)
+    use iso_c_binding
+    real(dp), DIMENSION(:,:), pointer, INTENT(INOUT) :: a
+    INTEGER, INTENT(IN) :: is1,ie1,is2,ie2
+    integer ndata
+    ndata=(ie2-is2+1)*(ie1-is1+1) 
+    call c_f_pointer(allocate_cuda_memory(ndata), a, \
+                    [(ie1-is1+1),(ie2-is2+1)])
+    a(is1:,is2:) => a
+    a=0.0_dp
+  END SUBROUTINE gbs_allocate_cuda_dp2
+
+  SUBROUTINE gbs_allocate_cuda_dp1(a,is1,ie1)
+    use iso_c_binding
+    real(dp), DIMENSION(:), pointer, INTENT(INOUT) :: a
+    INTEGER, INTENT(IN) :: is1,ie1
+    integer ndata
+    ndata=(ie1-is1+1) 
+    call c_f_pointer(allocate_cuda_memory(ndata), a, \
+                    [(ie1-is1+1)])
+    a(is1:) => a
+    a=0.0_dp
+  END SUBROUTINE gbs_allocate_cuda_dp1
+
+
+  subroutine gradz_n2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call gradz_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi)
+  end subroutine gradz_n2n_fd4cuda
+
+  subroutine gradz_v2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call gradz_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi)
+  end subroutine gradz_v2n_fd4cuda
+
+  subroutine grady_v2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call grady_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltayi)
+  end subroutine grady_v2n_fd4cuda
+
+  subroutine interp_v2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call interp_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda)
+  end subroutine interp_v2n_fd4cuda
+
+  subroutine gradx_n2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call gradx_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltaxi)
+  end subroutine gradx_n2n_fd4cuda
+
+
+  subroutine gradpar_v2n_fd4cuda(f_in, f_out,  gradpar_z)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in
+     real(dp),  INTENT(in) :: gradpar_z
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+
+     CALL gradpar_v2n_fd4_cuda(f_in, f_out,  nz_cuda, nx_cuda, ny_cuda, f_z, f_y, f_n, \
+                            f_x_n, deltazi, deltayi, deltaxi, gradpar_z, gradpar_x_n_gpu, gradpar_y_n_gpu)
+  end subroutine gradpar_v2n_fd4cuda
+
+
+END MODULE cuda_interface
diff --git a/exercises/gradients/cudalloc.cu b/exercises/gradients/cudalloc.cu
new file mode 100644
index 0000000..1186170
--- /dev/null
+++ b/exercises/gradients/cudalloc.cu
@@ -0,0 +1,76 @@
+/*#include <stdio.h>
+//#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+struct cuda_struct{
+   double *coef_der_cuda;
+};
+
+*/
+#include "cudalloc.h"
+/*__global__ void set_coefficients(cuda_struct *structure){
+   structure->coef_der_cuda[0] = 1.0/12.0;
+   structure->coef_der_cuda[1] = -2.0/3.0;
+   structure->coef_der_cuda[2] = 2.0/3.0;
+   structure->coef_der_cuda[3] = -1.0/12.0;
+*/
+__global__ void set_coefficients(double *coef, double *coefstag, double *coef_int){
+coef[0] = 1.0/12.0;
+coef[1] = -2.0/3.0;
+coef[2] = 2.0/3.0;
+coef[3] = -1.0/12.0;
+coef_int[0] = -1.0/16.0;
+coef_int[1] = 9.0/16.0;
+coef_int[2] = 9.0/16.0;
+coef_int[3] = -1.0/16.0;
+coefstag[0] = 1.0/24.0;
+coefstag[1] = -9.0/8.0;
+coefstag[2] = 9.0/8.0;
+coefstag[3] = -1.0/24.0;
+}
+void set_extent(int *out, int in){
+*out = in;
+}
+extern "C" void allocate_cuda_struct(cuda_struct *& structure){
+   structure = new cuda_struct();
+   return;
+}
+
+//__global__ void print_coefficients(cuda_struct *structure){
+__global__ void print_coefficients(double *coef){
+}
+extern "C" void fill_cuda_struct(cuda_struct *structure, int *izeg, int *izsg, int *iyeg, int *iysg, int *ixeg, int *ixsg){
+//extern "C" void fill_cuda_struct(cuda_struct *structure){
+
+   cudaMallocManaged( (void **)&(structure->coef_der_cuda), sizeof(double) * 4 );
+   cudaMallocManaged( (void **)&(structure->coef_der1_stag), sizeof(double) * 4 );
+   cudaMallocManaged( (void **)&(structure->coef_int_cuda), sizeof(double) * 4 );
+   set_coefficients<<<1, 1>>>(structure->coef_der_cuda, structure->coef_der1_stag, structure->coef_int_cuda);
+   //set_coefficients<<<1, 1>>>(structure->coef_der_cuda);
+   //set_extent((structure->nz), (*izeg-*izsg+1));
+   //set_extent(&(structure->ny), (*izeg-*izsg+1));
+   //set_extent(&(structure->nx), (*izeg-*izsg+1));
+   //set_extent(&(structure->nx), (*iyeg-*iysg+1));
+   //&(structure->nz) = &k;//(*izeg-*izsg+1);
+   //&(structure->ny) = &k;//(*ixeg-*ixsg+1);
+   //&(structure->nx) = &k;//(*iyeg-*iysg+1);
+   //printf("AAAAAAA %d %d %d\n", structure->nz, structure->ny, structure->nx);
+   //set_extent<<<1, 1>>>(structure->nz, (*izeg-*izsg+1));
+   //set_extent<<<1, 1>>>(structure->ny, (*ixeg-*ixsg+1));
+   //set_extent<<<1, 1>>>(structure->nx, (*iyeg-*iysg+1));
+   cudaDeviceSynchronize();
+
+   printf("cudalloc coef_int %f %f %f %f \n", structure->coef_int_cuda[0], structure->coef_int_cuda[1], structure->coef_int_cuda[2], structure->coef_int_cuda[3]);
+   return;
+}
+
+extern "C" void *allocate_cuda_memory(int *n){
+   double *a;
+   int N=*n;
+   cudaMallocManaged( (void **)&a, sizeof(double) * N );
+   cudaDeviceSynchronize();
+   return (void *) a;
+}
+
+
+
diff --git a/exercises/gradients/cudalloc.h b/exercises/gradients/cudalloc.h
new file mode 100644
index 0000000..0d950a9
--- /dev/null
+++ b/exercises/gradients/cudalloc.h
@@ -0,0 +1,11 @@
+#include <stdio.h>
+//#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_struct{
+   double *coef_der_cuda;
+   double *coef_int_cuda;
+   double *coef_der1_stag;
+   int *nz, *ny, *nx;
+};
diff --git a/exercises/gradients/grad_test.cu b/exercises/gradients/grad_test.cu
new file mode 100644
index 0000000..948e342
--- /dev/null
+++ b/exercises/gradients/grad_test.cu
@@ -0,0 +1,135 @@
+#include <cuda.h>
+//#include <cuda_runtime.h>
+//#include "unistd.h"
+
+
+__constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 };
+__constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 };
+__constant__ __device__ double coef_der2[5] =  {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0};
+__constant__ __device__ double coef_der1_n2n[4] =  {  1.0/12.0,  -2.0/3.0,  2.0/3.0,  -1.0/12.0};
+__constant__ __device__ double coef_der2_stag[4] = { 1.0/2., -1.0/2.0, -1.0/2., 1.0/2.0};
+
+__device__ int xyindex(int i,  int nz )
+{
+   return i%nz;
+}
+
+
+__device__ int disp(int i, int j, int k, int nx, int ny)
+{
+   return i + j*nx + k*nx*ny;
+}
+
+__device__ void gradz_n2n_fd4Device(double *f, double *fz, int nz, int ny, int nx, double fact)
+{
+     int index = blockDim.x*blockIdx.x+threadIdx.x;
+   int d1=index+disp(0,0,-2,nx,ny);
+   int d2=index+disp(0,0,-1,nx,ny);
+   int d3=index+disp(0,0,1,nx,ny);
+   int d4=index+disp(0,0,2,nx,ny);
+   if (d1>=0 && d4<(nx*ny*nz)){
+      fz[index] = coef_der1_n2n[0]*f[d1] + coef_der1_n2n[1]*f[d2] + 
+                  coef_der1_n2n[2]*f[d3] + coef_der1_n2n[3]*f[d4];
+      fz[index] *= fact;
+   }
+
+}
+
+__global__ void gradz_n2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){
+     gradz_n2n_fd4Device(f, fz, nz, ny, nx, fact);
+}
+
+extern "C" void gradz_n2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   gradz_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_z, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+
+__device__ void gradz_v2n_fd4Device(double *f, double *f_z_v2n, int nz, int ny, int nx, double fact)
+{
+
+}
+
+__global__ void gradz_v2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){
+     gradz_v2n_fd4Device(f, fz, nz, ny, nx, fact);
+}
+
+extern "C" void gradz_v2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   gradz_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_z, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+__device__ void grady_v2n_fd4Device(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact)
+{
+
+
+}
+
+__global__ void grady_v2n_fd4(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact){
+     grady_v2n_fd4Device(f, f_y_v2n, nz, ny, nx, fact);
+}
+
+extern "C" void grady_v2n_fd4_cuda_( double *f, double *f_y_v2n, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   grady_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_y_v2n, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+__device__ void interp_v2n_fd4Device(double *f, double *f_int_v2n, int nz, int ny, int nx)
+{
+
+
+}
+
+__global__ void interp_v2n_fd4(double *f, double *f_int_v2n, int nz, int ny, int nx){
+     interp_v2n_fd4Device(f, f_int_v2n, nz, ny, nx);
+}
+
+extern "C" void interp_v2n_fd4_cuda_( double *f, double *f_int_v2n, int *nz, int *ny, int *nx){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   interp_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_int_v2n, *nz, *ny, *nx);
+   cudaDeviceSynchronize();
+}
+
+__device__ void gradx_n2n_fd4Device(double *f, double *fx, int nz, int ny, int nx, double fact)
+{
+
+}
+
+__global__ void gradx_n2n_fd4(double *f, double *f_x_n2n, int nz, int ny, int nx, double fact){
+     gradx_n2n_fd4Device(f, f_x_n2n, nz, ny, nx, fact);
+}
+
+extern "C" void gradx_n2n_fd4_cuda_( double *f, double *f_x_n2n, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   gradx_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_x_n2n, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+__global__ void gradpar_v2n_fd4(double *f, double *f_grad, int nz, int ny, int nx,  double *f_z, double *f_y, double *f_n, double *f_x_n, double deltazi, double deltayi, double deltaxi, double gradpar_z, double *gradpar_x_n, double *gradpar_y_n)
+{
+
+}
+
+
+
+
+extern "C" void gradpar_v2n_fd4_cuda_(double *f, double *f_grad, int *nz, int *ny, int *nx,  double *f_z, double *f_y, double *f_n, double *f_x_n, double *deltazi, double *deltayi, double *deltaxi, double *gradpar_z, double *gradpar_x_n, double *gradpar_y_n){
+   int n = (*nx)*(*ny)*(*nz);
+   dim3 threadsPerBlock = 1024;
+   dim3 numBlocks = ((n + threadsPerBlock.x -1) / threadsPerBlock.x);
+   gradpar_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_grad, *nz, *ny, *nx, f_z, f_y, f_n, f_x_n, *deltazi, *deltayi, *deltaxi, *gradpar_z, gradpar_x_n, gradpar_y_n);
+   cudaDeviceSynchronize();
+}
diff --git a/exercises/gradients/gradients_mod.F90 b/exercises/gradients/gradients_mod.F90
new file mode 100644
index 0000000..3f34974
--- /dev/null
+++ b/exercises/gradients/gradients_mod.F90
@@ -0,0 +1,219 @@
+module gradients
+use space_grid
+use prec_const
+real(dp), DIMENSION(1:4), public, PROTECTED :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag
+real(dp), DIMENSION(1:5), public, PROTECTED :: coef_der2
+real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n
+real(dp), PUBLIC :: gradpar_z 
+contains
+
+subroutine intialize_coefficients
+    coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /)
+    coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /)
+    coef_der2 =  (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /)
+    coef_der1_n2n = (/  1.0_dp/12.0_dp,  -2.0_dp/3.0_dp,  2.0_dp/3.0_dp,  -1.0_dp/12.0_dp  /)
+    coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /)
+
+end subroutine intialize_coefficients
+
+
+subroutine gradz_n2n_fd4(f , f_z)
+
+  use prec_const
+
+  IMPLICIT none
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
+  integer :: iz
+  real(dp), DIMENSION(1:4) :: coef_der 
+
+  coef_der(:) = deltazi*coef_der1_n2n(:)
+  f_z(:,:,:) = nan_
+  do iz=izs,ize
+     f_z(:, :, iz) =    coef_der(1)*f(:, :, iz-2)  &
+          + coef_der(2)*f(:, :, iz-1)  &
+          + coef_der(3)*f(:, :, iz+1)  &
+          + coef_der(4)*f(:, :, iz+2)  
+  end do
+
+
+end subroutine gradz_n2n_fd4
+
+
+
+
+subroutine gradz_v2n_fd4(f,f_z_v2n)                 
+
+  use prec_const
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n
+  integer :: iy, iz
+  real(dp), DIMENSION(1:4) :: coef_der 
+  coef_der(:) = deltazi*coef_der1_stag(:)
+
+  f_z_v2n(:,:,:) = nan_
+  do iy=iys,iye
+     do iz=izs,ize
+
+        f_z_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
+             + coef_der(2)*f(iy-1, :, iz)     &
+             + coef_der(3)*f(iy-1, :, iz +1)  &
+             + coef_der(4)*f(iy-1, :, iz+2)  )&
+             
+             +  coef_int(2)*(   coef_der(1)*f(iy, :, iz-1)     &
+             + coef_der(2)*f(iy, :, iz)       &
+             + coef_der(3)*f(iy, :, iz+1)     &
+             + coef_der(4)*f(iy, :, iz+2)    )&
+             
+             +  coef_int(3)*(   coef_der(1)*f(iy+1, :, iz-1)    &
+             + coef_der(2)*f(iy+1, :, iz)      &
+             + coef_der(3)*f(iy+1, :, iz+1)    &
+             + coef_der(4)*f(iy+1, :, iz+2)   )&
+             
+             + coef_int(4)*(   coef_der(1)*f(iy+2, :, iz-1)    &
+             + coef_der(2)*f(iy+2, :, iz)      &
+             + coef_der(3)*f(iy+2, :, iz+1)    &
+             + coef_der(4)*f(iy+2, :, iz+2)   )
+
+     end do
+  end do
+
+
+
+end subroutine gradz_v2n_fd4
+
+subroutine grady_v2n_fd4(f,f_y_v2n)
+
+  use prec_const
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n
+  integer :: iy, iz
+  real(dp), DIMENSION(1:4) :: coef_der 
+
+  coef_der(:) = deltayi*coef_der1_stag(:)
+
+  f_y_v2n(:,:,:) = nan_
+
+  do iy=iys,iye
+     do iz=izs,ize
+
+        f_y_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
+             + coef_der(2)*f(iy, :, iz-1)     &
+             + coef_der(3)*f(iy+1, :, iz-1)  &
+             + coef_der(4)*f(iy+2, :, iz-1)  )&
+             
+             + coef_int(2)*(   coef_der(1)*f(iy-1, :, iz)     &
+             + coef_der(2)*f(iy, :, iz)       &
+             + coef_der(3)*f(iy+1, :, iz)     &
+             + coef_der(4)*f(iy+2, :, iz)    )&
+             
+             + coef_int(3)*(   coef_der(1)*f(iy-1, :, iz+1)    &
+             + coef_der(2)*f(iy, :, iz+1)      &
+             + coef_der(3)*f(iy+1, :, iz+1)    &
+             + coef_der(4)*f(iy+2, :, iz+1)   )&
+             
+             + coef_int(4)*(   coef_der(1)*f(iy-1, :, iz+2)    &
+             + coef_der(2)*f(iy, :, iz+2)      &
+             + coef_der(3)*f(iy+1, :, iz+2)    &
+             + coef_der(4)*f(iy+2, :, iz+2)   )
+
+     end do
+  end do
+
+
+end subroutine grady_v2n_fd4
+
+subroutine interp_v2n_fd4 ( f,  f_int_v2n )
+
+  use prec_const
+  implicit none
+  real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f
+  real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n
+  integer :: iy, iz
+
+  f_int_v2n(:,:,:) = nan_
+
+  do iy=iys,iye
+     do iz=izs,ize
+
+        f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1)   &
+             + coef_int(2)*f(iy-1, : , iz)     &
+             + coef_int(3)*f(iy-1, :, iz +1)  &
+             + coef_int(4)*f(iy-1, :, iz+2)  )&
+             
+             +  coef_int(2)*(   coef_int(1)*f(iy, :, iz-1)     &
+             + coef_int(2)*f(iy, :, iz)       &
+             + coef_int(3)*f(iy, :, iz+1)     &
+             + coef_int(4)*f(iy, :, iz+2)    )&
+             
+             +  coef_int(3)*(   coef_int(1)*f(iy+1, :, iz-1)    &
+             + coef_int(2)*f(iy+1, :, iz)      &
+             + coef_int(3)*f(iy+1, :, iz+1)    &
+             + coef_int(4)*f(iy+1, :, iz+2)   )&
+             
+             + coef_int(4)*(   coef_int(1)*f(iy+2, :, iz-1)    &
+             + coef_int(2)*f(iy+2, :, iz)      &
+             + coef_int(3)*f(iy+2, :, iz+1)    &
+             + coef_int(4)*f(iy+2, :, iz+2)   )
+
+     end do
+  end do
+
+
+end subroutine interp_v2n_fd4
+
+subroutine gradx_n2n_fd4(f,f_x)
+  use prec_const
+
+  IMPLICIT none
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out)  :: f_x
+  integer :: ix
+  real(dp), DIMENSION(1:4) :: coef_der
+  coef_der(:)= deltaxi*coef_der1_n2n(:)                                                          
+
+  f_x (:,:,: ) =  nan_
+
+  do ix=ixs,ixe
+     f_x(:, ix, :) =   coef_der(1)*f(:, ix-2, :)  &
+          + coef_der(2)*f(:, ix-1, :)  &
+          + coef_der(3)*f(:, ix+1, :)    &
+          + coef_der(4)*f(:, ix+2, :)  
+
+  end do
+
+
+end subroutine gradx_n2n_fd4
+
+subroutine gradpar_v2n_fd4 ( f,  f_grad) 
+  use prec_const
+
+  IMPLICIT none
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg)      :: f_z, f_y, f_n, f_x_n  !f_x_v
+
+  integer :: ix, iy
+
+  f_grad(:,:,:) = nan_
+
+  call gradz_v2n_fd4(f, f_z)
+  call grady_v2n_fd4(f, f_y)   
+  call interp_v2n_fd4(f,  f_n ) 
+  call gradx_n2n_fd4(f_n,  f_x_n) 
+
+  do ix = ixs,ixe   
+     do iy = iys, iye
+        f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) &
+             +                          gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize)
+     end do
+  end do
+
+end subroutine gradpar_v2n_fd4
+
+
+end module gradients
diff --git a/exercises/gradients/prec_const_mod.F90 b/exercises/gradients/prec_const_mod.F90
new file mode 100644
index 0000000..4cdf7f0
--- /dev/null
+++ b/exercises/gradients/prec_const_mod.F90
@@ -0,0 +1,50 @@
+MODULE prec_const
+  use, intrinsic :: iso_fortran_env, only: REAL32, REAL64, &
+                                           stdin=>input_unit, &
+                                           stdout=>output_unit, &
+                                           stderr=>error_unit
+  !
+  !   Precision for real and complex
+  !
+  INTEGER, PARAMETER :: sp = REAL32 !Single precision, should not be used
+  INTEGER, PARAMETER :: dp = REAL64 !real(dp), enforced through the code
+
+  INTEGER, private :: dp_r, dp_p !Range and Aprecision of doubles
+  INTEGER, private :: sp_r, sp_p !Range and precision of singles
+
+
+  INTEGER, private :: MPI_SP !Single precision for MPI
+  INTEGER, private :: MPI_DP !Double precision in MPI
+
+  REAL(dp),public :: nan_
+  !
+  !   Some useful constants
+  !
+  REAL(dp), PARAMETER :: PI=3.141592653589793238462643383279502884197_dp
+  REAL(dp), PARAMETER :: PIO2=1.57079632679489661923132169163975144209858_dp
+  REAL(dp), PARAMETER :: TWOPI=6.283185307179586476925286766559005768394_dp
+  REAL(dp), PARAMETER :: SQRT2=1.41421356237309504880168872420969807856967_dp
+  !
+  CONTAINS
+    SUBROUTINE INIT_PREC_CONST
+      IMPLICIT NONE
+      integer :: ierr,me
+      
+      REAL(sp) :: a = 1_sp
+      REAL(dp) :: b = 1_dp
+!      logical :: commute = .true.
+
+      !Get range and precision of ISO FORTRAN sizes
+      sp_r = range(a)
+      sp_p = precision(a)
+
+      dp_r = range(b)
+      dp_p = precision(b)
+
+
+
+    END SUBROUTINE INIT_PREC_CONST
+
+
+
+END MODULE prec_const
diff --git a/exercises/gradients/space_grid_mod.F90 b/exercises/gradients/space_grid_mod.F90
new file mode 100644
index 0000000..28e84c5
--- /dev/null
+++ b/exercises/gradients/space_grid_mod.F90
@@ -0,0 +1,118 @@
+MODULE space_grid
+  USE prec_const
+  IMPLICIT NONE
+  PRIVATE
+  !
+  !   Grid module for spatial discretization
+  !
+  !   GRID Namelist
+  !
+  INTEGER, PUBLIC, PROTECTED :: nx=1                ! Number of total internal grip points in x
+  INTEGER, PUBLIC, PROTECTED :: ny=1                ! Number of total internal grip points in y
+  INTEGER, PUBLIC, PROTECTED :: nz=1                ! Number of total internal grip points in z
+  REAL(dp), PUBLIC, PROTECTED :: xmin=0._dp     ! x coordinate for left boundary
+  REAL(dp), PUBLIC, PROTECTED :: xmax=10._dp    ! x coordinate for right boundary
+  REAL(dp), PUBLIC, PROTECTED :: ymin=0._dp     ! y coordinate for left boundary
+  REAL(dp), PUBLIC, PROTECTED :: ymax=10._dp    ! y coordinate for right boundary
+  REAL(dp), PUBLIC, PROTECTED :: zmin=0._dp     ! z coordinate for left boundary
+  REAL(dp), PUBLIC, PROTECTED :: zmax=1._dp    ! z coordinate for right boundary
+  REAL(dp), PUBLIC, PROTECTED :: x_periodic_end = -10000_dp !Field line poloidal periodicity
+  REAL(dp), PUBLIC, PROTECTED :: Lx = 10._dp
+  !
+  ! Logical switch to apply boundary conditions at the corners
+  ! with nl_corners_top -> .true. the corner bc's are evaluated
+  ! as vertical/poloidal bc's.
+  !
+  LOGICAL, PUBLIC, PROTECTED :: nl_corners_top = .FALSE.
+  !
+  !
+  !
+  ! Number of points in each grid
+  !
+  INTEGER, PUBLIC, PROTECTED :: nxp, nyp, nzp
+  INTEGER, PUBLIC :: izsp, izep, nzpp
+  !
+  ! Indices to start and end of local and global grids
+  ! s -> start, e-> end (local)
+  ! l -> left , r-> right (global)
+  !
+  INTEGER, PUBLIC :: ixs, ixe, ixl, ixr
+  INTEGER, PUBLIC :: iys, iye, iyl, iyr, iyr_ngrid
+  INTEGER, PUBLIC :: izs, ize, izl, izr
+  !
+  ! Number of ghost cells
+  !
+  INTEGER, PUBLIC, PROTECTED :: nzgc=1
+  INTEGER, PUBLIC, PROTECTED :: nygc=1
+  INTEGER, PUBLIC, PROTECTED :: nxgc=1
+  !
+  ! Start and end ghost cells in local and global grids
+  !
+  INTEGER, PUBLIC :: ixsg, ixeg, ixlg, ixrg
+  INTEGER, PUBLIC :: iysg, iyeg, iylg, iyrg
+  INTEGER, PUBLIC :: izsg, izeg, izlg, izrg
+
+  !Array indices in x such that each allgatherv results in the full radial length
+  INTEGER, PUBLIC, PROTECTED :: ix1, ix2
+  INTEGER, PUBLIC, PROTECTED :: ixl1,ixr2
+  INTEGER, PUBLIC :: nx2d, nx1d
+
+  !Array indices in x like ix1 ix2, but including all ghost cells
+  INTEGER, PUBLIC, PROTECTED :: ix1g, ix2g
+
+  !Array indices in y such that each allgatherv results in the full radial length
+  INTEGER, PUBLIC, PROTECTED :: iy1, iy2
+  INTEGER, PUBLIC, PROTECTED :: iyl1,iyr2
+  INTEGER, PUBLIC :: ny2d, ny1d
+
+  !Array indices in y like iy1 iy2, but including all ghost cells
+  INTEGER, PUBLIC, PROTECTED :: iy1g, iy2g
+
+  !Field line poloidal periodicity
+  INTEGER, PUBLIC, PROTECTED :: ix_min    !min index in the open (limited) field line region for each processor
+
+  !Step size in each direction and inverses for finite differences
+  !Radial direction
+  REAL(dp), PUBLIC, PROTECTED :: deltax
+  REAL(dp), PUBLIC  :: deltaxi
+  REAL(dp), PUBLIC, PROTECTED :: deltaxih
+  REAL(dp), PUBLIC, PROTECTED :: deltaxisq
+
+  !Poloidal/vertical direction
+  REAL(dp), PUBLIC, PROTECTED :: deltay
+  REAL(dp), PUBLIC :: deltayi
+  REAL(dp), PUBLIC, PROTECTED :: deltayih
+  REAL(dp), PUBLIC, PROTECTED :: deltayisq
+  real(dp), PUBLIC, PROTECTED :: shiftgr_v = 0.5_dp
+
+  !Toroidal direction
+  REAL(dp), PUBLIC, PROTECTED :: deltaz
+  real(dp), PUBLIC :: deltazi
+  real(dp), PUBLIC, PROTECTED :: deltazisq
+
+  !Parallel direction (should be about the same as deltaz)
+  real(dp), PUBLIC, PROTECTED :: deltaai
+  real(dp), PUBLIC, PROTECTED :: deltaaih
+  real(dp), PUBLIC, PROTECTED :: deltaaisq
+
+  !x,y,z grids
+  REAL(dp), DIMENSION(:), ALLOCATABLE, PUBLIC, PROTECTED :: xarray, yarray, zarray
+
+  !Poloidal angle theta, its sine and cosine... warning, sin and cosine are defined with theta=0 at high-field-side
+  real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: cos_tht
+  real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: sin_tht
+
+  !Variables related to field-aligned parallel gradient operator
+  !Should generalize to have iyshift=1
+  INTEGER, PUBLIC, PROTECTED :: iyshift
+  INTEGER, PUBLIC, PROTECTED :: iyshift_d
+  INTEGER, PUBLIC, PROTECTED :: iyshift_u
+  INTEGER, PUBLIC, PROTECTED :: ny_top,ny_bot   !Upwinding in n2n parallel gradients
+  INTEGER, PUBLIC, PROTECTED :: ny_topu,ny_botu !Upwinding in n2v parallel gradient with arbitrary iyshift
+  INTEGER, PUBLIC, PROTECTED :: ny_topd,ny_botd !Upwinding in v2n parallel gradient with arbitrary iyshift
+  INTEGER, PUBLIC, PROTECTED :: ny_zg
+  INTEGER, PUBLIC :: kl,ku,neq
+
+
+
+END MODULE space_grid
diff --git a/exercises/gradients/test_gradients.F90 b/exercises/gradients/test_gradients.F90
new file mode 100644
index 0000000..7d86cdf
--- /dev/null
+++ b/exercises/gradients/test_gradients.F90
@@ -0,0 +1,108 @@
+subroutine initial_setup
+  use space_grid
+  use cuda_interface
+  use gradients
+  use iso_c_binding
+  implicit none
+  integer :: i,j,k
+  izsg=-1
+  ixsg=-1
+  iysg=-1
+  izeg=3
+  ixeg=1202
+  iyeg=2002
+  izs=1
+  ixs=1
+  iys=1
+  ize=1
+  ixe=1200
+  iye=2000
+
+  deltazi=0.03
+  deltayi=0.003
+  deltayi=0.005
+  call gbs_allocate_cuda(strmf_cpu,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(strmf_gpu,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(indata,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(gradpar_y_n,iysg,iyeg,ixsg,ixeg)
+  call gbs_allocate_cuda(gradpar_x_n,iysg,iyeg,ixsg,ixeg)
+  call gbs_allocate_cuda(gradpar_y_n_gpu,iysg,iyeg,ixsg,ixeg)
+  call gbs_allocate_cuda(gradpar_x_n_gpu,iysg,iyeg,ixsg,ixeg)
+ 
+  call gbs_allocate_cuda(f_z,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_y,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_x,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_n,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_x_n,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+
+end subroutine initial_setup
+
+subroutine fill_arrays
+  use space_grid
+  use cuda_interface
+  use gradients
+  use iso_c_binding
+  implicit none
+  integer :: i,j,k
+  call initial_setup 
+ 
+  call intialize_coefficients
+  do k=izsg,izeg
+    do i=ixsg,ixeg
+      do j=iysg,iyeg
+         indata(j, i, k) = dble(i*123.0+j*5.1+k*45.12)
+      end do
+    end do
+  end do
+  nx_cuda = ixeg-ixsg+1
+  ny_cuda = iyeg-iysg+1
+  nz_cuda = izeg-izsg+1
+  do i=ixsg,ixeg
+    do j=iysg,iyeg
+       gradpar_x_n(j, i) = dble(i*123.0+j*5.1)
+       gradpar_y_n(j, i) = dble(i*12.0+j*15.1)
+    end do
+  end do
+  gradpar_x_n_gpu(:,:) = gradpar_x_n(:,:)
+  gradpar_y_n_gpu(:,:) = gradpar_y_n(:,:)
+
+  gradpar_z = 0.5
+
+end subroutine fill_arrays
+
+program main
+  use space_grid
+  use cuda_interface
+  use gradients
+  use iso_c_binding
+  implicit none
+
+  call initial_setup 
+ 
+  call intialize_coefficients
+
+  call fill_arrays
+
+  call gradz_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call gradz_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'gradz_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call gradz_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call gradz_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'gradz_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call grady_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call grady_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'grady_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call interp_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call interp_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'interp_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call gradx_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call gradx_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'gradx_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+
+  CALL gradpar_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  CALL gradpar_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:), gradpar_z)
+  write(*,*) 'gradpar_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+
+
+end program main
+