diff --git a/examples/saxpy/Makefile b/examples/saxpy/Makefile
new file mode 100644
index 0000000..4d4c92f
--- /dev/null
+++ b/examples/saxpy/Makefile
@@ -0,0 +1,13 @@
+F90=nvfortran
+F90FLAGS=-Mcuda
+
+all:    saxpy.x
+          
+
+saxpy.x: saxpy.o
+	$(F90) $(F90FLAGS) saxpy.o -o $@
+saxpy.o: saxpy.F90
+	$(F90) $(F90FLAGS) -c saxpy.F90 -o $@
+
+clean:
+	rm *.x *.o
diff --git a/examples/saxpy/saxpy.F90 b/examples/saxpy/saxpy.F90
new file mode 100644
index 0000000..41f2659
--- /dev/null
+++ b/examples/saxpy/saxpy.F90
@@ -0,0 +1,48 @@
+module mathOps
+contains
+  attributes(global) subroutine saxpy(x, y, a)
+    implicit none
+    real :: x(:), y(:)
+    real, value :: a 
+    integer :: i, n
+    n = size(x)
+    i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
+    if (i <= n) y(i) = y(i) + a*x(i)
+  end subroutine saxpy
+end module mathOps
+
+program testSaxpy
+  use mathOps
+  use cudafor
+  implicit none
+  integer, parameter :: N = 20*1024*1024
+  real :: x(N), y(N), a
+  real, device :: x_d(N), y_d(N)
+  type(dim3) :: grid, tBlock
+  type (cudaEvent) :: startEvent, stopEvent
+  real :: time
+  integer :: istat
+
+  tBlock = dim3(512,1,1)
+  grid = dim3(ceiling(real(N)/tBlock%x),1,1)
+
+  istat = cudaEventCreate(startEvent)
+  istat = cudaEventCreate(stopEvent)
+
+  x = 1.0; y = 2.0; a = 2.0
+  x_d = x
+  y_d = y
+  istat = cudaEventRecord(startEvent, 0)
+  call saxpy<<<grid, tblock>>>(x_d, y_d, a)
+  istat = cudaEventRecord(stopEvent, 0)
+  istat = cudaEventSynchronize(stopEvent)
+  istat = cudaEventElapsedTime(time, startEvent, stopEvent)
+
+  y = y_d
+  write(*,*) 'Max error: ', maxval(abs(y-4.0))
+  write(*,*) 'Effective Bandwidth (GB/s): ', N*4*3/time/10**6
+
+  istat = cudaEventDestroy(startEvent)
+  istat = cudaEventDestroy(stopEvent)
+
+end program testSaxpy
diff --git a/examples/saxpy/script.sh b/examples/saxpy/script.sh
new file mode 100644
index 0000000..b1ff517
--- /dev/null
+++ b/examples/saxpy/script.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-19.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+srun  -n 1 ./saxpy.x > output_saxpy
diff --git a/examples/warmup/script.sh b/examples/warmup/script.sh
new file mode 100644
index 0000000..006f956
--- /dev/null
+++ b/examples/warmup/script.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -l
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-core=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --reservation=spc-cuda-training-12.04
+#SBATCH --account=spc-cuda-training
+#SBATCH --time=0:05:00
+
+module load nvhpc
+
+nvidia-smi -a > output
diff --git a/exercises/binding/Makefile b/exercises/binding/Makefile
new file mode 100644
index 0000000..1111bb7
--- /dev/null
+++ b/exercises/binding/Makefile
@@ -0,0 +1,10 @@
+all: bind
+
+bind: c_func.o f_mod.o fmain.o
+	$(F90) $(F90FLAGS) $? -o $@
+
+%.o: %.f90
+	$(F90) $(F90FLAGS) $< -o $@ -c
+
+clean:
+	rm -rf *.o *.mod bind
diff --git a/exercises/binding/c_func.c b/exercises/binding/c_func.c
new file mode 100644
index 0000000..6b1f5e2
--- /dev/null
+++ b/exercises/binding/c_func.c
@@ -0,0 +1,11 @@
+/* -------------------------------------------------------------------------- */
+#include "c_func.h"
+/* -------------------------------------------------------------------------- */
+#include <stdio.h>
+/* -------------------------------------------------------------------------- */
+
+void c_function(matrix * in, double d) {
+    printf("d = %g\n", d);
+
+    // print matrix to screan
+}
diff --git a/exercises/binding/c_func.h b/exercises/binding/c_func.h
new file mode 100644
index 0000000..e4635d7
--- /dev/null
+++ b/exercises/binding/c_func.h
@@ -0,0 +1,11 @@
+#ifndef __C_FUNC_H_
+#define __C_FUNC_H_
+
+typedef struct {
+    int m, n;
+    double * data;
+} matrix;
+
+void c_function(matrix * in, double d);
+
+#endif // __C_FUNC_H_
diff --git a/exercises/binding/f_mod.f90 b/exercises/binding/f_mod.f90
new file mode 100644
index 0000000..afaf1b2
--- /dev/null
+++ b/exercises/binding/f_mod.f90
@@ -0,0 +1,31 @@
+module f_mod
+  use, intrinsic :: iso_c_binding, only : c_int, c_double, c_ptr, c_loc
+  implicit none
+
+contains
+  subroutine f_func(data)
+    type, bind(C) :: c_matrix
+       integer(c_int) :: m
+       integer(c_int) :: n
+       type(c_ptr) :: data
+    end type c_matrix
+
+    interface
+       subroutine c_function(mat, d) BIND(C, name='c_function')
+         import c_matrix, c_double
+         type(c_matrix) :: mat
+         real(c_double), value :: d
+       end subroutine c_function
+    end interface
+
+    type(c_matrix) :: mat
+    double precision, dimension(:, :), allocatable, intent(in) :: data
+    real(c_double) :: d = 1.45
+
+    mat%m = size(data, 1)
+    mat%n = size(data, 2)
+    mat%data = c_loc(data(lbound(data,1), lbound(data, 2)))
+
+    call c_function(mat, d)
+  end subroutine f_func
+end module f_mod
diff --git a/exercises/binding/fmain.f90 b/exercises/binding/fmain.f90
new file mode 100644
index 0000000..d2c6668
--- /dev/null
+++ b/exercises/binding/fmain.f90
@@ -0,0 +1,19 @@
+program test_binding
+  use f_mod
+
+  implicit none
+
+  double precision, dimension(:, :), allocatable :: data
+  integer :: i, j
+  integer :: m = 20, n = 10
+
+  allocate (data(m, n))
+
+  do i = 1, m
+     do j = 1, n
+        ! initialize data here
+     end do
+  end do
+
+  call f_func(data)
+end program test_binding
diff --git a/exercises/binding/solution/Makefile b/exercises/binding/solution/Makefile
new file mode 100644
index 0000000..7da3388
--- /dev/null
+++ b/exercises/binding/solution/Makefile
@@ -0,0 +1,16 @@
+CXXFLAGS=-std=c++11 -g -O3
+CFLAGS=-g -O3
+LDFLAGS=-g -O3
+F90FLAGS=-g -O3
+
+
+all: clean bind
+
+bind: c_func.o f_mod.o fmain.o
+	$(F90) $(F90FLAGS) $? -o $@
+
+%.o: %.f90
+	$(F90) $(F90FLAGS) $< -o $@ -c
+
+clean:
+	rm -rf *.o *.mod bind
diff --git a/exercises/binding/solution/c_func.c b/exercises/binding/solution/c_func.c
new file mode 100644
index 0000000..1497ab0
--- /dev/null
+++ b/exercises/binding/solution/c_func.c
@@ -0,0 +1,15 @@
+/* -------------------------------------------------------------------------- */
+#include "c_func.h"
+/* -------------------------------------------------------------------------- */
+#include <stdio.h>
+/* -------------------------------------------------------------------------- */
+
+void c_function(matrix * in, double d) {
+    printf("d = %g\n", d);
+    for(int i = 0; i < in->m; ++i) {
+        for(int j = 0; j < in->n; ++j) {
+            printf("%3g ", in->data[j * in->m + i]);
+        }
+        printf("\n");
+    }
+}
diff --git a/exercises/binding/solution/c_func.h b/exercises/binding/solution/c_func.h
new file mode 100644
index 0000000..e4635d7
--- /dev/null
+++ b/exercises/binding/solution/c_func.h
@@ -0,0 +1,11 @@
+#ifndef __C_FUNC_H_
+#define __C_FUNC_H_
+
+typedef struct {
+    int m, n;
+    double * data;
+} matrix;
+
+void c_function(matrix * in, double d);
+
+#endif // __C_FUNC_H_
diff --git a/exercises/binding/solution/f_mod.f90 b/exercises/binding/solution/f_mod.f90
new file mode 100644
index 0000000..afaf1b2
--- /dev/null
+++ b/exercises/binding/solution/f_mod.f90
@@ -0,0 +1,31 @@
+module f_mod
+  use, intrinsic :: iso_c_binding, only : c_int, c_double, c_ptr, c_loc
+  implicit none
+
+contains
+  subroutine f_func(data)
+    type, bind(C) :: c_matrix
+       integer(c_int) :: m
+       integer(c_int) :: n
+       type(c_ptr) :: data
+    end type c_matrix
+
+    interface
+       subroutine c_function(mat, d) BIND(C, name='c_function')
+         import c_matrix, c_double
+         type(c_matrix) :: mat
+         real(c_double), value :: d
+       end subroutine c_function
+    end interface
+
+    type(c_matrix) :: mat
+    double precision, dimension(:, :), allocatable, intent(in) :: data
+    real(c_double) :: d = 1.45
+
+    mat%m = size(data, 1)
+    mat%n = size(data, 2)
+    mat%data = c_loc(data(lbound(data,1), lbound(data, 2)))
+
+    call c_function(mat, d)
+  end subroutine f_func
+end module f_mod
diff --git a/exercises/binding/solution/fmain.f90 b/exercises/binding/solution/fmain.f90
new file mode 100644
index 0000000..533aeaa
--- /dev/null
+++ b/exercises/binding/solution/fmain.f90
@@ -0,0 +1,19 @@
+program test_binding
+  use f_mod
+
+  implicit none
+
+  double precision, dimension(:, :), allocatable :: data
+  integer :: i, j
+  integer :: m = 20, n = 10
+
+  allocate (data(m, n))
+
+  do i = 1, m
+     do j = 1, n
+        data(i, j) = n * (i-1) + (j-1)
+     end do
+  end do
+
+  call f_func(data)
+end program test_binding
diff --git a/exercises/binding/solution_cxx/Makefile b/exercises/binding/solution_cxx/Makefile
new file mode 100644
index 0000000..81e90f8
--- /dev/null
+++ b/exercises/binding/solution_cxx/Makefile
@@ -0,0 +1,14 @@
+CXXFLAGS=-std=c++11 -g -O3
+LDFLAGS=-g -O3
+F90FLAGS=-g -O3
+
+all: bind
+
+bind: c_func.o f_mod.o fmain.o
+	$(F90) $(F90FLAGS) $? -o $@
+
+%.o: %.f90
+	$(F90) $(F90FLAGS) $< -o $@ -c
+
+clean:
+	rm -rf *.o *.mod bind
diff --git a/exercises/binding/solution_cxx/c_func.cc b/exercises/binding/solution_cxx/c_func.cc
new file mode 100644
index 0000000..86affd7
--- /dev/null
+++ b/exercises/binding/solution_cxx/c_func.cc
@@ -0,0 +1,16 @@
+/* -------------------------------------------------------------------------- */
+#include "c_func.h"
+/* -------------------------------------------------------------------------- */
+#include <stdio.h>
+/* -------------------------------------------------------------------------- */
+
+void c_function(matrix * in, double d) {
+    printf("d = %g\n", d);
+    auto & mat = *in;
+    for(int i = 0; i < mat.m; ++i) {
+        for(int j = 0; j < mat.n; ++j) {
+            printf("%g ", mat(i, j));
+        }
+        printf("\n");
+    }
+}
diff --git a/exercises/binding/solution_cxx/c_func.h b/exercises/binding/solution_cxx/c_func.h
new file mode 100644
index 0000000..5d76654
--- /dev/null
+++ b/exercises/binding/solution_cxx/c_func.h
@@ -0,0 +1,18 @@
+#ifndef __C_FUNC_H_
+#define __C_FUNC_H_
+
+extern "C" {
+
+struct matrix {
+    int m, n;
+    double * data;
+    inline double & operator()(int i, int j) {
+        return data[j*m + i];
+    }
+};
+
+void c_function(matrix * in, double d);
+
+}
+
+#endif // __C_FUNC_H_
diff --git a/exercises/binding/solution_cxx/f_mod.f90 b/exercises/binding/solution_cxx/f_mod.f90
new file mode 100644
index 0000000..afaf1b2
--- /dev/null
+++ b/exercises/binding/solution_cxx/f_mod.f90
@@ -0,0 +1,31 @@
+module f_mod
+  use, intrinsic :: iso_c_binding, only : c_int, c_double, c_ptr, c_loc
+  implicit none
+
+contains
+  subroutine f_func(data)
+    type, bind(C) :: c_matrix
+       integer(c_int) :: m
+       integer(c_int) :: n
+       type(c_ptr) :: data
+    end type c_matrix
+
+    interface
+       subroutine c_function(mat, d) BIND(C, name='c_function')
+         import c_matrix, c_double
+         type(c_matrix) :: mat
+         real(c_double), value :: d
+       end subroutine c_function
+    end interface
+
+    type(c_matrix) :: mat
+    double precision, dimension(:, :), allocatable, intent(in) :: data
+    real(c_double) :: d = 1.45
+
+    mat%m = size(data, 1)
+    mat%n = size(data, 2)
+    mat%data = c_loc(data(lbound(data,1), lbound(data, 2)))
+
+    call c_function(mat, d)
+  end subroutine f_func
+end module f_mod
diff --git a/exercises/binding/solution_cxx/fmain.f90 b/exercises/binding/solution_cxx/fmain.f90
new file mode 100644
index 0000000..533aeaa
--- /dev/null
+++ b/exercises/binding/solution_cxx/fmain.f90
@@ -0,0 +1,19 @@
+program test_binding
+  use f_mod
+
+  implicit none
+
+  double precision, dimension(:, :), allocatable :: data
+  integer :: i, j
+  integer :: m = 20, n = 10
+
+  allocate (data(m, n))
+
+  do i = 1, m
+     do j = 1, n
+        data(i, j) = n * (i-1) + (j-1)
+     end do
+  end do
+
+  call f_func(data)
+end program test_binding
diff --git a/exercises/gradients/Makefile b/exercises/gradients/Makefile
new file mode 100644
index 0000000..fa6f63c
--- /dev/null
+++ b/exercises/gradients/Makefile
@@ -0,0 +1,28 @@
+F90=gfortran
+NVCC=nvcc
+LDFLAGS=-lc -lstdc++ -lcuda -lcudart -lcudadevrt
+F90FLAGS= -ffree-line-length-0 -O3  -g
+
+
+all:  test_gradients.x
+          
+test_gradients.x: cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o gradients_mod.o test_gradients.o 
+	$(F90) $(LDFLAGS) cudalloc.o grad_test.o prec_const_mod.o space_grid_mod.o cuda_interface_mod.o \
+                          gradients_mod.o test_gradients.o -o $@
+test_gradients.o: test_gradients.F90
+	$(F90) $(F90FLAGS) -c test_gradients.F90 -o $@
+cuda_interface_mod.o: cuda_interface_mod.F90
+	$(F90) $(F90FLAGS) -c cuda_interface_mod.F90 -o $@
+space_grid_mod.o: space_grid_mod.F90
+	$(F90) $(F90FLAGS) -c space_grid_mod.F90 -o $@
+prec_const_mod.o: prec_const_mod.F90
+	$(F90) $(F90FLAGS) -c prec_const_mod.F90 -o $@
+gradients_mod.o: gradients_mod.F90
+	$(F90) $(F90FLAGS) -c gradients_mod.F90 -o $@
+grad_test.o: grad_test.cu
+	$(NVCC) $(NVCCFLAGS) -c grad_test.cu -o $@
+cudalloc.o: cudalloc.cu
+	$(NVCC) $(NVCCFLAGS) -c cudalloc.cu -o $@
+
+clean:
+	rm *.x *.o
diff --git a/exercises/gradients/cuda_interface_mod.F90 b/exercises/gradients/cuda_interface_mod.F90
new file mode 100644
index 0000000..f0decef
--- /dev/null
+++ b/exercises/gradients/cuda_interface_mod.F90
@@ -0,0 +1,128 @@
+
+MODULE cuda_interface
+use prec_const
+INTERFACE
+  FUNCTION allocate_cuda_memory(n) BIND(C, NAME='allocate_cuda_memory')
+    use iso_c_binding
+    TYPE(C_PTR) :: allocate_cuda_memory
+    integer :: n
+  END FUNCTION allocate_cuda_memory
+
+
+END INTERFACE
+  INTERFACE gbs_allocate_cuda
+     MODULE PROCEDURE gbs_allocate_cuda_dp3, gbs_allocate_cuda_dp2, gbs_allocate_cuda_dp1
+  END INTERFACE gbs_allocate_cuda
+  real(dp), pointer, save :: strmf_cpu(:,:,:),strmf_gpu(:,:,:), indata(:,:,:)
+  real(dp), pointer, save :: strmf1_cpu(:,:,:), strmf1_gpu(:,:,:)
+  real(dp), pointer, save :: grady_f(:,:,:), grady_diff(:,:,:), diff(:,:,:), f_x(:,:,:)
+  real(dp), pointer, save :: f_z(:,:,:), f_y(:,:,:), f_n(:,:,:), f_x_n(:,:,:)
+  real(dp), pointer, save :: f_v(:,:,:), f_x_v(:,:,:), f_xx_n(:,:,:),f_xx_v(:,:,:)
+  real(dp), pointer, save :: f_zz(:,:,:), f_yy(:,:,:), f_xx(:,:,:)
+  real(dp), pointer, save :: f_zy(:,:,:), f_yz(:,:,:), f_zx(:,:,:)
+  real(dp), pointer, save :: f_xz(:,:,:), f_xy(:,:,:), f_yx(:,:,:)
+
+  real(dp), pointer, save :: gradpar_x_n_gpu(:,:), gradpar_x_v_gpu(:,:), gradpar_y_n_gpu(:,:), gradpar_y_v_gpu(:,:)
+  real(dp), pointer, save :: grad2par_yy_n_gpu(:,:), grad2par_xx_n_gpu(:,:), grad2par_x_n_gpu(:,:), grad2par_y_n_gpu(:,:)
+  real(dp), pointer, save :: grad2par_zy_n_gpu(:,:), grad2par_zx_n_gpu(:,:), grad2par_xy_n_gpu(:,:)
+  real(dp), pointer, save :: grad2par_yy_v_gpu(:,:), grad2par_xx_v_gpu(:,:), grad2par_x_v_gpu(:,:), grad2par_y_v_gpu(:,:)
+  real(dp), pointer, save :: grad2par_zy_v_gpu(:,:), grad2par_zx_v_gpu(:,:), grad2par_xy_v_gpu(:,:)
+  real(dp), pointer, save :: cur_x_n_gpu(:), cur_x_v_gpu(:), ccur_xx_n_gpu(:),  ccur_xx_v_gpu(:)
+  real(dp), pointer, save :: cur_y_n_gpu(:,:), cur_y_v_gpu(:,:), ccur_yy_n_gpu(:,:), ccur_xy_n_gpu(:,:) 
+  real(dp), pointer, save :: ccur_yy_v_gpu(:,:), ccur_xy_v_gpu(:,:) 
+  real(dp), pointer, dimension(:,:,:), save ::  A_x_n_gpu, A_x_v_gpu, A_y_v_gpu, B_x_v_gpu, B_y_v_gpu
+  real(dp), pointer, dimension(:,:,:), save ::  B_x_n_gpu, B_y_n_gpu, A_y_n_gpu   
+  real(dp), pointer, dimension(:,:), save ::  gradcur_xx_v_gpu, gradcur_yy_v_gpu, gradcur_xy_v_gpu, gradcur_yx_v_gpu
+  real(dp), pointer, dimension(:,:), save ::  gradcur_zx_v_gpu, gradcur_zy_v_gpu, gradcur_x_v_gpu, gradcur_y_v_gpu
+  real(dp), pointer, dimension(:,:), save ::  curgrad_xx_n_gpu, curgrad_yy_n_gpu, curgrad_xy_n_gpu, curgrad_yx_n_gpu
+  real(dp), pointer, dimension(:,:), save ::  curgrad_zx_n_gpu, curgrad_zy_n_gpu, curgrad_x_n_gpu, curgrad_y_n_gpu
+
+  !real(dp), pointer, save :: gradpar_x_n_gpu(:,:,:), gradpar_x_v_gpu(:,:,:), gradpar_y_n_gpu(:,:,:), gradpar_y_v_gpu(:,:,:)
+  integer :: nx_cuda, ny_cuda, nz_cuda
+contains
+
+  SUBROUTINE gbs_allocate_cuda_dp3(a,is1,ie1,is2,ie2,is3,ie3)
+    use iso_c_binding
+    real(dp), DIMENSION(:,:,:), pointer, INTENT(INOUT) :: a
+    INTEGER, INTENT(IN) :: is1,ie1,is2,ie2,is3,ie3
+    integer ndata
+    ndata=(ie3-is3+1)*(ie2-is2+1)*(ie1-is1+1) 
+    call c_f_pointer(allocate_cuda_memory(ndata), a, \
+                    [(ie1-is1+1),(ie2-is2+1),(ie3-is3+1)])
+    a(is1:,is2:,is3:) => a
+    a=0.0_dp
+  END SUBROUTINE gbs_allocate_cuda_dp3
+
+  SUBROUTINE gbs_allocate_cuda_dp2(a,is1,ie1,is2,ie2)
+    use iso_c_binding
+    real(dp), DIMENSION(:,:), pointer, INTENT(INOUT) :: a
+    INTEGER, INTENT(IN) :: is1,ie1,is2,ie2
+    integer ndata
+    ndata=(ie2-is2+1)*(ie1-is1+1) 
+    call c_f_pointer(allocate_cuda_memory(ndata), a, \
+                    [(ie1-is1+1),(ie2-is2+1)])
+    a(is1:,is2:) => a
+    a=0.0_dp
+  END SUBROUTINE gbs_allocate_cuda_dp2
+
+  SUBROUTINE gbs_allocate_cuda_dp1(a,is1,ie1)
+    use iso_c_binding
+    real(dp), DIMENSION(:), pointer, INTENT(INOUT) :: a
+    INTEGER, INTENT(IN) :: is1,ie1
+    integer ndata
+    ndata=(ie1-is1+1) 
+    call c_f_pointer(allocate_cuda_memory(ndata), a, \
+                    [(ie1-is1+1)])
+    a(is1:) => a
+    a=0.0_dp
+  END SUBROUTINE gbs_allocate_cuda_dp1
+
+
+  subroutine gradz_n2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call gradz_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi)
+  end subroutine gradz_n2n_fd4cuda
+
+  subroutine gradz_v2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call gradz_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltazi)
+  end subroutine gradz_v2n_fd4cuda
+
+  subroutine grady_v2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call grady_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltayi)
+  end subroutine grady_v2n_fd4cuda
+
+  subroutine interp_v2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call interp_v2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda)
+  end subroutine interp_v2n_fd4cuda
+
+  subroutine gradx_n2n_fd4cuda(f_in, f_out)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in      
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+     call gradx_n2n_fd4_cuda(f_in, f_out, nz_cuda, nx_cuda, ny_cuda, deltaxi)
+  end subroutine gradx_n2n_fd4cuda
+
+
+  subroutine gradpar_v2n_fd4cuda(f_in, f_out,  gradpar_z)
+     use space_grid
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f_in
+     real(dp),  INTENT(in) :: gradpar_z
+     real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_out
+
+     CALL gradpar_v2n_fd4_cuda(f_in, f_out,  nz_cuda, nx_cuda, ny_cuda, f_z, f_y, f_n, \
+                            f_x_n, deltazi, deltayi, deltaxi, gradpar_z, gradpar_x_n_gpu, gradpar_y_n_gpu)
+  end subroutine gradpar_v2n_fd4cuda
+
+
+END MODULE cuda_interface
diff --git a/exercises/gradients/cudalloc.cu b/exercises/gradients/cudalloc.cu
new file mode 100644
index 0000000..1186170
--- /dev/null
+++ b/exercises/gradients/cudalloc.cu
@@ -0,0 +1,76 @@
+/*#include <stdio.h>
+//#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+struct cuda_struct{
+   double *coef_der_cuda;
+};
+
+*/
+#include "cudalloc.h"
+/*__global__ void set_coefficients(cuda_struct *structure){
+   structure->coef_der_cuda[0] = 1.0/12.0;
+   structure->coef_der_cuda[1] = -2.0/3.0;
+   structure->coef_der_cuda[2] = 2.0/3.0;
+   structure->coef_der_cuda[3] = -1.0/12.0;
+*/
+__global__ void set_coefficients(double *coef, double *coefstag, double *coef_int){
+coef[0] = 1.0/12.0;
+coef[1] = -2.0/3.0;
+coef[2] = 2.0/3.0;
+coef[3] = -1.0/12.0;
+coef_int[0] = -1.0/16.0;
+coef_int[1] = 9.0/16.0;
+coef_int[2] = 9.0/16.0;
+coef_int[3] = -1.0/16.0;
+coefstag[0] = 1.0/24.0;
+coefstag[1] = -9.0/8.0;
+coefstag[2] = 9.0/8.0;
+coefstag[3] = -1.0/24.0;
+}
+void set_extent(int *out, int in){
+*out = in;
+}
+extern "C" void allocate_cuda_struct(cuda_struct *& structure){
+   structure = new cuda_struct();
+   return;
+}
+
+//__global__ void print_coefficients(cuda_struct *structure){
+__global__ void print_coefficients(double *coef){
+}
+extern "C" void fill_cuda_struct(cuda_struct *structure, int *izeg, int *izsg, int *iyeg, int *iysg, int *ixeg, int *ixsg){
+//extern "C" void fill_cuda_struct(cuda_struct *structure){
+
+   cudaMallocManaged( (void **)&(structure->coef_der_cuda), sizeof(double) * 4 );
+   cudaMallocManaged( (void **)&(structure->coef_der1_stag), sizeof(double) * 4 );
+   cudaMallocManaged( (void **)&(structure->coef_int_cuda), sizeof(double) * 4 );
+   set_coefficients<<<1, 1>>>(structure->coef_der_cuda, structure->coef_der1_stag, structure->coef_int_cuda);
+   //set_coefficients<<<1, 1>>>(structure->coef_der_cuda);
+   //set_extent((structure->nz), (*izeg-*izsg+1));
+   //set_extent(&(structure->ny), (*izeg-*izsg+1));
+   //set_extent(&(structure->nx), (*izeg-*izsg+1));
+   //set_extent(&(structure->nx), (*iyeg-*iysg+1));
+   //&(structure->nz) = &k;//(*izeg-*izsg+1);
+   //&(structure->ny) = &k;//(*ixeg-*ixsg+1);
+   //&(structure->nx) = &k;//(*iyeg-*iysg+1);
+   //printf("AAAAAAA %d %d %d\n", structure->nz, structure->ny, structure->nx);
+   //set_extent<<<1, 1>>>(structure->nz, (*izeg-*izsg+1));
+   //set_extent<<<1, 1>>>(structure->ny, (*ixeg-*ixsg+1));
+   //set_extent<<<1, 1>>>(structure->nx, (*iyeg-*iysg+1));
+   cudaDeviceSynchronize();
+
+   printf("cudalloc coef_int %f %f %f %f \n", structure->coef_int_cuda[0], structure->coef_int_cuda[1], structure->coef_int_cuda[2], structure->coef_int_cuda[3]);
+   return;
+}
+
+extern "C" void *allocate_cuda_memory(int *n){
+   double *a;
+   int N=*n;
+   cudaMallocManaged( (void **)&a, sizeof(double) * N );
+   cudaDeviceSynchronize();
+   return (void *) a;
+}
+
+
+
diff --git a/exercises/gradients/cudalloc.h b/exercises/gradients/cudalloc.h
new file mode 100644
index 0000000..0d950a9
--- /dev/null
+++ b/exercises/gradients/cudalloc.h
@@ -0,0 +1,11 @@
+#include <stdio.h>
+//#include <stdlib.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+struct cuda_struct{
+   double *coef_der_cuda;
+   double *coef_int_cuda;
+   double *coef_der1_stag;
+   int *nz, *ny, *nx;
+};
diff --git a/exercises/gradients/grad_test.cu b/exercises/gradients/grad_test.cu
new file mode 100644
index 0000000..3b9eda0
--- /dev/null
+++ b/exercises/gradients/grad_test.cu
@@ -0,0 +1,256 @@
+#include <cuda.h>
+//#include <cuda_runtime.h>
+//#include "unistd.h"
+
+
+__constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 };
+__constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 };
+__constant__ __device__ double coef_der2[5] =  {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0};
+__constant__ __device__ double coef_der1_n2n[4] =  {  1.0/12.0,  -2.0/3.0,  2.0/3.0,  -1.0/12.0};
+__constant__ __device__ double coef_der2_stag[4] = { 1.0/2., -1.0/2.0, -1.0/2., 1.0/2.0};
+
+__device__ int xyindex(int i,  int nz )
+{
+   return i%nz;
+}
+
+
+__device__ int disp(int i, int j, int k, int nx, int ny)
+{
+   return i + j*nx + k*nx*ny;
+}
+
+__device__ void gradz_n2n_fd4Device(double *f, double *fz, int nz, int ny, int nx, double fact)
+{
+     int index = blockDim.x*blockIdx.x+threadIdx.x;
+   int d1=index+disp(0,0,-2,nx,ny);
+   int d2=index+disp(0,0,-1,nx,ny);
+   int d3=index+disp(0,0,1,nx,ny);
+   int d4=index+disp(0,0,2,nx,ny);
+   if (d1>=0 && d4<(nx*ny*nz)){
+      fz[index] = coef_der1_n2n[0]*f[d1] + coef_der1_n2n[1]*f[d2] + 
+                  coef_der1_n2n[2]*f[d3] + coef_der1_n2n[3]*f[d4];
+      fz[index] *= fact;
+   }
+
+}
+
+__global__ void gradz_n2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){
+     gradz_n2n_fd4Device(f, fz, nz, ny, nx, fact);
+}
+
+extern "C" void gradz_n2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   gradz_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_z, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+
+__device__ void gradz_v2n_fd4Device(double *f, double *f_z_v2n, int nz, int ny, int nx, double fact)
+{
+   int index=blockDim.x*blockIdx.x+threadIdx.x;
+   int d11=index+disp(-1,0,-1,nx,ny);
+   int d12=index+disp(-1,0,0,nx,ny);
+   int d13=index+disp(-1,0,1,nx,ny);
+   int d14=index+disp(-1,0,2,nx,ny);
+   int d21=index+disp(0,0,-1,nx,ny);
+   int d22=index+disp(0,0,0,nx,ny);
+   int d23=index+disp(0,0,1,nx,ny);
+   int d24=index+disp(0,0,2,nx,ny);
+   int d31=index+disp(1,0,-1,nx,ny);
+   int d32=index+disp(1,0,0,nx,ny);
+   int d33=index+disp(1,0,1,nx,ny);
+   int d34=index+disp(1,0,2,nx,ny);
+   int d41=index+disp(2,0,-1,nx,ny);
+   int d42=index+disp(2,0,0,nx,ny);
+   int d43=index+disp(2,0,1,nx,ny);
+   int d44=index+disp(2,0,2,nx,ny);
+
+   if (d11>=0 && d44<(nx*ny*nz)){
+      f_z_v2n[index] = coef_int[0]  * (coef_der1_stag[0]*f[d11] + coef_der1_stag[1]*f[d12] +
+                     coef_der1_stag[2]*f[d13] + coef_der1_stag[3]*f[d14]) + 
+                     coef_int[1] * (coef_der1_stag[0]*f[d21] + coef_der1_stag[1]*f[d22] +
+                     coef_der1_stag[2]*f[d23] + coef_der1_stag[3]*f[d24])+ 
+                     coef_int[2]  * (coef_der1_stag[0]*f[d31] + coef_der1_stag[1]*f[d32] +
+                     coef_der1_stag[2]*f[d33] + coef_der1_stag[3]*f[d34]) +
+                     coef_int[3] *  (coef_der1_stag[0]*f[d41] + coef_der1_stag[1]*f[d42] +
+                     coef_der1_stag[2]*f[d43] + coef_der1_stag[3]*f[d44]);
+      f_z_v2n[index] *= fact;
+   }
+
+}
+
+__global__ void gradz_v2n_fd4(double *f, double *fz, int nz, int ny, int nx, double fact){
+     gradz_v2n_fd4Device(f, fz, nz, ny, nx, fact);
+}
+
+extern "C" void gradz_v2n_fd4_cuda_( double *f, double *f_z, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   gradz_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_z, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+__device__ void grady_v2n_fd4Device(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact)
+{
+   int index = blockDim.x*blockIdx.x+threadIdx.x;
+   int d11=index+disp(-1,0,-1,nx,ny);
+   int d12=index+disp(0,0,-1,nx,ny);
+   int d13=index+disp(1,0,-1,nx,ny);
+   int d14=index+disp(2,0,-1,nx,ny);
+   int d21=index+disp(-1,0,0,nx,ny);
+   int d22=index+disp(0,0,0,nx,ny);
+   int d23=index+disp(1,0,0,nx,ny);
+   int d24=index+disp(2,0,0,nx,ny);
+   int d31=index+disp(-1,0,1,nx,ny);
+   int d32=index+disp(0,0,1,nx,ny);
+   int d33=index+disp(1,0,1,nx,ny);
+   int d34=index+disp(2,0,1,nx,ny);
+   int d41=index+disp(-1,0,2,nx,ny);
+   int d42=index+disp(0,0,2,nx,ny);
+   int d43=index+disp(1,0,2,nx,ny);
+   int d44=index+disp(2,0,2,nx,ny);
+
+   if(d11>=0 && d44<(nx*ny*nz)){
+      f_y_v2n[index] = coef_int[0]  * (coef_der1_stag[0]*f[d11] + coef_der1_stag[1]*f[d12] +
+                     coef_der1_stag[2]*f[d13] + coef_der1_stag[3]*f[d14]) + 
+                     coef_int[1] * (coef_der1_stag[0]*f[d21] + coef_der1_stag[1]*f[d22] +
+                     coef_der1_stag[2]*f[d23] + coef_der1_stag[3]*f[d24])+ 
+                     coef_int[2]  * (coef_der1_stag[0]*f[d31] + coef_der1_stag[1]*f[d32] +
+                     coef_der1_stag[2]*f[d33] + coef_der1_stag[3]*f[d34]) +
+                     coef_int[3]  * (coef_der1_stag[0]*f[d41] + coef_der1_stag[1]*f[d42] +
+                     coef_der1_stag[2]*f[d43] + coef_der1_stag[3]*f[d44]);
+      f_y_v2n[index] *= fact;
+   }
+
+}
+
+__global__ void grady_v2n_fd4(double *f, double *f_y_v2n, int nz, int ny, int nx, double fact){
+     grady_v2n_fd4Device(f, f_y_v2n, nz, ny, nx, fact);
+}
+
+extern "C" void grady_v2n_fd4_cuda_( double *f, double *f_y_v2n, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   grady_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_y_v2n, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+__device__ void interp_v2n_fd4Device(double *f, double *f_int_v2n, int nz, int ny, int nx)
+{
+   int index=blockDim.x*blockIdx.x+threadIdx.x;
+   int d11=index+disp(-1,0,-1,nx,ny);
+   int d12=index+disp(-1,0,0,nx,ny);
+   int d13=index+disp(-1,0,1,nx,ny);
+   int d14=index+disp(-1,0,2,nx,ny);
+   int d21=index+disp(0,0,-1,nx,ny);
+   int d22=index+disp(0,0,0,nx,ny);
+   int d23=index+disp(0,0,1,nx,ny);
+   int d24=index+disp(0,0,2,nx,ny);
+   int d31=index+disp(1,0,-1,nx,ny);
+   int d32=index+disp(1,0,0,nx,ny);
+   int d33=index+disp(1,0,1,nx,ny);
+   int d34=index+disp(1,0,2,nx,ny);
+   int d41=index+disp(2,0,-1,nx,ny);
+   int d42=index+disp(2,0,0,nx,ny);
+   int d43=index+disp(2,0,1,nx,ny);
+   int d44=index+disp(2,0,2,nx,ny);
+
+   if (d11>=0 && d44<(nx*ny*nz)){
+      f_int_v2n[index] = coef_int[0]  * (coef_int[0]*f[d11] + coef_int[1]*f[d12] +
+                     coef_int[2]*f[d13] + coef_int[3]*f[d14]) + 
+                     coef_int[1] * (coef_int[0]*f[d21] + coef_int[1]*f[d22] +
+                     coef_int[2]*f[d23] + coef_int[3]*f[d24])+ 
+                     coef_int[2]  * (coef_int[0]*f[d31] + coef_int[1]*f[d32] +
+                     coef_int[2]*f[d33] + coef_int[3]*f[d34]) +
+                     coef_int[3] *  (coef_int[0]*f[d41] + coef_int[1]*f[d42] +
+                     coef_int[2]*f[d43] + coef_int[3]*f[d44]);
+   }
+
+}
+
+__global__ void interp_v2n_fd4(double *f, double *f_int_v2n, int nz, int ny, int nx){
+     interp_v2n_fd4Device(f, f_int_v2n, nz, ny, nx);
+}
+
+extern "C" void interp_v2n_fd4_cuda_( double *f, double *f_int_v2n, int *nz, int *ny, int *nx){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   interp_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_int_v2n, *nz, *ny, *nx);
+   cudaDeviceSynchronize();
+}
+
+__device__ void gradx_n2n_fd4Device(double *f, double *fx, int nz, int ny, int nx, double fact)
+{
+   int d1=disp(0,1,0,nx,ny);
+   int d2=disp(0,2,0,nx,ny);
+   int index=blockDim.x*blockIdx.x+threadIdx.x;
+   if (index<(nx*ny*nz-d2) && index>=d2){
+      fx[index] = coef_der1_n2n[0]*f[index-d2] + coef_der1_n2n[1]*f[index-d1] + coef_der1_n2n[2]*f[index+d1] + coef_der1_n2n[3]*f[index+d2];
+      fx[index] *= fact;
+   }
+
+}
+
+__global__ void gradx_n2n_fd4(double *f, double *f_x_n2n, int nz, int ny, int nx, double fact){
+     gradx_n2n_fd4Device(f, f_x_n2n, nz, ny, nx, fact);
+}
+
+extern "C" void gradx_n2n_fd4_cuda_( double *f, double *f_x_n2n, int *nz, int *ny, int *nx, double *fact){
+   int n=(*nx)*(*ny)*(*nz);
+   int threadsPerBlock = 256;
+   int numBlocks = ((n + threadsPerBlock -1) / threadsPerBlock);
+   gradx_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_x_n2n, *nz, *ny, *nx, *fact);
+   cudaDeviceSynchronize();
+}
+
+__global__ void gradpar_v2n_fd4(double *f, double *f_grad, int nz, int ny, int nx,  double *f_z, double *f_y, double *f_n, double *f_x_n, double deltazi, double deltayi, double deltaxi, double gradpar_z, double *gradpar_x_n, double *gradpar_y_n)
+{
+   int index=blockDim.x*blockIdx.x+threadIdx.x;
+   int ind=xyindex(index, nx*ny);
+
+   gradz_v2n_fd4Device(f, f_z, nz, ny, nx, deltazi); 
+   grady_v2n_fd4Device(f, f_y, nz, ny, nx, deltayi); 
+   interp_v2n_fd4Device(f, f_n, nz, ny, nx); 
+   __syncthreads();
+   if (index<(nx)*(ny)*(nz)){
+      f_grad[index] = gradpar_z*f_z[index] + gradpar_y_n[ind]*f_y[index] ;
+   }
+
+   gradx_n2n_fd4Device(f_n, f_x_n, nz, ny, nx, deltaxi); 
+   __syncthreads();
+   
+   if (index<(nx)*(ny)*(nz)){
+      f_grad[index] +=  gradpar_x_n[ind]*f_x_n[index];
+   }
+
+}
+
+__global__ void gradpar1_v2n_fd4(double *f, double *f_grad, int nz, int ny, int nx,  double *f_z, double *f_y, double *f_n, double *f_x_n, double deltazi, double deltayi, double deltaxi, double gradpar_z, double *gradpar_x_n, double *gradpar_y_n)
+{
+   int index=blockDim.x*blockIdx.x+threadIdx.x;
+   int ind=xyindex(index, nx*ny);
+   gradx_n2n_fd4Device(f_n, f_x_n, nz, ny, nx, deltaxi); 
+   __syncthreads();
+   
+   if (index<(nx)*(ny)*(nz)){
+      f_grad[index] += gradpar_x_n[ind]*f_x_n[index];
+   }
+
+
+
+}
+
+
+extern "C" void gradpar_v2n_fd4_cuda_(double *f, double *f_grad, int *nz, int *ny, int *nx,  double *f_z, double *f_y, double *f_n, double *f_x_n, double *deltazi, double *deltayi, double *deltaxi, double *gradpar_z, double *gradpar_x_n, double *gradpar_y_n){
+   int n = (*nx)*(*ny)*(*nz);
+   dim3 threadsPerBlock = 1024;
+   dim3 numBlocks = ((n + threadsPerBlock.x -1) / threadsPerBlock.x);
+   gradpar_v2n_fd4<<<numBlocks, threadsPerBlock>>>(f, f_grad, *nz, *ny, *nx, f_z, f_y, f_n, f_x_n, *deltazi, *deltayi, *deltaxi, *gradpar_z, gradpar_x_n, gradpar_y_n);
+   cudaDeviceSynchronize();
+}
diff --git a/exercises/gradients/gradients_mod.F90 b/exercises/gradients/gradients_mod.F90
new file mode 100644
index 0000000..3f34974
--- /dev/null
+++ b/exercises/gradients/gradients_mod.F90
@@ -0,0 +1,219 @@
+module gradients
+use space_grid
+use prec_const
+real(dp), DIMENSION(1:4), public, PROTECTED :: coef_int, coef_der1_stag, coef_der1_n2n, coef_der2_stag
+real(dp), DIMENSION(1:5), public, PROTECTED :: coef_der2
+real(dp), DIMENSION(:,:), pointer :: gradpar_x_n, gradpar_y_n
+real(dp), PUBLIC :: gradpar_z 
+contains
+
+subroutine intialize_coefficients
+    coef_int = (/ -1.0_dp/16.0_dp, 9.0_dp/16.0_dp, 9.0_dp/16.0_dp, -1.0_dp/16.0_dp /)
+    coef_der1_stag = (/ +1.0_dp/24.0_dp, -9.0_dp/8.0_dp, 9.0_dp/8.0_dp, -1.0_dp/24.0_dp /)
+    coef_der2 =  (/-1.0_dp/12.0_dp, 4.0_dp/3.0_dp, -5.0_dp/2.0_dp, +4.0_dp/3.0_dp, -1.0_dp/12.0_dp /)
+    coef_der1_n2n = (/  1.0_dp/12.0_dp,  -2.0_dp/3.0_dp,  2.0_dp/3.0_dp,  -1.0_dp/12.0_dp  /)
+    coef_der2_stag = (/ 1.0_dp/2._dp, -1.0_dp/2.0_dp, -1.0_dp/2._dp, 1.0_dp/2.0_dp /)
+
+end subroutine intialize_coefficients
+
+
+subroutine gradz_n2n_fd4(f , f_z)
+
+  use prec_const
+
+  IMPLICIT none
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z
+  integer :: iz
+  real(dp), DIMENSION(1:4) :: coef_der 
+
+  coef_der(:) = deltazi*coef_der1_n2n(:)
+  f_z(:,:,:) = nan_
+  do iz=izs,ize
+     f_z(:, :, iz) =    coef_der(1)*f(:, :, iz-2)  &
+          + coef_der(2)*f(:, :, iz-1)  &
+          + coef_der(3)*f(:, :, iz+1)  &
+          + coef_der(4)*f(:, :, iz+2)  
+  end do
+
+
+end subroutine gradz_n2n_fd4
+
+
+
+
+subroutine gradz_v2n_fd4(f,f_z_v2n)                 
+
+  use prec_const
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_z_v2n
+  integer :: iy, iz
+  real(dp), DIMENSION(1:4) :: coef_der 
+  coef_der(:) = deltazi*coef_der1_stag(:)
+
+  f_z_v2n(:,:,:) = nan_
+  do iy=iys,iye
+     do iz=izs,ize
+
+        f_z_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
+             + coef_der(2)*f(iy-1, :, iz)     &
+             + coef_der(3)*f(iy-1, :, iz +1)  &
+             + coef_der(4)*f(iy-1, :, iz+2)  )&
+             
+             +  coef_int(2)*(   coef_der(1)*f(iy, :, iz-1)     &
+             + coef_der(2)*f(iy, :, iz)       &
+             + coef_der(3)*f(iy, :, iz+1)     &
+             + coef_der(4)*f(iy, :, iz+2)    )&
+             
+             +  coef_int(3)*(   coef_der(1)*f(iy+1, :, iz-1)    &
+             + coef_der(2)*f(iy+1, :, iz)      &
+             + coef_der(3)*f(iy+1, :, iz+1)    &
+             + coef_der(4)*f(iy+1, :, iz+2)   )&
+             
+             + coef_int(4)*(   coef_der(1)*f(iy+2, :, iz-1)    &
+             + coef_der(2)*f(iy+2, :, iz)      &
+             + coef_der(3)*f(iy+2, :, iz+1)    &
+             + coef_der(4)*f(iy+2, :, iz+2)   )
+
+     end do
+  end do
+
+
+
+end subroutine gradz_v2n_fd4
+
+subroutine grady_v2n_fd4(f,f_y_v2n)
+
+  use prec_const
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_y_v2n
+  integer :: iy, iz
+  real(dp), DIMENSION(1:4) :: coef_der 
+
+  coef_der(:) = deltayi*coef_der1_stag(:)
+
+  f_y_v2n(:,:,:) = nan_
+
+  do iy=iys,iye
+     do iz=izs,ize
+
+        f_y_v2n(iy,:,iz) = coef_int(1)*(   coef_der(1)*f(iy-1, :, iz-1)   &
+             + coef_der(2)*f(iy, :, iz-1)     &
+             + coef_der(3)*f(iy+1, :, iz-1)  &
+             + coef_der(4)*f(iy+2, :, iz-1)  )&
+             
+             + coef_int(2)*(   coef_der(1)*f(iy-1, :, iz)     &
+             + coef_der(2)*f(iy, :, iz)       &
+             + coef_der(3)*f(iy+1, :, iz)     &
+             + coef_der(4)*f(iy+2, :, iz)    )&
+             
+             + coef_int(3)*(   coef_der(1)*f(iy-1, :, iz+1)    &
+             + coef_der(2)*f(iy, :, iz+1)      &
+             + coef_der(3)*f(iy+1, :, iz+1)    &
+             + coef_der(4)*f(iy+2, :, iz+1)   )&
+             
+             + coef_int(4)*(   coef_der(1)*f(iy-1, :, iz+2)    &
+             + coef_der(2)*f(iy, :, iz+2)      &
+             + coef_der(3)*f(iy+1, :, iz+2)    &
+             + coef_der(4)*f(iy+2, :, iz+2)   )
+
+     end do
+  end do
+
+
+end subroutine grady_v2n_fd4
+
+subroutine interp_v2n_fd4 ( f,  f_int_v2n )
+
+  use prec_const
+  implicit none
+  real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg),intent(in) :: f
+  real(dp), dimension(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_int_v2n
+  integer :: iy, iz
+
+  f_int_v2n(:,:,:) = nan_
+
+  do iy=iys,iye
+     do iz=izs,ize
+
+        f_int_v2n(iy,:,iz) = coef_int(1)*(coef_int(1)*f(iy-1, :, iz-1)   &
+             + coef_int(2)*f(iy-1, : , iz)     &
+             + coef_int(3)*f(iy-1, :, iz +1)  &
+             + coef_int(4)*f(iy-1, :, iz+2)  )&
+             
+             +  coef_int(2)*(   coef_int(1)*f(iy, :, iz-1)     &
+             + coef_int(2)*f(iy, :, iz)       &
+             + coef_int(3)*f(iy, :, iz+1)     &
+             + coef_int(4)*f(iy, :, iz+2)    )&
+             
+             +  coef_int(3)*(   coef_int(1)*f(iy+1, :, iz-1)    &
+             + coef_int(2)*f(iy+1, :, iz)      &
+             + coef_int(3)*f(iy+1, :, iz+1)    &
+             + coef_int(4)*f(iy+1, :, iz+2)   )&
+             
+             + coef_int(4)*(   coef_int(1)*f(iy+2, :, iz-1)    &
+             + coef_int(2)*f(iy+2, :, iz)      &
+             + coef_int(3)*f(iy+2, :, iz+1)    &
+             + coef_int(4)*f(iy+2, :, iz+2)   )
+
+     end do
+  end do
+
+
+end subroutine interp_v2n_fd4
+
+subroutine gradx_n2n_fd4(f,f_x)
+  use prec_const
+
+  IMPLICIT none
+
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), INTENT(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out)  :: f_x
+  integer :: ix
+  real(dp), DIMENSION(1:4) :: coef_der
+  coef_der(:)= deltaxi*coef_der1_n2n(:)                                                          
+
+  f_x (:,:,: ) =  nan_
+
+  do ix=ixs,ixe
+     f_x(:, ix, :) =   coef_der(1)*f(:, ix-2, :)  &
+          + coef_der(2)*f(:, ix-1, :)  &
+          + coef_der(3)*f(:, ix+1, :)    &
+          + coef_der(4)*f(:, ix+2, :)  
+
+  end do
+
+
+end subroutine gradx_n2n_fd4
+
+subroutine gradpar_v2n_fd4 ( f,  f_grad) 
+  use prec_const
+
+  IMPLICIT none
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(in) :: f      
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg), intent(out) :: f_grad
+  real(dp), DIMENSION(iysg:iyeg,ixsg:ixeg,izsg:izeg)      :: f_z, f_y, f_n, f_x_n  !f_x_v
+
+  integer :: ix, iy
+
+  f_grad(:,:,:) = nan_
+
+  call gradz_v2n_fd4(f, f_z)
+  call grady_v2n_fd4(f, f_y)   
+  call interp_v2n_fd4(f,  f_n ) 
+  call gradx_n2n_fd4(f_n,  f_x_n) 
+
+  do ix = ixs,ixe   
+     do iy = iys, iye
+        f_grad(iy,ix,izs:ize) = gradpar_z*f_z(iy,ix,izs:ize) + gradpar_y_n(iy,ix)*f_y(iy,ix,izs:ize) &
+             +                          gradpar_x_n(iy,ix)*f_x_n(iy,ix,izs:ize)
+     end do
+  end do
+
+end subroutine gradpar_v2n_fd4
+
+
+end module gradients
diff --git a/exercises/gradients/prec_const_mod.F90 b/exercises/gradients/prec_const_mod.F90
new file mode 100644
index 0000000..4cdf7f0
--- /dev/null
+++ b/exercises/gradients/prec_const_mod.F90
@@ -0,0 +1,50 @@
+MODULE prec_const
+  use, intrinsic :: iso_fortran_env, only: REAL32, REAL64, &
+                                           stdin=>input_unit, &
+                                           stdout=>output_unit, &
+                                           stderr=>error_unit
+  !
+  !   Precision for real and complex
+  !
+  INTEGER, PARAMETER :: sp = REAL32 !Single precision, should not be used
+  INTEGER, PARAMETER :: dp = REAL64 !real(dp), enforced through the code
+
+  INTEGER, private :: dp_r, dp_p !Range and Aprecision of doubles
+  INTEGER, private :: sp_r, sp_p !Range and precision of singles
+
+
+  INTEGER, private :: MPI_SP !Single precision for MPI
+  INTEGER, private :: MPI_DP !Double precision in MPI
+
+  REAL(dp),public :: nan_
+  !
+  !   Some useful constants
+  !
+  REAL(dp), PARAMETER :: PI=3.141592653589793238462643383279502884197_dp
+  REAL(dp), PARAMETER :: PIO2=1.57079632679489661923132169163975144209858_dp
+  REAL(dp), PARAMETER :: TWOPI=6.283185307179586476925286766559005768394_dp
+  REAL(dp), PARAMETER :: SQRT2=1.41421356237309504880168872420969807856967_dp
+  !
+  CONTAINS
+    SUBROUTINE INIT_PREC_CONST
+      IMPLICIT NONE
+      integer :: ierr,me
+      
+      REAL(sp) :: a = 1_sp
+      REAL(dp) :: b = 1_dp
+!      logical :: commute = .true.
+
+      !Get range and precision of ISO FORTRAN sizes
+      sp_r = range(a)
+      sp_p = precision(a)
+
+      dp_r = range(b)
+      dp_p = precision(b)
+
+
+
+    END SUBROUTINE INIT_PREC_CONST
+
+
+
+END MODULE prec_const
diff --git a/exercises/gradients/space_grid_mod.F90 b/exercises/gradients/space_grid_mod.F90
new file mode 100644
index 0000000..28e84c5
--- /dev/null
+++ b/exercises/gradients/space_grid_mod.F90
@@ -0,0 +1,118 @@
+MODULE space_grid
+  USE prec_const
+  IMPLICIT NONE
+  PRIVATE
+  !
+  !   Grid module for spatial discretization
+  !
+  !   GRID Namelist
+  !
+  INTEGER, PUBLIC, PROTECTED :: nx=1                ! Number of total internal grip points in x
+  INTEGER, PUBLIC, PROTECTED :: ny=1                ! Number of total internal grip points in y
+  INTEGER, PUBLIC, PROTECTED :: nz=1                ! Number of total internal grip points in z
+  REAL(dp), PUBLIC, PROTECTED :: xmin=0._dp     ! x coordinate for left boundary
+  REAL(dp), PUBLIC, PROTECTED :: xmax=10._dp    ! x coordinate for right boundary
+  REAL(dp), PUBLIC, PROTECTED :: ymin=0._dp     ! y coordinate for left boundary
+  REAL(dp), PUBLIC, PROTECTED :: ymax=10._dp    ! y coordinate for right boundary
+  REAL(dp), PUBLIC, PROTECTED :: zmin=0._dp     ! z coordinate for left boundary
+  REAL(dp), PUBLIC, PROTECTED :: zmax=1._dp    ! z coordinate for right boundary
+  REAL(dp), PUBLIC, PROTECTED :: x_periodic_end = -10000_dp !Field line poloidal periodicity
+  REAL(dp), PUBLIC, PROTECTED :: Lx = 10._dp
+  !
+  ! Logical switch to apply boundary conditions at the corners
+  ! with nl_corners_top -> .true. the corner bc's are evaluated
+  ! as vertical/poloidal bc's.
+  !
+  LOGICAL, PUBLIC, PROTECTED :: nl_corners_top = .FALSE.
+  !
+  !
+  !
+  ! Number of points in each grid
+  !
+  INTEGER, PUBLIC, PROTECTED :: nxp, nyp, nzp
+  INTEGER, PUBLIC :: izsp, izep, nzpp
+  !
+  ! Indices to start and end of local and global grids
+  ! s -> start, e-> end (local)
+  ! l -> left , r-> right (global)
+  !
+  INTEGER, PUBLIC :: ixs, ixe, ixl, ixr
+  INTEGER, PUBLIC :: iys, iye, iyl, iyr, iyr_ngrid
+  INTEGER, PUBLIC :: izs, ize, izl, izr
+  !
+  ! Number of ghost cells
+  !
+  INTEGER, PUBLIC, PROTECTED :: nzgc=1
+  INTEGER, PUBLIC, PROTECTED :: nygc=1
+  INTEGER, PUBLIC, PROTECTED :: nxgc=1
+  !
+  ! Start and end ghost cells in local and global grids
+  !
+  INTEGER, PUBLIC :: ixsg, ixeg, ixlg, ixrg
+  INTEGER, PUBLIC :: iysg, iyeg, iylg, iyrg
+  INTEGER, PUBLIC :: izsg, izeg, izlg, izrg
+
+  !Array indices in x such that each allgatherv results in the full radial length
+  INTEGER, PUBLIC, PROTECTED :: ix1, ix2
+  INTEGER, PUBLIC, PROTECTED :: ixl1,ixr2
+  INTEGER, PUBLIC :: nx2d, nx1d
+
+  !Array indices in x like ix1 ix2, but including all ghost cells
+  INTEGER, PUBLIC, PROTECTED :: ix1g, ix2g
+
+  !Array indices in y such that each allgatherv results in the full radial length
+  INTEGER, PUBLIC, PROTECTED :: iy1, iy2
+  INTEGER, PUBLIC, PROTECTED :: iyl1,iyr2
+  INTEGER, PUBLIC :: ny2d, ny1d
+
+  !Array indices in y like iy1 iy2, but including all ghost cells
+  INTEGER, PUBLIC, PROTECTED :: iy1g, iy2g
+
+  !Field line poloidal periodicity
+  INTEGER, PUBLIC, PROTECTED :: ix_min    !min index in the open (limited) field line region for each processor
+
+  !Step size in each direction and inverses for finite differences
+  !Radial direction
+  REAL(dp), PUBLIC, PROTECTED :: deltax
+  REAL(dp), PUBLIC  :: deltaxi
+  REAL(dp), PUBLIC, PROTECTED :: deltaxih
+  REAL(dp), PUBLIC, PROTECTED :: deltaxisq
+
+  !Poloidal/vertical direction
+  REAL(dp), PUBLIC, PROTECTED :: deltay
+  REAL(dp), PUBLIC :: deltayi
+  REAL(dp), PUBLIC, PROTECTED :: deltayih
+  REAL(dp), PUBLIC, PROTECTED :: deltayisq
+  real(dp), PUBLIC, PROTECTED :: shiftgr_v = 0.5_dp
+
+  !Toroidal direction
+  REAL(dp), PUBLIC, PROTECTED :: deltaz
+  real(dp), PUBLIC :: deltazi
+  real(dp), PUBLIC, PROTECTED :: deltazisq
+
+  !Parallel direction (should be about the same as deltaz)
+  real(dp), PUBLIC, PROTECTED :: deltaai
+  real(dp), PUBLIC, PROTECTED :: deltaaih
+  real(dp), PUBLIC, PROTECTED :: deltaaisq
+
+  !x,y,z grids
+  REAL(dp), DIMENSION(:), ALLOCATABLE, PUBLIC, PROTECTED :: xarray, yarray, zarray
+
+  !Poloidal angle theta, its sine and cosine... warning, sin and cosine are defined with theta=0 at high-field-side
+  real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: cos_tht
+  real(dp), DIMENSION(:), ALLOCATABLE, PUBLIC :: sin_tht
+
+  !Variables related to field-aligned parallel gradient operator
+  !Should generalize to have iyshift=1
+  INTEGER, PUBLIC, PROTECTED :: iyshift
+  INTEGER, PUBLIC, PROTECTED :: iyshift_d
+  INTEGER, PUBLIC, PROTECTED :: iyshift_u
+  INTEGER, PUBLIC, PROTECTED :: ny_top,ny_bot   !Upwinding in n2n parallel gradients
+  INTEGER, PUBLIC, PROTECTED :: ny_topu,ny_botu !Upwinding in n2v parallel gradient with arbitrary iyshift
+  INTEGER, PUBLIC, PROTECTED :: ny_topd,ny_botd !Upwinding in v2n parallel gradient with arbitrary iyshift
+  INTEGER, PUBLIC, PROTECTED :: ny_zg
+  INTEGER, PUBLIC :: kl,ku,neq
+
+
+
+END MODULE space_grid
diff --git a/exercises/gradients/test_gradients.F90 b/exercises/gradients/test_gradients.F90
new file mode 100644
index 0000000..7d86cdf
--- /dev/null
+++ b/exercises/gradients/test_gradients.F90
@@ -0,0 +1,108 @@
+subroutine initial_setup
+  use space_grid
+  use cuda_interface
+  use gradients
+  use iso_c_binding
+  implicit none
+  integer :: i,j,k
+  izsg=-1
+  ixsg=-1
+  iysg=-1
+  izeg=3
+  ixeg=1202
+  iyeg=2002
+  izs=1
+  ixs=1
+  iys=1
+  ize=1
+  ixe=1200
+  iye=2000
+
+  deltazi=0.03
+  deltayi=0.003
+  deltayi=0.005
+  call gbs_allocate_cuda(strmf_cpu,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(strmf_gpu,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(indata,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(gradpar_y_n,iysg,iyeg,ixsg,ixeg)
+  call gbs_allocate_cuda(gradpar_x_n,iysg,iyeg,ixsg,ixeg)
+  call gbs_allocate_cuda(gradpar_y_n_gpu,iysg,iyeg,ixsg,ixeg)
+  call gbs_allocate_cuda(gradpar_x_n_gpu,iysg,iyeg,ixsg,ixeg)
+ 
+  call gbs_allocate_cuda(f_z,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_y,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_x,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_n,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+  call gbs_allocate_cuda(f_x_n,iysg,iyeg,ixsg,ixeg,izsg,izeg)
+
+end subroutine initial_setup
+
+subroutine fill_arrays
+  use space_grid
+  use cuda_interface
+  use gradients
+  use iso_c_binding
+  implicit none
+  integer :: i,j,k
+  call initial_setup 
+ 
+  call intialize_coefficients
+  do k=izsg,izeg
+    do i=ixsg,ixeg
+      do j=iysg,iyeg
+         indata(j, i, k) = dble(i*123.0+j*5.1+k*45.12)
+      end do
+    end do
+  end do
+  nx_cuda = ixeg-ixsg+1
+  ny_cuda = iyeg-iysg+1
+  nz_cuda = izeg-izsg+1
+  do i=ixsg,ixeg
+    do j=iysg,iyeg
+       gradpar_x_n(j, i) = dble(i*123.0+j*5.1)
+       gradpar_y_n(j, i) = dble(i*12.0+j*15.1)
+    end do
+  end do
+  gradpar_x_n_gpu(:,:) = gradpar_x_n(:,:)
+  gradpar_y_n_gpu(:,:) = gradpar_y_n(:,:)
+
+  gradpar_z = 0.5
+
+end subroutine fill_arrays
+
+program main
+  use space_grid
+  use cuda_interface
+  use gradients
+  use iso_c_binding
+  implicit none
+
+  call initial_setup 
+ 
+  call intialize_coefficients
+
+  call fill_arrays
+
+  call gradz_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call gradz_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'gradz_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call gradz_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call gradz_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'gradz_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call grady_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call grady_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'grady_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call interp_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call interp_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'interp_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+  call gradx_n2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  call gradx_n2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:))
+  write(*,*) 'gradx_n2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+
+  CALL gradpar_v2n_fd4(indata(:,:,:),strmf_cpu(:,:,:))
+  CALL gradpar_v2n_fd4cuda(indata(:,:,:),strmf_gpu(:,:,:), gradpar_z)
+  write(*,*) 'gradpar_v2n_fd4 L2_norm(strmf_cpu-strmf_gpu):', norm2(strmf_cpu(iys:iye,ixs:ixe,izs:ize) - strmf_gpu(iys:iye,ixs:ixe,izs:ize))
+
+
+end program main
+