diff --git a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu
index 92809bb..187e9c1 100644
--- a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu
+++ b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu
@@ -1,353 +1,418 @@
 
 #include "gradient_GPU.cuh"
 
 #include <iostream>
 #include <string.h>
 #include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <immintrin.h>
 #include <map>
 /*
 #ifdef __AVX__
 #include "simd_math_avx.h"
 #endif
 #ifdef __AVX512F__
 #include "simd_math_avx512f.h"
 #endif
 */
 #include "structure_hpc.h"
 #include "utils.hpp"
 
 
 
 //
 // SOA versions, vectorizable
 //
-__device__ struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
+__device__ 
+inline struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
   //
   struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
   for(int i = shalos; i < shalos + nhalos; i++)
   {
     //
     struct point true_coord, true_coord_rotation;
     //
     true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
     //
     true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
     double R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - lens->ellipticity_potential[i])+true_coord_rotation.y*true_coord_rotation.y*(1 + lens->ellipticity_potential[i]));
     //
     grad.x += (1 - lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.x/R;
     grad.y += (1 + lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.y/R;
   }
   return grad;
 }
 
 //
 //
 //
-__device__ struct point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
+	__device__ 
+inline struct point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
-  //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
-  // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
-  //
-  struct point grad, clumpgrad;
-  grad.x = 0;
-  grad.y = 0;
-  //
-  for(int i = shalos; i < shalos + nhalos; i++)
-  {
-    //IACA_START;
-    //
-    struct point true_coord, true_coord_rot; //, result;
-    //double       R, angular_deviation;
-    complex      zis;
-    //
-    //result.x = result.y = 0.;
-    //
-    true_coord.x = pImage->x - lens->position_x[i];
-    true_coord.y = pImage->y - lens->position_y[i];
-    double cosi,sinu;
-    //cosi = cos(lens->ellipticity_angle[i]);
-    //sinu = sin(lens->ellipticity_angle[i]);
-    cosi = lens->anglecos[i];
-    sinu = lens->anglesin[i];
-    //positionning at the potential center
-    // Change the origin of the coordinate system to the center of the clump
-    //true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
-    true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi,sinu);
-    //
-    double x   = true_coord_rot.x;
-    double y   = true_coord_rot.y;
-    double eps = lens->ellipticity_potential[i];
-    double rc  = lens->rcore[i];
-    //
-    //std::cout << "piemd_lderivatives" << std::endl;
-    //
-    double sqe  = sqrt(eps);
-    //
-    double cx1  = (1. - eps)/(1. + eps);
-    double cxro = (1. + eps)*(1. + eps);
-    double cyro = (1. - eps)*(1. - eps);
-    //
-    double rem2 = x*x/cxro + y*y/cyro;
-    //
-    complex zci, znum, zden, zres;
-    double norm;
-    //
-    zci.re  = 0;
-    zci.im  = -0.5*(1. - eps*eps)/sqe;
-    //
-    znum.re = cx1*x;
-    znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
-    //
-    zden.re = x;
-    zden.im = 2.*rc*sqe - y;
-    norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
-    //
-    zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
-    zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
-    norm    = zis.re;
-    zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-    zis.im  = atan2(zis.im, norm);
-    //  norm = zis.re;
-    zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
-    zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
-    //
-    zis.re  = zres.re;
-    zis.im  = zres.im;
-    //
-    //zres.re = zis.re*b0;
-    //zres.im = zis.im*b0;
-    // rotation
-    clumpgrad.x = zis.re;
-    clumpgrad.y = zis.im;
-    //clumpgrad = rotateCoordinateSystem_GPU(true_coord, -lens->ellipticity_angle[i]);
-    clumpgrad = rotateCoordinateSystem_GPU_2(clumpgrad, cosi,-sinu);
-    //
-    clumpgrad.x = lens->b0[i]*clumpgrad.x;
-    clumpgrad.y = lens->b0[i]*clumpgrad.y;
-    //
-    //clumpgrad.x = lens->b0[i]*zis.re;
-    //clumpgrad.y = lens->b0[i]*zis.im;
-    //nan check
-    //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
-    //{
-      // add the gradients
-    grad.x += clumpgrad.x;
-    grad.y += clumpgrad.y;
-    //}
-  }
-  //IACA_END;
-  //
-  return(grad);
+	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
+	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
+	//
+	struct point grad, clumpgrad;
+	grad.x = 0;
+	grad.y = 0;
+	//
+	for(int i = shalos; i < shalos + nhalos; i++)
+	{
+		//IACA_START;
+		//
+		struct point true_coord, true_coord_rot; //, result;
+		//double       R, angular_deviation;
+		complex      zis;
+		//
+		//result.x = result.y = 0.;
+		//
+		true_coord.x = pImage->x - lens->position_x[i];
+		true_coord.y = pImage->y - lens->position_y[i];
+		//double cosi,sinu;
+		//cosi = cos(lens->ellipticity_angle[i]);
+		//sinu = sin(lens->ellipticity_angle[i]);
+		double cosi = lens->anglecos[i];
+		double sinu = lens->anglesin[i];
+		//positionning at the potential center
+		// Change the origin of the coordinate system to the center of the clump
+		//true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
+		//true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi, sinu);
+		true_coord_rot.x = true_coord.x*cosi + true_coord.y*sinu;
+		true_coord_rot.y = true_coord.y*cosi - true_coord.x*sinu;	
+		//
+		double x   = true_coord_rot.x;
+		double y   = true_coord_rot.y;
+		double eps = lens->ellipticity_potential[i];
+		double rc  = lens->rcore[i];
+		double b0  = lens->b0[i];
+		//
+		//std::cout << "piemd_lderivatives" << std::endl;
+		//
+		double sqe  = sqrt(eps);
+		//
+		double cx1  = (1. - eps)/(1. + eps);
+		double cxro = (1. + eps)*(1. + eps);
+		double cyro = (1. - eps)*(1. - eps);
+		//
+		double rem2 = x*x/cxro + y*y/cyro;
+		//
+		complex zci, znum, zden, zres;
+		double norm;
+		//
+		zci.re  = 0;
+		zci.im  = -0.5*(1. - eps*eps)/sqe;
+		//
+		znum.re = cx1*x;
+		znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
+		//
+		zden.re = x;
+		zden.im = 2.*rc*sqe - y;
+		norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
+		//
+		zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
+		zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
+		norm    = zis.re;
+		zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
+		zis.im  = atan2(zis.im, norm);
+		//  norm = zis.re;
+		zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
+		zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
+		//
+		zis.re  = zres.re;
+		zis.im  = zres.im;
+		//
+		grad.x += b0*(zres.re*cosi - zres.im*sinu);
+                grad.y += b0*(zres.im*cosi + zres.re*sinu);
+		//
+		//zres.re = zis.re*b0;
+		//zres.im = zis.im*b0;
+		// rotation
+		//clumpgrad.x = zis.re;
+		//clumpgrad.y = zis.im;
+		//clumpgrad = rotateCoordinateSystem_GPU(true_coord, -lens->ellipticity_angle[i]);
+		//clumpgrad = rotateCoordinateSystem_GPU_2(clumpgrad, cosi, -sinu);
+		//grad.x += b0*(zis.re*cosi - zis.im*sinu);
+		//grad.y += b0*(zis.im*cosi + zis.re*sinu);
+		//clumpgrad.x = true_coord.x*cosi + true_coord.y*sinu;
+		//clumpgrad.y = true_coord.y*cosi - true_coord.x*sinu;	
+		//
+		//clumpgrad.x = lens->b0[i]*clumpgrad.x;
+		//clumpgrad.y = lens->b0[i]*clumpgrad.y;
+		//
+		//clumpgrad.x = lens->b0[i]*zis.re;
+		//clumpgrad.y = lens->b0[i]*zis.im;
+		//nan check
+		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
+		//{
+		// add the gradients
+		//grad.x += clumpgrad.x;
+		//grad.y += clumpgrad.y;
+		//}
+	}
+	//IACA_END;
+	//
+	return(grad);
 }
 
 
-__device__ struct point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
+__device__ inline struct point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
-        //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
-  //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl;
-        // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
-        //
-        struct point grad, clumpgrad;
-        grad.x = 0;
-        grad.y = 0;
-        //
-        for(int i = shalos; i < shalos + nhalos; i++)
-        {
-                //IACA_START;
-                //
-                struct point true_coord, true_coord_rot; //, result;
-                //double       R, angular_deviation;
-                complex      zis;
-                //
-                //result.x = result.y = 0.;
-                //
-                true_coord.x = pImage->x - lens->position_x[i];
-                true_coord.y = pImage->y - lens->position_y[i];
-                //positionning at the potential center
-                // Change the origin of the coordinate system to the center of the clump
-                true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
-                //
-                double x    = true_coord_rot.x;
-                double y    = true_coord_rot.y;
-                double eps  = lens->ellipticity_potential[i];
-                double rc   = lens->rcore[i];
-                double rcut = lens->rcut[i];
-    double b0   = lens->b0[i];
-    double t05  = b0*rcut/(rcut - rc);
-    //printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05);
-                //
-                //std::cout << "piemd_lderivatives" << std::endl;
-                //
-                double sqe  = sqrt(eps);
-                //
-                double cx1  = (1. - eps)/(1. + eps);
-                double cxro = (1. + eps)*(1. + eps);
-                double cyro = (1. - eps)*(1. - eps);
-                //
-                double rem2 = x*x/cxro + y*y/cyro;
-                //
-                complex zci, znum, zden, zres_rc, zres_rcut;
-                double norm;
-                //
-                zci.re  = 0;
-                zci.im  = -0.5*(1. - eps*eps)/sqe;
-                //
-    // step 1
-    //
-    {
-      znum.re = cx1*x;
-      znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
-      //
-      zden.re = x;
-      zden.im = 2.*rc*sqe - y;
-      norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
-      //
-      zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
-      zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
-      norm    = zis.re;
-      zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-      zis.im  = atan2(zis.im, norm);
-      //  norm = zis.re;
-      zres_rc.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
-      zres_rc.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
-    }
-    //
-    // step 2
-    //
-    {
-                        znum.re = cx1*x;
-                        znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1;
-                        //
-                        zden.re = x;
-                        zden.im = 2.*rcut*sqe - y;
-                        norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
-                        //
-                        zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
-                        zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
-                        norm    = zis.re;
-                        zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-                        zis.im  = atan2(zis.im, norm);
-                        //  norm = zis.re;
-                        zres_rcut.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
-                        zres_rcut.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
-                }
-    zis.re  = t05*(zres_rc.re - zres_rcut.re);
-    zis.im  = t05*(zres_rc.im - zres_rcut.im);
-    //printf("%f %f\n", zis.re, zis.im);
-                //
-                //zres.re = zis.re*b0;
-                //zres.im = zis.im*b0;
-                // rotation
-                clumpgrad.x = zis.re;
-                clumpgrad.y = zis.im;
-                clumpgrad = rotateCoordinateSystem_GPU(clumpgrad, -lens->ellipticity_angle[i]);
-                //
-                //clumpgrad.x = lens->b0[i]*clumpgrad.x;
-                //clumpgrad.y = lens->b0[i]*clumpgrad.y;
-                //
-                //clumpgrad.x = lens->b0[i]*zis.re;
-                //clumpgrad.y = lens->b0[i]*zis.im;
-                //nan check
-                //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
-                //{
-                        // add the gradients
-                grad.x += clumpgrad.x;
-                grad.y += clumpgrad.y;
-    //printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y);
-                //}
-        }
-        //IACA_END;
-        //
-        return(grad);
+	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
+	//std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl;
+	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
+	//
+	struct point grad, clumpgrad;
+	grad.x = 0;
+	grad.y = 0;
+	//
+	for(int i = shalos; i < shalos + nhalos; i++)
+	{
+		//IACA_START;
+		//
+		struct point true_coord, true_coord_rot; //, result;
+		//double       R, angular_deviation;
+		complex      zis;
+		//
+		//result.x = result.y = 0.;
+		//
+		true_coord.x = pImage->x - lens->position_x[i];
+		true_coord.y = pImage->y - lens->position_y[i];
+		//positionning at the potential center
+		// Change the origin of the coordinate system to the center of the clump
+		true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
+		//
+		double x    = true_coord_rot.x;
+		double y    = true_coord_rot.y;
+		double eps  = lens->ellipticity_potential[i];
+		double rc   = lens->rcore[i];
+		double rcut = lens->rcut[i];
+		double b0   = lens->b0[i];
+		double t05  = b0*rcut/(rcut - rc);
+		//printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05);
+		//
+		//std::cout << "piemd_lderivatives" << std::endl;
+		//
+		double sqe  = sqrt(eps);
+		//
+		double cx1  = (1. - eps)/(1. + eps);
+		double cxro = (1. + eps)*(1. + eps);
+		double cyro = (1. - eps)*(1. - eps);
+		//
+		double rem2 = x*x/cxro + y*y/cyro;
+		//
+		complex zci, znum, zden, zres_rc, zres_rcut;
+		double norm;
+		//
+		zci.re  = 0;
+		zci.im  = -0.5*(1. - eps*eps)/sqe;
+		//
+		// step 1
+		//
+		{
+			znum.re = cx1*x;
+			znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
+			//
+			zden.re = x;
+			zden.im = 2.*rc*sqe - y;
+			norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
+			//
+			zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
+			zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
+			norm    = zis.re;
+			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
+			zis.im  = atan2(zis.im, norm);
+			//  norm = zis.re;
+			zres_rc.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
+			zres_rc.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
+		}
+		//
+		// step 2
+		//
+		{
+			znum.re = cx1*x;
+			znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1;
+			//
+			zden.re = x;
+			zden.im = 2.*rcut*sqe - y;
+			norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
+			//
+			zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
+			zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
+			norm    = zis.re;
+			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
+			zis.im  = atan2(zis.im, norm);
+			//  norm = zis.re;
+			zres_rcut.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
+			zres_rcut.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
+		}
+		zis.re  = t05*(zres_rc.re - zres_rcut.re);
+		zis.im  = t05*(zres_rc.im - zres_rcut.im);
+		//printf("%f %f\n", zis.re, zis.im);
+		//
+		//zres.re = zis.re*b0;
+		//zres.im = zis.im*b0;
+		// rotation
+		clumpgrad.x = zis.re;
+		clumpgrad.y = zis.im;
+		clumpgrad = rotateCoordinateSystem_GPU(clumpgrad, -lens->ellipticity_angle[i]);
+		//
+		//clumpgrad.x = lens->b0[i]*clumpgrad.x;
+		//clumpgrad.y = lens->b0[i]*clumpgrad.y;
+		//
+		//clumpgrad.x = lens->b0[i]*zis.re;
+		//clumpgrad.y = lens->b0[i]*zis.im;
+		//nan check
+		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
+		//{
+		// add the gradients
+		grad.x += clumpgrad.x;
+		grad.y += clumpgrad.y;
+		//printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y);
+		//}
+	}
+	//IACA_END;
+	//
+	return(grad);
 }
 //
 //
 //
 
 typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 
 __constant__ halo_func_GPU_t halo_func_GPU[100] =
 {
-0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU,  0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0,  module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU,  0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0,  module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 __device__ struct point module_potentialDerivatives_totalGradient_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int nhalos)
+{
+	struct point grad, clumpgrad;
+	//
+	grad.x = clumpgrad.x = 0;
+	grad.y = clumpgrad.y = 0;
+	//
+	int shalos = 0;
+	clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos);
+	grad.x += clumpgrad.x;
+	grad.y += clumpgrad.y;
+	return(grad);
+	//
+	//module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
+	//return;
+	/*
+	   int* p_type = &(lens->type)[0];
+	   int* lens_type = (int*) malloc(nhalos*sizeof(int));
+	   memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
+
+	//quicksort(lens_type, nhalos);
+	//*/
+
+	//printf ("%f \n",lens->position_x[shalos]);
+
+
+	while (shalos < nhalos)
+	{
+
+		int lens_type = lens->type[shalos];
+		int count     = 1;
+		while (lens->type[shalos + count] == lens_type) count++;
+		//std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
+		//printf ("%d %d %d \n",lens_type,count,shalos);
+		//
+		clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count);
+		//
+		grad.x += clumpgrad.x;
+		grad.y += clumpgrad.y;
+		shalos += count;
+	}
+
+	return(grad);
+}
+
+
+#if 0
+__device__ struct point module_potentialDerivatives_totalGradient_SOA_CPU_GPU(const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, double xmin, double ymin, int nbgridcells, int nhalos)
 {
         struct point grad, clumpgrad;
         //
         grad.x = clumpgrad.x = 0;
         grad.y = clumpgrad.y = 0;
-  //
-  int shalos = 0;
-  //
-  //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
-  //return;
-  /*
-  int* p_type = &(lens->type)[0];
-  int* lens_type = (int*) malloc(nhalos*sizeof(int));
-  memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
+        //
+        int shalos = 0;
+        //clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos);
+        //grad.x += clumpgrad.x;
+        //grad.y += clumpgrad.y;
+        //return(grad);
+        //
+        //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
+        //return;
+        /*
+           int* p_type = &(lens->type)[0];
+           int* lens_type = (int*) malloc(nhalos*sizeof(int));
+           memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
 
-  //quicksort(lens_type, nhalos);
-  //*/
+        //quicksort(lens_type, nhalos);
+        //*/
 
-  //printf ("%f \n",lens->position_x[shalos]);
+        //printf ("%f \n",lens->position_x[shalos]);
 
 
-  while (shalos < nhalos)
-  {
+        while (shalos < nhalos)
+        {
 
-    int lens_type = lens->type[shalos];
-    int count     = 1;
-    while (lens->type[shalos + count] == lens_type) count++;
-    //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
-    //printf ("%d %d %d \n",lens_type,count,shalos);
-    //
-    clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count);
-    //
-    grad.x += clumpgrad.x;
-    grad.y += clumpgrad.y;
-    shalos += count;
-  }
+                int lens_type = lens->type[shalos];
+                int count     = 1;
+                while (lens->type[shalos + count] == lens_type) count++;
+                //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
+                //printf ("%d %d %d \n",lens_type,count,shalos);
+                //
+                clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count);
+                //
+                grad.x += clumpgrad.x;
+                grad.y += clumpgrad.y;
+                shalos += count;
+        }
 
         return(grad);
 }
+#endif
+
 
 __device__ inline struct point rotateCoordinateSystem_GPU(struct point P, double theta)
 {
-  struct  point   Q;
+	struct  point   Q;
 
-  Q.x = P.x*cos(theta) + P.y*sin(theta);
-  Q.y = P.y*cos(theta) - P.x*sin(theta);
+	Q.x = P.x*cos(theta) + P.y*sin(theta);
+	Q.y = P.y*cos(theta) - P.x*sin(theta);
 
-  return(Q);
+	return(Q);
 }
 
 __device__ inline struct point rotateCoordinateSystem_GPU_2(struct point P, double cosi, double sinu)
 {
-  struct  point   Q;
+	struct  point   Q;
 
-  Q.x = P.x*cosi + P.y*sinu;
-  Q.y = P.y*cosi - P.x*sinu;
+	Q.x = P.x*cosi + P.y*sinu;
+	Q.y = P.y*cosi - P.x*sinu;
 
-  return(Q);
+	return(Q);
 }
diff --git a/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh b/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh
index 4cb3d20..65b9bdf 100644
--- a/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh
+++ b/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh
@@ -1,20 +1,25 @@
 /*
  * gradient_GPU.cuh
  *
  *  Created on: Feb 1, 2017
  *      Author: cerschae
  */
 
 #ifndef GRADIENT_GPU_CUH_
 #define GRADIENT_GPU_CUH_
 
 #include "cudafunctions.cuh"
 #include <fstream>
 #include <structure_hpc.h>
 
 __device__ struct point module_potentialDerivatives_totalGradient_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int nhalos);
 
 __device__ inline struct point rotateCoordinateSystem_GPU(struct point P, double theta);
 __device__ inline struct point rotateCoordinateSystem_GPU_2(struct point P, double cosi, double sinu);
+//
+//__device__
+//inline struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
+
+
 
 #endif /* GRADIENT_GPU_CUH_ */
diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu
index 4de3c88..6412cfd 100644
--- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu
+++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu
@@ -1,792 +1,841 @@
 #include <fstream>
 #include "grid_gradient_GPU.cuh"
 
+#define BLOCK_SIZE 16
+
+void
+module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
+
+void
+module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
+
 void calculate_cossin_values(double *theta_cos, double *theta_sin, double *angles, int nhalos ){
 	for(int i = 0 ; i < nhalos; i++){
 		theta_cos[i]=cos(angles[i]);
 		theta_sin[i]=sin(angles[i]);
 	}
 }
 
 void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos ,int nbgridcells){
 
 
   int nBlocks_gpu = 0;
   // Define the number of threads per block the GPU will use
   cudaDeviceProp properties_gpu;
 
   cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use
 
   if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
     {
     fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
     exit(-1);
     }
   else
     {
     nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;  // Get the maximum number of blocks with the chosen number of threads
                     // per Block that the GPU supports
     }
 
   grid_param *frame_gpu;
   Potential_SOA *lens_gpu,*lens_kernel ;
   int *type_gpu;
   double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
   double *grid_grad_x_gpu, *grid_grad_y_gpu ;
 
   lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
   lens_gpu->type = (int *) malloc(sizeof(int));
 
   // Allocate variables on the GPU
   cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
   cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
   cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
   cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
   cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
   cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
   cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
   cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
   cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
   cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
   cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
   cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
   cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
   cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
 
   // Copy values to the GPU
   cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " );
   cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
   cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
   cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
   cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
   cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
   cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
   cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
   cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " );
   cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " );
   cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
 
 
   //printf("%p \n", lens_gpu);
   //printf("%p \n", type_gpu);
   //printf("%p \n", lens_gpu->type);
   //fflush(stdout);
   lens_gpu->type = type_gpu;
   lens_gpu->position_x = lens_x_gpu;
   lens_gpu->position_y = lens_y_gpu;
   lens_gpu->b0 = b0_gpu;
   lens_gpu->ellipticity_angle = angle_gpu;
   lens_gpu->ellipticity_potential = epot_gpu;
   lens_gpu->rcore = rcore_gpu;
   lens_gpu->rcut = rcut_gpu;
   lens_gpu->anglecos = anglecos_gpu;
   lens_gpu->anglesin = anglesin_gpu;
 
   cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
 
 
   if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){
     gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
   }
   else{
     gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
   }
   cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " );
   cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " );
 
 
   //printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]);
 
     // Free GPU memory
   cudaFree(lens_gpu);
   cudaFree(type_gpu);
   cudaFree(lens_x_gpu);
   cudaFree(lens_y_gpu);
   cudaFree(b0_gpu);
   cudaFree(angle_gpu);
   cudaFree(epot_gpu);
   cudaFree(rcore_gpu);
   cudaFree(rcut_gpu);
   cudaFree(anglecos_gpu);
   cudaFree(anglesin_gpu);
   cudaFree(grid_grad_x_gpu);
   cudaFree(grid_grad_y_gpu);
 
 /*
   for (int i = 0; i < nbgridcells; i++){
     for(int j = 0; j < nbgridcells; j++){
       printf(" %f",grid_grad_x[i*nbgridcells + j]);
     }
     printf("\n");
   }*/
 
 }
 
 void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells){
   //get number of GPU devices
   int nDevices;
   cudaGetDeviceCount(&nDevices);
 
   // Initialise kernel variables, table for multiple devices
   grid_param *frame_gpu[nDevices];
   Potential_SOA *lens_gpu[nDevices],*lens_kernel[nDevices] ;
   int *type_gpu[nDevices];
   double *lens_x_gpu[nDevices], *lens_y_gpu[nDevices], *b0_gpu[nDevices], *angle_gpu[nDevices], *epot_gpu[nDevices], *rcore_gpu[nDevices], *rcut_gpu[nDevices],*anglecos_gpu[nDevices], *anglesin_gpu[nDevices];
   double *grid_grad_x_gpu[nDevices], *grid_grad_y_gpu[nDevices] ;
 
 
 
   // Initialise multiple device variables
   int Ndevice[nDevices], indexactual[nDevices];
   cudaStream_t stream[nDevices];
 
   indexactual[0] = 0 ;
   Ndevice[0] = (nbgridcells) * (nbgridcells)/nDevices;
 
   printf("Using %d Gpu's \n",nDevices );
   //printf("%d %d \n",indexactual[0], Ndevice[0]);
 
   for (int dev = 1; dev < nDevices; dev++) {
 
     Ndevice[dev] = (nbgridcells) * (nbgridcells)/nDevices;
 
     if(indexactual[dev]+Ndevice[dev] > (nbgridcells) * (nbgridcells)){
       Ndevice[dev] = (nbgridcells) * (nbgridcells) - indexactual[dev-1];
     }
 
     indexactual[dev] = indexactual[dev-1] + Ndevice[dev];
     //printf("%d %d \n",indexactual[dev], Ndevice[dev]);
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
 
     lens_gpu[dev] = (Potential_SOA *) malloc(sizeof(Potential_SOA));
     lens_gpu[dev]->type = (int *) malloc(sizeof(int));
 
     // Allocate variables on the GPU
     cudasafe(cudaMalloc( (void**)&(lens_kernel[dev]), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
     cudasafe(cudaMalloc( (void**)&(type_gpu[dev]), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
     cudasafe(cudaMalloc( (void**)&(lens_x_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
     cudasafe(cudaMalloc( (void**)&(lens_y_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
     cudasafe(cudaMalloc( (void**)&(b0_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
     cudasafe(cudaMalloc( (void**)&(angle_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
     cudasafe(cudaMalloc( (void**)&(epot_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
     cudasafe(cudaMalloc( (void**)&(rcore_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
     cudasafe(cudaMalloc( (void**)&(rcut_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
     cudasafe(cudaMalloc( (void**)&(frame_gpu[dev]), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
     cudasafe(cudaMalloc( (void**)&(anglecos_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
     cudasafe(cudaMalloc( (void**)&(anglesin_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
     cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
     cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
 
     cudaStreamCreate(&stream[dev]);
 
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
 
     // Copy values to the GPU
     cudasafe(cudaMemcpyAsync(type_gpu[dev],lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy type_gpu: " );
     cudasafe(cudaMemcpyAsync(lens_x_gpu[dev],lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy x_gpu: " );
     cudasafe(cudaMemcpyAsync(lens_y_gpu[dev],lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy y_gpu: " );
     cudasafe(cudaMemcpyAsync(b0_gpu[dev],lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy b0_gpu: " );
     cudasafe(cudaMemcpyAsync(angle_gpu[dev],lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy angle_gpu: " );
     cudasafe(cudaMemcpyAsync(epot_gpu[dev], lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy epot_gpu: " );
     cudasafe(cudaMemcpyAsync(rcore_gpu[dev], lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy rcore_gpu: " );
     cudasafe(cudaMemcpyAsync(rcut_gpu[dev], lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy rcut_gpu: " );
     cudasafe(cudaMemcpyAsync(anglecos_gpu[dev], lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy anglecos: " );
     cudasafe(cudaMemcpyAsync(anglesin_gpu[dev], lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy anglesin: " );
     cudasafe(cudaMemcpyAsync(frame_gpu[dev], frame, sizeof(grid_param), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy fame_gpu: " );
 
 
     lens_gpu[dev]->type = type_gpu[dev];
     lens_gpu[dev]->position_x = lens_x_gpu[dev];
     lens_gpu[dev]->position_y = lens_y_gpu[dev];
     lens_gpu[dev]->b0 = b0_gpu[dev];
     lens_gpu[dev]->ellipticity_angle = angle_gpu[dev];
     lens_gpu[dev]->ellipticity_potential = epot_gpu[dev];
     lens_gpu[dev]->rcore = rcore_gpu[dev];
     lens_gpu[dev]->rcut = rcut_gpu[dev];
     lens_gpu[dev]->anglecos = anglecos_gpu[dev];
     lens_gpu[dev]->anglesin = anglesin_gpu[dev];
 
     cudaMemcpyAsync(lens_kernel[dev], lens_gpu[dev], sizeof(Potential_SOA), cudaMemcpyHostToDevice,stream[dev]);
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
     int nBlocks_gpu = 0;
     cudaDeviceProp properties_gpu;
     cudaGetDeviceProperties(&properties_gpu, dev); // Get properties of 0th GPU in use
 
     if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
       {
       fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
       exit(-1);
       }
 
 
     if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){
     	gradient_grid_kernel_multiple<<<1,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]);
     }
     else{
     	gradient_grid_kernel_multiple<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]);
     }
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
     cudasafe(cudaMemcpyAsync( grid_grad_x + indexactual[dev], grid_grad_x_gpu[dev], Ndevice[dev] *sizeof(double),cudaMemcpyDeviceToHost ,stream[dev]),"Gradientgpu.cu : Copy source_x_gpu: " );
     cudasafe(cudaMemcpyAsync( grid_grad_y + indexactual[dev], grid_grad_y_gpu[dev], Ndevice[dev] *sizeof(double), cudaMemcpyDeviceToHost,stream[dev]),"Gradientgpu.cu : Copy source_y_gpu: " );
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
       // Free GPU memory
     cudaFree(type_gpu[dev]);
     cudaFree(lens_x_gpu[dev]);
     cudaFree(lens_y_gpu[dev]);
     cudaFree(b0_gpu[dev]);
     cudaFree(angle_gpu[dev]);
     cudaFree(epot_gpu[dev]);
     cudaFree(rcore_gpu[dev]);
     cudaFree(rcut_gpu[dev]);
     cudaFree(anglecos_gpu[dev]);
     cudaFree(anglesin_gpu[dev]);
     cudaFree(grid_grad_x_gpu[dev]);
     cudaFree(grid_grad_y_gpu[dev]);
     cudaStreamDestroy(stream[dev]);
 
   }
 
 }
 
 #if 1
 void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells, int indexactual, int Ncells ){
 
 
 	// GPU Property query
 	int nBlocks_gpu = 0;
+	// Define the number of threads per block the GPU will use
 	cudaDeviceProp properties_gpu;
+
 	cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use
+
 	if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
 	{
-	fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
-	exit(-1);
+		fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
+		exit(-1);
 	}
 	else
 	{
-	nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;  // Get the maximum number of blocks with the chosen number of threads
-					// per Block that the GPU supports
+		nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;  // Get the maximum number of blocks with the chosen number of threads
+		// per Block that the GPU supports
 	}
 
-	//Number of subdivision and Dimension variables for subdivision
-	//int nSubd = 2;
-
-	//cudaStream_t stream[nDevices];
-
-	//Kernel Variables
 	grid_param *frame_gpu;
 	Potential_SOA *lens_gpu,*lens_kernel ;
 	int *type_gpu;
 	double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
 	double *grid_grad_x_gpu, *grid_grad_y_gpu ;
 
-
 	lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
 	lens_gpu->type = (int *) malloc(sizeof(int));
 
 	// Allocate variables on the GPU
 	cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
 	cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
-	cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), Ncells *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
-	cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), Ncells *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
-
+	cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
+	cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
 
 	// Copy values to the GPU
-	cudasafe(cudaMemcpyAsync(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " );
-	cudasafe(cudaMemcpyAsync(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
-	cudasafe(cudaMemcpyAsync(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
-	cudasafe(cudaMemcpyAsync(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
-	cudasafe(cudaMemcpyAsync(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
-	cudasafe(cudaMemcpyAsync(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
-	cudasafe(cudaMemcpyAsync(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
-	cudasafe(cudaMemcpyAsync(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
-	cudasafe(cudaMemcpyAsync(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " );
-	cudasafe(cudaMemcpyAsync(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " );
-	cudasafe(cudaMemcpyAsync(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
-
-
+	cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " );
+	cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
+	cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
+	cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
+	cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
+	cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
+	cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
+	cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
+	cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " );
+	cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " );
+	cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
+
+
+	//printf("%p \n", lens_gpu);
+	//printf("%p \n", type_gpu);
+	//printf("%p \n", lens_gpu->type);
+	//fflush(stdout);
 	lens_gpu->type = type_gpu;
 	lens_gpu->position_x = lens_x_gpu;
 	lens_gpu->position_y = lens_y_gpu;
 	lens_gpu->b0 = b0_gpu;
 	lens_gpu->ellipticity_angle = angle_gpu;
 	lens_gpu->ellipticity_potential = epot_gpu;
 	lens_gpu->rcore = rcore_gpu;
 	lens_gpu->rcut = rcut_gpu;
 	lens_gpu->anglecos = anglecos_gpu;
 	lens_gpu->anglesin = anglesin_gpu;
 
 	cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
+#if 0
+	int BLOCK_SIZE = 16; // number of threads
+	int GRID_SIZE = (nbgridcells + BLOCK_SIZE - 1)/BLOCK_SIZE; // number of blocks
+	//
+	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
+	dim3 dimGrid(GRID_SIZE, GRID_SIZE);
+	//
+	if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0)
+	{
+		gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
+	}
+	else
+	{
+		//gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
+		//gradient_grid_kernel_v2<<<dimGrid, dimBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel);
+		//gradient_grid_kernel_v2<<<dimGrid, dimBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel);
 
 
 	if (int((Ncells)/threadsPerBlock) == 0){
 	gradient_grid_kernel_multiple<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells);
 	}
 	else{
 	gradient_grid_kernel_multiple<<<(Ncells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells);
 	}
-	cudasafe(cudaMemcpy( grid_grad_x + indexactual, grid_grad_x_gpu, Ncells *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " );
-	cudasafe(cudaMemcpy( grid_grad_y + indexactual, grid_grad_y_gpu, Ncells *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_y_gpu: " );
+#endif
+		module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens, lens_kernel, nbgridcells, nhalos);
+	cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " );
+	cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " );
 
 
 	//printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]);
 
 	// Free GPU memory
 	cudaFree(lens_gpu);
 	cudaFree(type_gpu);
 	cudaFree(lens_x_gpu);
 	cudaFree(lens_y_gpu);
 	cudaFree(b0_gpu);
 	cudaFree(angle_gpu);
 	cudaFree(epot_gpu);
 	cudaFree(rcore_gpu);
 	cudaFree(rcut_gpu);
 	cudaFree(anglecos_gpu);
 	cudaFree(anglesin_gpu);
 	cudaFree(grid_grad_x_gpu);
 	cudaFree(grid_grad_y_gpu);
 
+	/*
+	   for (int i = 0; i < nbgridcells; i++){
+	   for(int j = 0; j < nbgridcells; j++){
+	   printf(" %f",grid_grad_x[i*nbgridcells + j]);
+	   }
+	   printf("\n");
+	   }*/
 
 }
 
-#endif
 
 __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) {
 
-  //*grad_x = *grad_y = 0.;
+	//*grad_x = *grad_y = 0.;
 
-  int bid=blockIdx.x; // index of the block (and of the set of images)
-  int tid=threadIdx.x; // index of the thread within the block
+	int bid=blockIdx.x; // index of the block (and of the set of images)
+	int tid=threadIdx.x; // index of the thread within the block
 
-  double dx,dy;        //pixelsize
-  int grid_dim, index;
-  struct point image_point, Grad;
-  dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
-  dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
-  grid_dim = (nbgridcells);
+	double dx,dy;        //pixelsize
+	int grid_dim, index;
+	struct point image_point, Grad;
+	dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
+	dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
+	grid_dim = (nbgridcells);
 
-  index = bid * threadsPerBlock + tid ;
-
-  while(index < grid_dim*grid_dim){
-
-    grid_grad_x[index] = 0.;
-    grid_grad_y[index] = 0.;
+	index = bid * threadsPerBlock + tid ;
 
-    image_point.x = frame->xmin + (index/grid_dim)*dx;
-    image_point.y = frame->ymin + (index % grid_dim)*dy;
+	while(index < grid_dim*grid_dim){
 
-    Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens);
+		grid_grad_x[index] = 0.;
+		grid_grad_y[index] = 0.;
 
-    grid_grad_x[index] = Grad.x;
-    grid_grad_y[index] = Grad.y;
+		image_point.x = frame->xmin + (index/grid_dim)*dx;
+		image_point.y = frame->ymin + (index % grid_dim)*dy;
 
-    bid += gridDim.x;
-    index = bid * threadsPerBlock + tid;
-  }
+		Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens);
 
+		grid_grad_x[index] = Grad.x;
+		grid_grad_y[index] = Grad.y;
 
+		bid += gridDim.x;
+		index = bid * threadsPerBlock + tid;
+	}
 }
 
-__global__ void gradient_grid_kernel_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int indexactual, int ncells) {
+__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) {
 
-  //*grad_x = *grad_y = 0.;
+	//*grad_x = *grad_y = 0.;
 
-  int bid=blockIdx.x; // index of the block (and of the set of images)
-  int tid=threadIdx.x; // index of the thread within the block
+	int bid = blockIdx.x; // index of the block (and of the set of images)
+	int tid = threadIdx.x; // index of the thread within the block
 
-  double dx,dy;        //pixelsize
-  int grid_dim, index;
-  struct point image_point, Grad;
-  dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
-  dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
-  grid_dim = (nbgridcells);
-
-  index = bid * threadsPerBlock + tid ;
-  int major_index = indexactual + index ;
-
-  while(index < ncells){
-
-    grid_grad_x[index] = 0.;
-    grid_grad_y[index] = 0.;
+	double dx,dy;        //pixelsize
+	int grid_dim, index;
+	struct point image_point, Grad;
+	//
+	dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
+	dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
+	//
+	grid_dim = (nbgridcells);
+	//
+	int row = blockIdx.y * blockDim.y + threadIdx.y;
+	int col = blockIdx.x * blockDim.x + threadIdx.x;
+	//
+	index = col*nbgridcells + row ;
+	//
+	//while(index < grid_dim*grid_dim){
 
-    image_point.x = frame->xmin + (major_index/grid_dim)*dx;
-    image_point.y = frame->ymin + (major_index % grid_dim)*dy;
+	//grid_grad_x[index] = 0.;
+	//grid_grad_y[index] = 0.;
 
-    Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens);
+	image_point.x = frame->xmin + col*dx;
+	image_point.y = frame->ymin + row*dy;
 
-    grid_grad_x[index] = Grad.x;
-    grid_grad_y[index] = Grad.y;
+	Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens);
 
-    bid += gridDim.x;
-    index = bid * threadsPerBlock + tid;
-    major_index = indexactual + index ;
-  }
+	grid_grad_x[index] = Grad.x;
+	grid_grad_y[index] = Grad.y;
 
+	bid += gridDim.x;
+	index = bid * threadsPerBlock + tid;
+	//}
 }
 
-
-__global__ void gradient_grid_sis_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) {
-
-  //*grad_x = *grad_y = 0.;
-
-  int bid=blockIdx.x; // index of the block (and of the set of images)
-  int tid=threadIdx.x; // index of the thread within the block
+/*
+void gradient_grid_general_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens){
+  int bid=0; // index of the block (and of the set of images)
+  int tid=0; // index of the thread within the block
 
   double dx,dy,x_pos,y_pos;        //pixelsize
   int grid_dim, index;
-  struct point true_coord, true_coord_rotation;
+  point Grad, image_point, true_coord_rotation;
   double      R;
   dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
   dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
   grid_dim = (nbgridcells);
 
-  index = bid * threadsPerBlock + tid ;
-
-
+  index = bid ;
 
   while(index < grid_dim*grid_dim){
 
     grid_grad_x[index] = 0.;
     grid_grad_y[index] = 0.;
 
-    x_pos= frame->xmin + (index/grid_dim)*dx;
-    y_pos= frame->ymin + (index % grid_dim)*dy;
-    //printf("%f %f \n", x_pos, y_pos);
-
-    for(int iterator=0; iterator < Nlens; iterator++){
-
-      true_coord.x = x_pos - lens_x[iterator];
-      true_coord.y = y_pos - lens_y[iterator];
-
-      // Change the origin of the coordinate system to the center of the clump
-      true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
-
-      R=sqrt(true_coord_rotation.x*true_coord_rotation.x*(1-lens_epot[iterator])+true_coord_rotation.y*true_coord_rotation.y*(1+lens_epot[iterator]));  //ellippot = ellipmass/3
-      grid_grad_x[index] += (1-lens_epot[iterator])*lens_b0[iterator]*true_coord_rotation.x/(R);
-      grid_grad_y[index] += (1+lens_epot[iterator])*lens_b0[iterator]*true_coord_rotation.y/(R);
-
-
-
-    }
-
+    image_point.x = frame->xmin + (index/grid_dim)*dx;
+    image_point.y = frame->ymin + (index % grid_dim)*dy;
 
+    Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens);
 
+    grid_grad_x[index] = Grad.x;
+    grid_grad_y[index] = Grad.y;
 
-    bid += gridDim.x;
-    index = bid * threadsPerBlock + tid;
+    bid += 1;
+    index = bid * 1 + tid;
   }
-
-
 }
 
-__global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) {
-
-  //*grad_x = *grad_y = 0.;
-
-  int bid=blockIdx.x; // index of the block (and of the set of images)
-  int tid=threadIdx.x; // index of the thread within the block
-
-  double dx,dy,x_pos,y_pos;        //pixelsize
-  int grid_dim, index;
-  struct point true_coord, true_coord_rotation;
-  complex      zis;
-  dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
-  dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
-  grid_dim = (nbgridcells);
-
-  index = bid * threadsPerBlock + tid ;
-
-
-
-  while(index < grid_dim*grid_dim){
-
-    //grid_grad_x[index] = 0.;
-    //grid_grad_y[index] = 0.;
-
-    x_pos= frame->xmin + (index/grid_dim)*dx;
-    y_pos= frame->ymin + (index % grid_dim)*dy;
-    //printf("%f %f \n", x_pos, y_pos);
-
-    for(int iterator=0; iterator < Nlens; iterator++){
-
-      true_coord.x = x_pos - lens_x[iterator];
-      true_coord.y = y_pos - lens_y[iterator];
-
-      // Change the origin of the coordinate system to the center of the clump
-      true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
-
-      double x   = true_coord_rotation.x;
-      double y   = true_coord_rotation.y;
-      double eps = lens_epot[iterator];
-      double rc  = lens_rcore[iterator];
-
-      double sqe  = sqrt(eps);
-      //
-      double cx1  = (1. - eps)/(1. + eps);
-      double cxro = (1. + eps)*(1. + eps);
-      double cyro = (1. - eps)*(1. - eps);
-      //
-      double rem2 = x*x/cxro + y*y/cyro;
-
-      complex zci, znum, zden, zres;
-      double norm;
-      //
-      zci.re  = 0;
-      zci.im  = -0.5*(1. - eps*eps)/sqe;
-      //
-      znum.re = cx1*x;
-      znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
-      //
-      zden.re = x;
-      zden.im = 2.*rc*sqe - y;
-      norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
-      //
-      zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
-      zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
-      norm    = zis.re;
-      zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-      zis.im  = atan2(zis.im, norm);
-      //  norm = zis.re;
-      zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
-      zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
-      //
-      zis.re  = zres.re;
-      zis.im  = zres.im;
-
-      grid_grad_x[index] += lens_b0[iterator]*zis.re;
-      grid_grad_y[index] += lens_b0[iterator]*zis.im;
-
-
-
-    }
-
-
-
-
-    bid += gridDim.x;
-    index = bid * threadsPerBlock + tid;
-  }
-
+*/
 
+__global__
+void 
+inline
+module_potentialDerivatives_totalGradient_8_SOA_GPU(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
+{
+        //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
+        // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
+        //
+        struct point grad, clumpgrad, image_point;
+        grad.x = 0;
+        grad.y = 0;
+	//
+	int row = blockIdx.y * blockDim.y + threadIdx.y;
+        int col = blockIdx.x * blockDim.x + threadIdx.x;
+	//
+	if ((row < nbgridcells) && (col < nbgridcells))
+	{
+		//
+		int index = col*nbgridcells + row;
+		//
+		//grid_grad_x[index] = 0.;
+		//grid_grad_y[index] = 0.;
+		//
+		double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
+		double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
+		//
+#if 0
+		__shared__ double img_pt[2];
+		if ((row == 0) && (col == 0))
+		{
+			img_pt[0] = frame->xmin + col*dx;
+			img_pt[1] = frame->ymin + row*dy;
+		}
+		__syncthreads();
+#else
+		image_point.x = frame->xmin + col*dx;
+		image_point.y = frame->ymin + row*dy;	
+#endif
+		//
+		//
+		for(int i = shalos; i < shalos + nhalos; i++)
+		{
+			//IACA_START;
+			//
+			struct point true_coord, true_coord_rot; //, result;
+			//double       R, angular_deviation;
+			complex      zis;
+			//
+			//result.x = result.y = 0.;
+			//
+#if 0
+			true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]);
+			true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]);
+#else
+			true_coord.x = image_point.x - __ldg(&lens->position_x[i]);
+			true_coord.y = image_point.y - __ldg(&lens->position_y[i]);
+#endif
+			double cosi = __ldg(&lens->anglecos[i]);
+			double sinu = __ldg(&lens->anglesin[i]);
+			// positionning at the potential center
+			// Change the origin of the coordinate system to the center of the clump
+			double x = true_coord.x*cosi + true_coord.y*sinu;
+			double y = true_coord.y*cosi - true_coord.x*sinu;
+			//
+			double eps = __ldg(&lens->ellipticity_potential[i]);
+			//
+			double sqe  = sqrt(eps);
+			//
+			double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
+			//
+			complex zci;
+			complex znum, zden, zres;
+			double norm;
+			//
+			zci.im  = -0.5*(1. - eps*eps)/sqe;
+			//
+			double rc  = __ldg(&lens->rcore[i]);
+			double cx1  = (1. - eps)/(1. + eps);
+			znum.re = cx1*x;
+			znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
+			//
+			zden.re = x;
+			zden.im = 2.*rc*sqe - y;
+			norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
+			//
+			zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
+			zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
+			//
+			norm    = zis.re;
+			//
+			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
+			zis.im  = atan2(zis.im, norm);
+			//
+			zres.re = zci.im*zis.im;   // Re( zci*ln(zis) )
+			zres.im = zci.im*zis.re;   // Im( zci*ln(zis) )
+			//
+			double b0  = __ldg(&lens->b0[i]);
+			grad.x += b0*(zres.re*cosi - zres.im*sinu);
+			grad.y += b0*(zres.im*cosi + zres.re*sinu);
+		}
+		//IACA_END;
+		//
+		grid_grad_x[index] += grad.x;
+		grid_grad_y[index] += grad.y;
+	}
 }
 
-__global__ void gradient_grid_piemd_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, int Ndevice, int indexactual, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) {
-
-  //*grad_x = *grad_y = 0.;
-
-  int bid=blockIdx.x; // index of the block (and of the set of images)
-  int tid=threadIdx.x; // index of the thread within the block
-
-  double dx,dy,x_pos,y_pos;        //pixelsize
-  int grid_dim, index;
-  struct point true_coord, true_coord_rotation;
-  complex      zis;
-  dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
-  dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
-  grid_dim = (nbgridcells);
-
-  index = bid * threadsPerBlock + tid ;
-
-
-
-  while(index < Ndevice){
-    //printf("%d \n", index);
-
-    grid_grad_x[index] = 0.;
-    grid_grad_y[index] = 0.;
-
-    x_pos= frame->xmin + ((indexactual +index)/grid_dim)*dx;
-    y_pos= frame->ymin + ((indexactual +index) % grid_dim)*dy;
-    //printf("%f %f \n", x_pos, y_pos);
-
-    for(int iterator=0; iterator < Nlens; iterator++){
-
-      true_coord.x = x_pos - lens_x[iterator];
-      true_coord.y = y_pos - lens_y[iterator];
-
-      // Change the origin of the coordinate system to the center of the clump
-      true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
-
-      double x   = true_coord_rotation.x;
-      double y   = true_coord_rotation.y;
-      double eps = lens_epot[iterator];
-      double rc  = lens_rcore[iterator];
-
-      double sqe  = sqrt(eps);
-      //
-      double cx1  = (1. - eps)/(1. + eps);
-      double cxro = (1. + eps)*(1. + eps);
-      double cyro = (1. - eps)*(1. - eps);
-      //
-      double rem2 = x*x/cxro + y*y/cyro;
-
-      complex zci, znum, zden, zres;
-      double norm;
-      //
-      zci.re  = 0;
-      zci.im  = -0.5*(1. - eps*eps)/sqe;
-      //
-      znum.re = cx1*x;
-      znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
-      //
-      zden.re = x;
-      zden.im = 2.*rc*sqe - y;
-      norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
-      //
-      zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
-      zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
-      norm    = zis.re;
-      zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-      zis.im  = atan2(zis.im, norm);
-      //  norm = zis.re;
-      zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
-      zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
-      //
-      zis.re  = zres.re;
-      zis.im  = zres.im;
-
-      grid_grad_x[index] += lens_b0[iterator]*zis.re;
-      grid_grad_y[index] += lens_b0[iterator]*zis.im;
-
-
-
-    }
-
-
-
-
-    bid += gridDim.x;
-
-    index = bid * threadsPerBlock + tid;
-  }
-
-
+__global__
+void
+module_potentialDerivatives_totalGradient_8_SOA_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame,
+ int nbgridcells, int i, int nhalos)
+{
+        //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
+        // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
+        //
+        struct point grad, clumpgrad, image_point;
+        grad.x = 0;
+        grad.y = 0;
+        //
+        int row = blockIdx.y * blockDim.y + threadIdx.y;
+        int col = blockIdx.x * blockDim.x + threadIdx.x;
+        //
+        if ((row < nbgridcells) && (col < nbgridcells))
+        {
+                //
+                int index = col*nbgridcells + row;
+                //
+                //grid_grad_x[index] = 0.;
+                //grid_grad_y[index] = 0.;
+                //
+                double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
+                double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
+                //
+#if 0
+                __shared__ double img_pt[2];
+                if ((row == 0) && (col == 0))
+                {
+                        img_pt[0] = frame->xmin + col*dx;
+                        img_pt[1] = frame->ymin + row*dy;
+                }
+                __syncthreads();
+#else
+                image_point.x = frame->xmin + col*dx;
+                image_point.y = frame->ymin + row*dy;
+#endif
+                //
+                //
+                //for(int i = shalos; i < shalos + nhalos; i++)
+                //{
+                        //IACA_START;
+                        //
+                        struct point true_coord, true_coord_rot; //, result;
+                        //double       R, angular_deviation;
+                        complex      zis;
+                        //
+                        //result.x = result.y = 0.;
+                        //
+#if 0
+                        true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]);
+                        true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]);
+#else
+                        true_coord.x = image_point.x - __ldg(&lens->position_x[i]);
+                        true_coord.y = image_point.y - __ldg(&lens->position_y[i]);
+#endif
+                        double cosi = __ldg(&lens->anglecos[i]);
+                        double sinu = __ldg(&lens->anglesin[i]);
+                        // positionning at the potential center
+                        // Change the origin of the coordinate system to the center of the clump
+                        double x = true_coord.x*cosi + true_coord.y*sinu;
+                        double y = true_coord.y*cosi - true_coord.x*sinu;
+                        //
+                        double eps = __ldg(&lens->ellipticity_potential[i]);
+                        //
+                        double sqe  = sqrt(eps);
+                        //
+                        double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
+                        //
+                        complex zci;
+                        complex znum, zden, zres;
+                        double norm;
+                        //
+                        zci.im  = -0.5*(1. - eps*eps)/sqe;
+                        //
+                        double rc  = __ldg(&lens->rcore[i]);
+                        double cx1  = (1. - eps)/(1. + eps);
+                        znum.re = cx1*x;
+                        znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
+                        //
+                        zden.re = x;
+                        zden.im = 2.*rc*sqe - y;
+                        norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
+                        //
+                        zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
+                        zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
+                        //
+                        //
+                        double b0  = __ldg(&lens->b0[i]);
+                        grad.x += b0*(zres.re*cosi - zres.im*sinu);
+                        grad.y += b0*(zres.im*cosi + zres.re*sinu);
+                //}
+                //IACA_END;
+                //
+                grid_grad_x[index] += grad.x;
+                grid_grad_y[index] += grad.y;
+        }
 }
 
 
 
-__global__ void piemd_GPU(double *grad_x, double *grad_y, point *pImage, int Nlens, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) {
-
-  //*grad_x = *grad_y = 0.;
-
-  int bid=blockIdx.x; // index of the block (and of the set of images)
-  int tid=threadIdx.x; // index of the thread within the block
-
-  struct point true_coord, true_coord_rotation;
-  complex      zis;
-  //gridDim.x , threadsPerBlock;
-
-  int iterator = bid * threadsPerBlock + tid ;
-  //for(int iterator = 0; iterator < Nlens; ++iterator){
-  while(iterator < Nlens){
-    //printf(" %d %d %d %d \n",iterator , bid , threadsPerBlock , tid );
-    true_coord.x = pImage->x - lens_x[iterator];
-    true_coord.y = pImage->y - lens_y[iterator];
-
-    // Change the origin of the coordinate system to the center of the clump
-    true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
-
-    double x   = true_coord_rotation.x;
-    double y   = true_coord_rotation.y;
-    double eps = lens_epot[iterator];
-    double rc  = lens_rcore[iterator];
-
-    double sqe  = sqrt(eps);
-    //
-    double cx1  = (1. - eps)/(1. + eps);
-    double cxro = (1. + eps)*(1. + eps);
-    double cyro = (1. - eps)*(1. - eps);
-    //
-    double rem2 = x*x/cxro + y*y/cyro;
-
-    complex zci, znum, zden, zres;
-    double norm;
-    //
-    zci.re  = 0;
-    zci.im  = -0.5*(1. - eps*eps)/sqe;
-    //
-    znum.re = cx1*x;
-    znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
-    //
-    zden.re = x;
-    zden.im = 2.*rc*sqe - y;
-    norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
-    //
-    zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
-    zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
-    norm    = zis.re;
-    zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-    zis.im  = atan2(zis.im, norm);
-    //  norm = zis.re;
-    zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
-    zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
-    //
-    zis.re  = zres.re;
-    zis.im  = zres.im;
-    //
-    //zres.re = zis.re*b0;
-    //zres.im = zis.im*b0;
-    //
-    //
-
-    //grad->x += lens_b0[iterator]*zis.re;
-    //grad->y += lens_b0[iterator]*zis.im;
-    atomicAdd_double(grad_x,lens_b0[iterator]*zis.re);
-    atomicAdd_double(grad_y,lens_b0[iterator]*zis.im);
-
-    //printf("%f %f \n ", lens_b0[iterator]*zis.re, lens_b0[iterator]*zis.im);
-    //printf("%f %f \n ", *grad_x, *grad_y);
-
-    bid += gridDim.x;
-    iterator = bid * threadsPerBlock + tid ;
 
-  }
 
+/*
+   typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
+
+   __constant__ halo_func_GPU_t halo_func_GPU[100] =
+   {
+   0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU,  0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+   0,  module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+   };
+ */
+
+	void
+module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos)
+{
+	struct point grad, clumpgrad;
+	//
+	grad.x = clumpgrad.x = 0;
+	grad.y = clumpgrad.y = 0;
+	int shalos = 0;
+	int GRID_SIZE = (nbgridcells + BLOCK_SIZE - 1)/BLOCK_SIZE; // number of blocks
+	//
+	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
+	dim3 dimGrid(GRID_SIZE, GRID_SIZE);	
+	int count = nhalos;	
+	cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double));
+	cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double));
+	module_potentialDerivatives_totalGradient_8_SOA_GPU<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
+	//grid.x += clumpgrad.x;
+	//grad.y += clumpgrad.y;
+
+	//
+	//	
+	/*
+	   while (shalos < nhalos)
+	   {
+
+	   int lens_type = lens_cpu->type[shalos];
+	   int count     = 1;
+	   while (lens_cpu->type[shalos + count] == lens_type) count++;
+	//std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
+	//printf ("%d %d %d \n",lens_type,count,shalos);
+	//
+	clumpgrad = (*halo_func_GPU[lens_type]<<<dimGrid, dimBlock>>> )(lens_gpu, shalos, count);
+	//
+	grad.x += clumpgrad.x;
+	grad.y += clumpgrad.y;
+	shalos += count;
+	}
 
+	return(grad);
+	 */
 }
 
-__device__ static double atomicAdd_double(double* address, double val)
+
+        void
+module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos)
 {
-    unsigned long long int* address_as_ull =
-                                          (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-                        __double_as_longlong(val +
-                        __longlong_as_double(assumed)));
-    } while (assumed != old);
-    return __longlong_as_double(old);
+        struct point grad, clumpgrad;
+        //
+        grad.x = clumpgrad.x = 0;
+        grad.y = clumpgrad.y = 0;
+        int shalos = 0;
+        int GRID_SIZE = (nbgridcells + BLOCK_SIZE - 1)/BLOCK_SIZE; // number of blocks
+        //
+        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
+        dim3 dimGrid(GRID_SIZE, GRID_SIZE);
+        int count = nhalos;
+	//
+	cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double));
+	cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double));
+	//
+	for (int ii = 0; ii < nhalos; ++ii)
+		module_potentialDerivatives_totalGradient_8_SOA_GPU<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, ii, 1);
+	
+        //grid.x += clumpgrad.x;
+        //grad.y += clumpgrad.y;
+
+        //
+        //
+        /*
+           while (shalos < nhalos)
+           {
+
+           int lens_type = lens_cpu->type[shalos];
+           int count     = 1;
+           while (lens_cpu->type[shalos + count] == lens_type) count++;
+        //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
+        //printf ("%d %d %d \n",lens_type,count,shalos);
+        //
+        clumpgrad = (*halo_func_GPU[lens_type]<<<dimGrid, dimBlock>>> )(lens_gpu, shalos, count);
+        //
+        grad.x += clumpgrad.x;
+        grad.y += clumpgrad.y;
+        shalos += count;
+        }
+
+        return(grad);
+         */
 }
 
diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh
index 5153dff..575df16 100644
--- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh
+++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh
@@ -1,29 +1,30 @@
 /*
  * gradientgpu.cuh
  *
  *  Created on: Nov 29, 2016
  *      Author: cerschae
  */
 
 #ifndef GRID_GRADIENT_GPU_CUH_
 #define GRID_GRADIENT_GPU_CUH_
 
 #include "cudafunctions.cuh"
 #include "gradient_GPU.cuh"
 #include <structure_hpc.h>
 
 void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens,int nbgridcells);
 void gradient_grid_pinned(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell);
 void gradient_grid_pinned_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell);
 void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos,int nbgridcells, int indexactual, int Ncells );
 void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells);
 
 __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens);
+__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens);
 __global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut);
 __global__ void gradient_grid_sis_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut);
 __global__ void gradient_grid_piemd_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, int Ndevice, int indexactual, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut);
 __global__ void gradient_grid_kernel_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int indexactual, int ncells);
 
 __device__ static double atomicAdd_double(double* address, double val);
 
 #endif /* GRADIENTGPU_CUH_ */
diff --git a/Benchmarks/GridGradientBenchmark/main.cpp b/Benchmarks/GridGradientBenchmark/main.cpp
index 629299c..f09f058 100644
--- a/Benchmarks/GridGradientBenchmark/main.cpp
+++ b/Benchmarks/GridGradientBenchmark/main.cpp
@@ -1,314 +1,314 @@
 /**
 * @file   main.cpp
 * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch)
 * @date   October 2016
 * @brief  Benchmark for gradhalo function
 */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <sys/stat.h>
 //
 #include <mm_malloc.h>
 //
 #include <cuda_runtime.h>
 #include <structure_hpc.h>
 #include "timer.h"
 #include "gradient.hpp"
 #include "chi_CPU.hpp"
 #include "module_cosmodistances.h"
 #include "module_readParameters.hpp"
 #include "grid_gradient_GPU.cuh"
 //#include<omp.h>
 
 int module_readCheckInput_readInput(int argc, char *argv[])
 {
 /// check if there is a correct number of arguments, and store the name of the input file in infile
 
 char* infile;
 struct stat  file_stat;
 
 // If we do not have 3 arguments, stop
 if ( argc != 3 )
 {
 	fprintf(stderr, "\nUnexpected number of arguments\n");
 	fprintf(stderr, "\nUSAGE:\n");
 	fprintf(stderr, "lenstool  input_file  output_directorypath  [-n]\n\n");
 	exit(-1);
 }
 else if ( argc == 3 )
 	infile=argv[1];
 std::ifstream ifile(infile,std::ifstream::in); // Open the file
 
 
 int ts = (int) time (NULL);
 char buffer[10];
 std::stringstream ss;
 ss << ts;
 std::string trimstamp = ss.str();
 //
 std::string outdir = argv[2];
 outdir += "-";
 outdir += trimstamp;
 std::cout << outdir << std::endl;
 
 // check whether the output directory already exists
 if (stat(outdir.c_str(), &file_stat) < 0){
 	mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH );
 }
 else {
 	printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]);
 	exit(-1);
 }
 
 // check whether the input file exists. If it could not be opened (ifile = 0), it does not exist
 if(ifile){
 ifile.close();
 }
 else{
 printf("The file %s does not exist, please specify a valid file name\n",infile);
 exit(-1);
 	}
 
 return 0;
 }
 
 int main(int argc, char *argv[])
 {
 
 
 // Setting Up the problem
 //===========================================================================================================
 
 // This module function reads the terminal input when calling LENSTOOL and checks that it is correct
 // Otherwise it exits LENSTOOL
 module_readCheckInput_readInput(argc, argv);
 
 // This module function reads the cosmology parameters from the parameter file
 // Input: struct cosmologicalparameters cosmology, parameter file
 // Output: Initialized cosmology struct
 cosmo_param cosmology;  // Cosmology struct to store the cosmology data from the file
 std::string inputFile = argv[1];   // Input file
 module_readParameters_readCosmology(inputFile, cosmology);
 
 // This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file.
 // The runmode_param stores the information of what exactly the user wants to do with lenstool.
 struct runmode_param runmode;
 module_readParameters_readRunmode(inputFile, &runmode);
 
 module_readParameters_debug_cosmology(runmode.debug, cosmology);
 module_readParameters_debug_runmode(runmode.debug, runmode);
 
 
 //=== Declaring variables
 struct grid_param frame;
 struct galaxy images[runmode.nimagestot];
 struct galaxy sources[runmode.nsets];
 struct Potential lenses[runmode.nhalos+runmode.npotfile-1];
 struct Potential_SOA lenses_SOA_table[NTYPES];
 struct Potential_SOA lenses_SOA;
 struct cline_param cline;
 struct potfile_param potfile;
 struct Potential potfilepotentials[runmode.npotfile];
 struct potentialoptimization host_potentialoptimization[runmode.nhalos];
 int nImagesSet[runmode.nsets]; // Contains the number of images in each set of images
 
 // This module function reads in the potential form and its parameters (e.g. NFW)
 // Input: input file
 // Output: Potentials and its parameters
 
 
 module_readParameters_Potential(inputFile, lenses, runmode.nhalos);
 //Converts to SOA
 //module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.Nlens);
 module_readParameters_PotentialSOA(inputFile, lenses, &lenses_SOA, runmode.nhalos);
 //module_readParameters_PotentialSOA_nonsorted(inputFile, lenses, &lenses_SOA_nonsorted, runmode.nhalos);
 module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos);
 //std::cerr << lenses_SOA[1].b0[0] << " " << lenses[0].b0  << std::endl;
 // This module function reads in the potfiles parameters
 // Input: input file
 // Output: Potentials from potfiles and its parameters
 
 if (runmode.potfile == 1 ){
 	module_readParameters_readpotfiles_param(inputFile, &potfile);
 	module_readParameters_debug_potfileparam(runmode.debug, &potfile);
 	module_readParameters_readpotfiles(&runmode,&potfile,lenses);
 	module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos+runmode.npotfile);
 
 }
 
 
 
 // This module function reads in the grid form and its parameters
 // Input: input file
 // Output: grid and its parameters
 
 
 module_readParameters_Grid(inputFile, &frame);
 
 
 
 
 if (runmode.image == 1 or runmode.inverse == 1 or runmode.time > 0){
 
 	// This module function reads in the strong lensing images
 	module_readParameters_readImages(&runmode, images, nImagesSet);
 	//runmode.nsets = runmode.nimagestot;
 	for(int i = 0; i < runmode.nimagestot; ++i){
 
 		images[i].dls = module_cosmodistances_objectObject(lenses[0].z, images[i].redshift, cosmology);
 		images[i].dos = module_cosmodistances_observerObject(images[i].redshift, cosmology);
 		images[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, images[i].redshift, cosmology);
 
 	}
 	module_readParameters_debug_image(runmode.debug, images, nImagesSet,runmode.nsets);
 
 }
 
 if (runmode.inverse == 1){
 
 	// This module function reads in the potential optimisation limits
 	module_readParameters_limit(inputFile,host_potentialoptimization,runmode.nhalos);
 	module_readParameters_debug_limit(runmode.debug, host_potentialoptimization[0]);
 }
 
 
 if (runmode.source == 1){
 	//Initialisation to default values.(Setting sources to z = 1.5 default value)
 	for(int i = 0; i < runmode.nsets; ++i){
 		sources[i].redshift = 1.5;
 	}
 	// This module function reads in the strong lensing sources
 	module_readParameters_readSources(&runmode, sources);
 	//Calculating cosmoratios
 	for(int i = 0; i < runmode.nsets; ++i){
 
 		sources[i].dls = module_cosmodistances_objectObject(lenses[0].z, sources[i].redshift, cosmology);
 		sources[i].dos = module_cosmodistances_observerObject(sources[i].redshift, cosmology);
 		sources[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, sources[i].redshift, cosmology);
 	}
 	module_readParameters_debug_source(runmode.debug, sources, runmode.nsets);
 
 }
 
 std::cout << "--------------------------" << std::endl << std::endl;
 
 double t_1(0),t_2(0),t_3(0),t_4(0);
 
 // Lenstool-CPU Grid-Gradient
 //===========================================================================================================
 
 //Setting Test:
 double dx, dy;
 
 dx = (frame.xmax - frame.xmin)/(runmode.nbgridcells-1);
 dy = (frame.ymax - frame.ymin)/(runmode.nbgridcells-1);
 
 point test_point1_1,test_point2_2, test_result1_1,test_result2_2,test_pointN_N, test_resultN_N;
 double dlsds = images[0].dr;
 
 test_point1_1.x = frame.xmin;
 test_point1_1.y = frame.ymin;
 test_point2_2.x = frame.xmin+dx;
 test_point2_2.y = frame.ymin+dy;
 test_pointN_N.x = frame.xmin + ((runmode.nbgridcells*runmode.nbgridcells-1)/runmode.nbgridcells)*dx;
 test_pointN_N.y = frame.ymin + ((runmode.nbgridcells*runmode.nbgridcells-1) % runmode.nbgridcells)*dy;
 
 
 test_result1_1 = module_potentialDerivatives_totalGradient_SOA(&test_point1_1, &lenses_SOA, runmode.nhalos);
 test_result2_2 = module_potentialDerivatives_totalGradient_SOA(&test_point2_2, &lenses_SOA, runmode.nhalos);
 test_resultN_N = module_potentialDerivatives_totalGradient_SOA(&test_pointN_N, &lenses_SOA, runmode.nhalos);
 
 std::cout << " Initial Test " << std::endl;
 std::cout << " Test 1: " << std::endl;
-std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
-std::cout << " Gradient  " << std::setprecision(5) << test_result1_1.x << " "<< test_result1_1.y <<  std::endl;
+std::cout << " Point 1 : " << std::setprecision(15) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
+std::cout << " Gradient  " << std::setprecision(15) << test_result1_1.x << " "<< test_result1_1.y <<  std::endl;
 std::cout << " Test 2: " << std::endl;
-std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
-std::cout << " Gradient  " << std::setprecision(5) << test_result2_2.x << " "<< test_result2_2.y <<  std::endl;
+std::cout << " Point 2 : " << std::setprecision(15) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
+std::cout << " Gradient  " << std::setprecision(15) << test_result2_2.x << " "<< test_result2_2.y <<  std::endl;
 std::cout << " Test 3: " << std::endl;
-std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y <<  std::endl;
-std::cout << " Gradient  " << std::setprecision(5) << test_resultN_N.x << " "<< test_resultN_N.y <<  std::endl;
+std::cout << " Point 3 : " << std::setprecision(15) << test_pointN_N.x << " "<< test_pointN_N.y <<  std::endl;
+std::cout << " Gradient  " << std::setprecision(15) << test_resultN_N.x << " "<< test_resultN_N.y <<  std::endl;
 
 
 double *grid_gradient_x, *grid_gradient_y;
 
 grid_gradient_x = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
 grid_gradient_y = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
 int grid_dim = runmode.nbgridcells;
 
 
 #if 0
 t_1 = -myseconds();
 //Packaging the image to sourceplane conversion
 gradient_grid_CPU(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim);
 t_1 += myseconds();
 
 std::cout << " gradient_grid_CPU Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
 std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
 std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_1 << std::endl;
 #endif
 
 #if 1
 t_2 = -myseconds();
-gradient_grid_GPU_sorted(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim);
+gradient_grid_GPU_sorted(grid_gradient_x,grid_gradient_y, &frame, &lenses_SOA, runmode.nhalos, grid_dim);
 t_2 += myseconds();
 
 std::cout << " gradient_grid_GPU_sorted Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
-std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
-std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
+std::cout << " Point 1 : " << std::setprecision(15) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
+std::cout << " Gradient  " << std::setprecision(15) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
-std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
-std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
+std::cout << " Point 2 : " << std::setprecision(15) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
+std::cout << " Gradient  " << std::setprecision(15) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_2 << std::endl;
 #endif
 
 #if 1
 t_3 = -myseconds();
 gradient_grid_GPU_multiple(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim);
 t_3 += myseconds();
 
 std::cout << " gradient_grid_GPU_multiple Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
 std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
 std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Test 3: " << std::endl;
 std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_3 << std::endl;
 #endif
 
 #if 1
 t_4 = -myseconds();
 gradient_grid_GPU_sub(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim,0,grid_dim*grid_dim);
 t_4 += myseconds();
 
 std::cout << " gradient_grid_GPU_sub Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
 std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
 std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Test 3: " << std::endl;
 std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_4 << std::endl;
 #endif
 
 
 
 }