diff --git a/Benchmarks/GridGradientBenchmark/Makefile b/Benchmarks/GridGradientBenchmark/Makefile
index c21ed04..61051ca 100644
--- a/Benchmarks/GridGradientBenchmark/Makefile
+++ b/Benchmarks/GridGradientBenchmark/Makefile
@@ -1,81 +1,81 @@
 PROGRAM_NAME := ChiBenchmark
 #CXX=g++ -lm -ffast-math -ftree-loop-vectorize 
 CXX=icpc
 
 
 program_CXX_SRCS := $(wildcard *.cpp)
 program_CXX_OBJS := ${program_CXX_SRCS:.cpp=.o}
 
 #program_CXX_SRCS += $(wildcard ../../*.c) #Find C source files from additonal directories
 program_C_OBJS := ${program_C_SRCS:.c=.o}
 #
 program_CU_SRCS := $(wildcard *.cu)
 #program_CU_SRCS += $(wildcard ../../*.cu) #Find CUDA source files from additional directories
 #program_CU_HEADERS := $(wildcard *.cuh) #Optional: Include .cuh files as dependencies
 #program_CU_HEADERS += $(wildcard ../../*.cuh) #Find .cuh files from additional directories
 program_CU_OBJS := ${program_CU_SRCS:.cu=.cuo}
 #
 program_INCLUDE_DIRS := . /usr/local/cuda/include/ #C++ Include directories
 program_INCLUDE_DIRS += $(CFITSIO_ROOT)/include
 #program_INCLUDE_DIRS += /users/fgilles/bin/iaca-lin32/include
 program_INCLUDE_DIRS += $(LENSTOOL_ROOT)/include
 program_INCLUDE_DIRS += $(GSL_ROOT)
 program_INCLUDE_DIRS += $(LENSTOOLHPC_ROOT)/src
 #
 #program_CU_INCLUDE_DIRS := /home/users/amclaugh/CUB/cub-1.3.2/ #CUDA Include directories
 #
 #program_INCLUDE_LIBS := ./ #Include libraries
 program_INCLUDE_LIBS += $(CFITSIO_ROOT)/lib #Include libraries
 program_INCLUDE_LIBS += $(LENSTOOL_ROOT)/src
 program_INCLUDE_LIBS += $(LENSTOOL_ROOT)/liblt
 program_INCLUDE_LIBS += $(LENSTOOLHPC_ROOT)/src/
 program_INCLUDE_LIBS += $(GSL_ROOT)/lib
 program_INCLUDE_LIBS += $(WCSTOOL_ROOT)
 #
 #
 # Compiler flags
 CPPFLAGS += $(foreach includedir,$(program_INCLUDE_DIRS),-I$(includedir))
 CPPFLAGS += $(foreach includelib,$(program_INCLUDE_LIBS),-L$(includelib))
 CPPFLAGS += -D__WITH_LENSTOOL
 CPPFLAGS += -qopenmp -xHost -g -O3 -std=c++0x -Wall -pedantic
 #CPPFLAGS += -qopenmp -march=core-avx2 -g -O3 -std=c++0x -Wall -pedantic
 #CPPFLAGS += -llenstoolhpc -qopenmp -xHost -g -O3 -std=c++0x -Wall -pedantic
 #CPPFLAGS += -llenstoolhpc -qopenmp -axMIC-AVX512,CORE-AVX2 -g -O3 -std=c++0x -Wall -pedantic
 #CPPFLAGS += -qopt-prefetch-distance=64,8 -qopt-streaming-cache-evict=0 -llenstoolhpc -qopenmp -xMIC-AVX512 -g -O3 -std=c++0x -Wall -pedantic
 LDFLAGS := -llenstoolhpc -lcfitsio -llenstool -llt -lgsl -lgslcblas -lm -lwcs
 
 NVFLAGS := -O3 -g -G -arch=sm_35 -rdc=true -ccbin icpc -Xcompiler -qopenmp -D__WITH_LENSTOOL #rdc=true needed for separable compilation
 #NVFLAGS += --maxrregcount=40 
 NVFLAGS += $(foreach includedir,$(program_INCLUDE_DIRS),-I$(includedir))
 NVFLAGS += $(foreach includelib,$(program_INCLUDE_LIBS),-L$(includelib)) 
 
 CUO_O_OBJECTS := ${program_CU_OBJS:.cuo=.cuo.o}
 
 
 OBJECTS = $(program_CXX_OBJS) $(program_C_OBJS) $(program_CU_OBJS)
 
 .PHONY: all clean distclean
 
 all: $(PROGRAM_NAME) 
 #
 debug: CXXFLAGS = -g -O0 -std=c++0x -Wall -pedantic -DDEBUG $(EXTRA_FLAGS)
 debug: $(PROGRAM_NAME)
 %.cuo: %.cu %.cuh
-	nvcc $(NVFLAGS) --ptxas-options=-v -o $@ -dc $<
-#nvcc $(NVFLAGS)  -Xptxas -v,-abi=no -ptx -src-in-ptx -dc $<
+	nvcc $(NVFLAGS) -Xptxas -dlcm=cg --ptxas-options=-v -o $@ -dc $<
+	#nvcc $(NVFLAGS)  -maxrregcount=100 -Xptxas -dlcm=cg --ptxas-options=-v -o $@ -dc $<
 
 $(PROGRAM_NAME): $(OBJECTS) 
 	@ for cu_obj in $(program_CU_OBJS); \
 	do        \
 		mv $$cu_obj $$cu_obj.o; \
 	done #append a .o suffix for nvcc
 	nvcc $(NVFLAGS) -o $@ $(program_CXX_OBJS) $(program_C_OBJS) $(CUO_O_OBJECTS) $(LDFLAGS)
 	@ for cu_obj in $(CUO_O_OBJECTS);  \
 	do        \
 		mv $$cu_obj $${cu_obj%.*} ;  \
 	done #remove the .o for make
 
 clean:
 	@- $(RM) $(PROGRAM_NAME) $(OBJECTS) *~ *.o *.optrpt
 
 distclean: clean
diff --git a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu
index b115234..52ce9c6 100644
--- a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu
+++ b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu
@@ -1,516 +1,515 @@
 
 #include "gradient_GPU.cuh"
 
 #include <iostream>
 #include <string.h>
 #include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <immintrin.h>
 #include <map>
 /*
 #ifdef __AVX__
 #include "simd_math_avx.h"
 #endif
 #ifdef __AVX512F__
 #include "simd_math_avx512f.h"
 #endif
 */
 #include "structure_hpc.h"
 #include "utils.hpp"
 
 
 
 //
 // SOA versions, vectorizable
 //
 __device__ 
 inline struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
   //
   struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
   for(int i = shalos; i < shalos + nhalos; i++)
   {
     //
     struct point true_coord, true_coord_rotation;
     //
     true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
     //
     true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
     double R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - lens->ellipticity_potential[i])+true_coord_rotation.y*true_coord_rotation.y*(1 + lens->ellipticity_potential[i]));
     //
     grad.x += (1 - lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.x/R;
     grad.y += (1 + lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.y/R;
   }
   return grad;
 }
-
 //
 //
 //
 	__device__ 
 inline struct point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
 	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	//
 	struct point grad, clumpgrad;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//double       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//double cosi,sinu;
 		//cosi = cos(lens->ellipticity_angle[i]);
 		//sinu = sin(lens->ellipticity_angle[i]);
 		double cosi = lens->anglecos[i];
 		double sinu = lens->anglesin[i];
 		//positionning at the potential center
 		// Change the origin of the coordinate system to the center of the clump
 		//true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
 		//true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi, sinu);
 		true_coord_rot.x = true_coord.x*cosi + true_coord.y*sinu;
 		true_coord_rot.y = true_coord.y*cosi - true_coord.x*sinu;	
 		//
 		double x   = true_coord_rot.x;
 		double y   = true_coord_rot.y;
 		double eps = lens->ellipticity_potential[i];
 		double rc  = lens->rcore[i];
 		double b0  = lens->b0[i];
 		//
 		//std::cout << "piemd_lderivatives" << std::endl;
 		//
 		double sqe  = sqrt(eps);
 		//
 		double cx1  = (1. - eps)/(1. + eps);
 		double cxro = (1. + eps)*(1. + eps);
 		double cyro = (1. - eps)*(1. - eps);
 		//
 		double rem2 = x*x/cxro + y*y/cyro;
 		//
 		complex zci, znum, zden, zres;
 		double norm;
 		//
 		zci.re  = 0;
 		zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//
 		znum.re = cx1*x;
 		znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
 		//
 		zden.re = x;
 		zden.im = 2.*rc*sqe - y;
 		norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 		//
 		zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 		zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 		norm    = zis.re;
 		zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 		zis.im  = atan2(zis.im, norm);
 		//  norm = zis.re;
 		zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
 		zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 		//
 		zis.re  = zres.re;
 		zis.im  = zres.im;
 		//
 		grad.x += b0*(zres.re*cosi - zres.im*sinu);
                 grad.y += b0*(zres.im*cosi + zres.re*sinu);
 		
 		//
 		//zres.re = zis.re*b0;
 		//zres.im = zis.im*b0;
 		// rotation
 		//clumpgrad.x = zis.re;
 		//clumpgrad.y = zis.im;
 		//clumpgrad = rotateCoordinateSystem_GPU(true_coord, -lens->ellipticity_angle[i]);
 		//clumpgrad = rotateCoordinateSystem_GPU_2(clumpgrad, cosi, -sinu);
 		//grad.x += b0*(zis.re*cosi - zis.im*sinu);
 		//grad.y += b0*(zis.im*cosi + zis.re*sinu);
 		//clumpgrad.x = true_coord.x*cosi + true_coord.y*sinu;
 		//clumpgrad.y = true_coord.y*cosi - true_coord.x*sinu;	
 		//
 		//clumpgrad.x = lens->b0[i]*clumpgrad.x;
 		//clumpgrad.y = lens->b0[i]*clumpgrad.y;
 		//
 		//clumpgrad.x = lens->b0[i]*zis.re;
 		//clumpgrad.y = lens->b0[i]*zis.im;
 		//nan check
 		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
 		//{
 		// add the gradients
 		//grad.x += clumpgrad.x;
 		//grad.y += clumpgrad.y;
 		//}
 	}
 	//IACA_END;
 	//
 	return(grad);
 }
 //
 //
 //
 __device__
 inline struct point module_potentialDerivatives_totalGradient_8_SOA_GPU_SM(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
         //
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
 		
 		//
                 struct point true_coord, true_coord_rot; //, result;
                 //double       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //double cosi,sinu;
                 //cosi = cos(lens->ellipticity_angle[i]);
                 //sinu = sin(lens->ellipticity_angle[i]);
                 double cosi = lens->anglecos[i];
                 double sinu = lens->anglesin[i];
                 //positionning at the potential center
                 // Change the origin of the coordinate system to the center of the clump
                 //true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
                 //true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi, sinu);
                 true_coord_rot.x = true_coord.x*cosi + true_coord.y*sinu;
                 true_coord_rot.y = true_coord.y*cosi - true_coord.x*sinu;
                 //
                 double x   = true_coord_rot.x;
                 double y   = true_coord_rot.y;
                 double eps = lens->ellipticity_potential[i];
                 double rc  = lens->rcore[i];
                 double b0  = lens->b0[i];
                 //
                 //std::cout << "piemd_lderivatives" << std::endl;
                 //
                 double sqe  = sqrt(eps);
                 //
                 double cx1  = (1. - eps)/(1. + eps);
                 double cxro = (1. + eps)*(1. + eps);
                 double cyro = (1. - eps)*(1. - eps);
                 //
                 double rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres;
                 double norm;
                 //
                 zci.re  = 0;
                 zci.im  = -0.5*(1. - eps*eps)/sqe;
                 //
                 znum.re = cx1*x;
                 znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
                 //
                 zden.re = x;
                 zden.im = 2.*rc*sqe - y;
                 norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
                 //
                 zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
                 zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
                 norm    = zis.re;
                 zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
                 zis.im  = atan2(zis.im, norm);
                 //  norm = zis.re;
                 zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
                 zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
                 //
                 zis.re  = zres.re;
                 zis.im  = zres.im;
                 //
                 grad.x += b0*(zres.re*cosi - zres.im*sinu);
                 grad.y += b0*(zres.im*cosi + zres.re*sinu);
                 //
                 //zres.re = zis.re*b0;
                 //zres.im = zis.im*b0;
                 // rotation
                 //clumpgrad.x = zis.re;
                 //clumpgrad.y = zis.im;
         }
         //IACA_END;
         //
         return(grad);
 }
 
 
 
 
 
 
 __device__ inline struct point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
 	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	//std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl;
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	//
 	struct point grad, clumpgrad;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//double       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//positionning at the potential center
 		// Change the origin of the coordinate system to the center of the clump
 		true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]);
 		//
 		double x    = true_coord_rot.x;
 		double y    = true_coord_rot.y;
 		double eps  = lens->ellipticity_potential[i];
 		double rc   = lens->rcore[i];
 		double rcut = lens->rcut[i];
 		double b0   = lens->b0[i];
 		double t05  = b0*rcut/(rcut - rc);
 		//printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05);
 		//
 		//std::cout << "piemd_lderivatives" << std::endl;
 		//
 		double sqe  = sqrt(eps);
 		//
 		double cx1  = (1. - eps)/(1. + eps);
 		double cxro = (1. + eps)*(1. + eps);
 		double cyro = (1. - eps)*(1. - eps);
 		//
 		double rem2 = x*x/cxro + y*y/cyro;
 		//
 		complex zci, znum, zden, zres_rc, zres_rcut;
 		double norm;
 		//
 		zci.re  = 0;
 		zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//
 		// step 1
 		//
 		{
 			znum.re = cx1*x;
 			znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
 			//
 			zden.re = x;
 			zden.im = 2.*rc*sqe - y;
 			norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 			//
 			zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 			zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 			norm    = zis.re;
 			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 			zis.im  = atan2(zis.im, norm);
 			//  norm = zis.re;
 			zres_rc.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
 			zres_rc.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 		}
 		//
 		// step 2
 		//
 		{
 			znum.re = cx1*x;
 			znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1;
 			//
 			zden.re = x;
 			zden.im = 2.*rcut*sqe - y;
 			norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 			//
 			zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 			zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 			norm    = zis.re;
 			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 			zis.im  = atan2(zis.im, norm);
 			//  norm = zis.re;
 			zres_rcut.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
 			zres_rcut.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 		}
 		zis.re  = t05*(zres_rc.re - zres_rcut.re);
 		zis.im  = t05*(zres_rc.im - zres_rcut.im);
 		//printf("%f %f\n", zis.re, zis.im);
 		//
 		//zres.re = zis.re*b0;
 		//zres.im = zis.im*b0;
 		// rotation
 		clumpgrad.x = zis.re;
 		clumpgrad.y = zis.im;
 		clumpgrad = rotateCoordinateSystem_GPU(clumpgrad, -lens->ellipticity_angle[i]);
 		//
 		//clumpgrad.x = lens->b0[i]*clumpgrad.x;
 		//clumpgrad.y = lens->b0[i]*clumpgrad.y;
 		//
 		//clumpgrad.x = lens->b0[i]*zis.re;
 		//clumpgrad.y = lens->b0[i]*zis.im;
 		//nan check
 		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
 		//{
 		// add the gradients
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		//printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y);
 		//}
 	}
 	//IACA_END;
 	//
 	return(grad);
 }
 //
 //
 //
 
 typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 
 __constant__ halo_func_GPU_t halo_func_GPU[100] =
 {
 	0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU,  0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0,  module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 __device__ struct point module_potentialDerivatives_totalGradient_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int nhalos)
 {
 	struct point grad, clumpgrad;
 	//
 	grad.x = clumpgrad.x = 0;
 	grad.y = clumpgrad.y = 0;
 	//
 	int shalos = 0;
 	clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos);
 	grad.x += clumpgrad.x;
 	grad.y += clumpgrad.y;
 	return(grad);
 	//
 	//module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
 	//return;
 	/*
 	   int* p_type = &(lens->type)[0];
 	   int* lens_type = (int*) malloc(nhalos*sizeof(int));
 	   memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
 
 	//quicksort(lens_type, nhalos);
 	//*/
 
 	//printf ("%f \n",lens->position_x[shalos]);
 
 
 	while (shalos < nhalos)
 	{
 
 		int lens_type = lens->type[shalos];
 		int count     = 1;
 		while (lens->type[shalos + count] == lens_type) count++;
 		//std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
 		//printf ("%d %d %d \n",lens_type,count,shalos);
 		//
 		clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count);
 		//
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		shalos += count;
 	}
 
 	return(grad);
 }
 
 
 #if 0
 __device__ struct point module_potentialDerivatives_totalGradient_SOA_CPU_GPU(const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, double xmin, double ymin, int nbgridcells, int nhalos)
 {
         struct point grad, clumpgrad;
         //
         grad.x = clumpgrad.x = 0;
         grad.y = clumpgrad.y = 0;
         //
         int shalos = 0;
         //clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos);
         //grad.x += clumpgrad.x;
         //grad.y += clumpgrad.y;
         //return(grad);
         //
         //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
         //return;
         /*
            int* p_type = &(lens->type)[0];
            int* lens_type = (int*) malloc(nhalos*sizeof(int));
            memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
 
         //quicksort(lens_type, nhalos);
         //*/
 
         //printf ("%f \n",lens->position_x[shalos]);
 
 
         while (shalos < nhalos)
         {
 
                 int lens_type = lens->type[shalos];
                 int count     = 1;
                 while (lens->type[shalos + count] == lens_type) count++;
                 //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
                 //printf ("%d %d %d \n",lens_type,count,shalos);
                 //
                 clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count);
                 //
                 grad.x += clumpgrad.x;
                 grad.y += clumpgrad.y;
                 shalos += count;
         }
 
         return(grad);
 }
 #endif
 
 
 __device__ inline struct point rotateCoordinateSystem_GPU(struct point P, double theta)
 {
 	struct  point   Q;
 
 	Q.x = P.x*cos(theta) + P.y*sin(theta);
 	Q.y = P.y*cos(theta) - P.x*sin(theta);
 
 	return(Q);
 }
 
 __device__ inline struct point rotateCoordinateSystem_GPU_2(struct point P, double cosi, double sinu)
 {
 	struct  point   Q;
 
 	Q.x = P.x*cosi + P.y*sinu;
 	Q.y = P.y*cosi - P.x*sinu;
 
 	return(Q);
 }
diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu
index e8b1d16..5872098 100644
--- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu
+++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu
@@ -1,1052 +1,1260 @@
 #include <fstream>
 #include "grid_gradient_GPU.cuh"
 
 #define BLOCK_SIZE_X 16
 #define BLOCK_SIZE_Y 8
 //#define ROT
 
 #define _SHARED_MEM
 
 #ifdef _SHARED_MEM
 #define SHARED __shared__
 #warning "shared memory"
+extern __shared__ double shared[];
 #else
 #define SHARED 
 #endif
 
 #define Nx 1
 #define Ny 0
 
 extern "C" {
 double myseconds();
 }
 void
 module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
 
 void
 module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
 
 void calculate_cossin_values(double *theta_cos, double *theta_sin, double *angles, int nhalos ){
-	for(int i = 0 ; i < nhalos; i++){
-		theta_cos[i]=cos(angles[i]);
-		theta_sin[i]=sin(angles[i]);
+	for(int i = 0 ; i < nhalos; i++)
+	{
+		theta_cos[i] = cos(angles[i]);
+		theta_sin[i] = sin(angles[i]);
 	}
 }
 
 void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos ,int nbgridcells){
 
 
   int nBlocks_gpu = 0;
   // Define the number of threads per block the GPU will use
   cudaDeviceProp properties_gpu;
 
   cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use
 
   if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
     {
     fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
     exit(-1);
     }
   else
     {
     nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;  // Get the maximum number of blocks with the chosen number of threads
                     // per Block that the GPU supports
     }
 
   grid_param *frame_gpu;
   Potential_SOA *lens_gpu,*lens_kernel ;
   int *type_gpu;
   double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
   double *grid_grad_x_gpu, *grid_grad_y_gpu ;
 
   lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
   lens_gpu->type = (int *) malloc(sizeof(int));
 
   // Allocate variables on the GPU
   cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
   cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
   cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
   cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
   cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
   cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
   cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
   cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
   cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
   cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
   cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
   cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
   cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
   cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
 
   // Copy values to the GPU
   cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " );
   cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
   cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
   cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
   cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
   cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
   cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
   cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
   cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " );
   cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " );
   cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
 
 
   //printf("%p \n", lens_gpu);
   //printf("%p \n", type_gpu);
   //printf("%p \n", lens_gpu->type);
   //fflush(stdout);
   lens_gpu->type = type_gpu;
   lens_gpu->position_x = lens_x_gpu;
   lens_gpu->position_y = lens_y_gpu;
   lens_gpu->b0 = b0_gpu;
   lens_gpu->ellipticity_angle = angle_gpu;
   lens_gpu->ellipticity_potential = epot_gpu;
   lens_gpu->rcore = rcore_gpu;
   lens_gpu->rcut = rcut_gpu;
   lens_gpu->anglecos = anglecos_gpu;
   lens_gpu->anglesin = anglesin_gpu;
 
   cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
 
 
   if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){
     gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
   }
   else{
     gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
   }
   cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " );
   cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " );
 
 
   //printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]);
 
     // Free GPU memory
   cudaFree(lens_gpu);
   cudaFree(type_gpu);
   cudaFree(lens_x_gpu);
   cudaFree(lens_y_gpu);
   cudaFree(b0_gpu);
   cudaFree(angle_gpu);
   cudaFree(epot_gpu);
   cudaFree(rcore_gpu);
   cudaFree(rcut_gpu);
   cudaFree(anglecos_gpu);
   cudaFree(anglesin_gpu);
   cudaFree(grid_grad_x_gpu);
   cudaFree(grid_grad_y_gpu);
 
 /*
   for (int i = 0; i < nbgridcells; i++){
     for(int j = 0; j < nbgridcells; j++){
       printf(" %f",grid_grad_x[i*nbgridcells + j]);
     }
     printf("\n");
   }*/
 
 }
 
 void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells){
   //get number of GPU devices
   int nDevices;
   cudaGetDeviceCount(&nDevices);
 
   // Initialise kernel variables, table for multiple devices
   grid_param *frame_gpu[nDevices];
   Potential_SOA *lens_gpu[nDevices],*lens_kernel[nDevices] ;
   int *type_gpu[nDevices];
   double *lens_x_gpu[nDevices], *lens_y_gpu[nDevices], *b0_gpu[nDevices], *angle_gpu[nDevices], *epot_gpu[nDevices], *rcore_gpu[nDevices], *rcut_gpu[nDevices],*anglecos_gpu[nDevices], *anglesin_gpu[nDevices];
   double *grid_grad_x_gpu[nDevices], *grid_grad_y_gpu[nDevices] ;
 
 
 
   // Initialise multiple device variables
   int Ndevice[nDevices], indexactual[nDevices];
   cudaStream_t stream[nDevices];
 
   indexactual[0] = 0 ;
   Ndevice[0] = (nbgridcells) * (nbgridcells)/nDevices;
 
   printf("Using %d Gpu's \n",nDevices );
   //printf("%d %d \n",indexactual[0], Ndevice[0]);
 
   for (int dev = 1; dev < nDevices; dev++) {
 
     Ndevice[dev] = (nbgridcells) * (nbgridcells)/nDevices;
 
     if(indexactual[dev]+Ndevice[dev] > (nbgridcells) * (nbgridcells)){
       Ndevice[dev] = (nbgridcells) * (nbgridcells) - indexactual[dev-1];
     }
 
     indexactual[dev] = indexactual[dev-1] + Ndevice[dev];
     //printf("%d %d \n",indexactual[dev], Ndevice[dev]);
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
 
     lens_gpu[dev] = (Potential_SOA *) malloc(sizeof(Potential_SOA));
     lens_gpu[dev]->type = (int *) malloc(sizeof(int));
 
     // Allocate variables on the GPU
     cudasafe(cudaMalloc( (void**)&(lens_kernel[dev]), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
     cudasafe(cudaMalloc( (void**)&(type_gpu[dev]), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
     cudasafe(cudaMalloc( (void**)&(lens_x_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
     cudasafe(cudaMalloc( (void**)&(lens_y_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
     cudasafe(cudaMalloc( (void**)&(b0_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
     cudasafe(cudaMalloc( (void**)&(angle_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
     cudasafe(cudaMalloc( (void**)&(epot_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
     cudasafe(cudaMalloc( (void**)&(rcore_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
     cudasafe(cudaMalloc( (void**)&(rcut_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
     cudasafe(cudaMalloc( (void**)&(frame_gpu[dev]), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
     cudasafe(cudaMalloc( (void**)&(anglecos_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
     cudasafe(cudaMalloc( (void**)&(anglesin_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
     cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
     cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
 
     cudaStreamCreate(&stream[dev]);
 
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
 
     // Copy values to the GPU
     cudasafe(cudaMemcpyAsync(type_gpu[dev],lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy type_gpu: " );
     cudasafe(cudaMemcpyAsync(lens_x_gpu[dev],lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy x_gpu: " );
     cudasafe(cudaMemcpyAsync(lens_y_gpu[dev],lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy y_gpu: " );
     cudasafe(cudaMemcpyAsync(b0_gpu[dev],lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy b0_gpu: " );
     cudasafe(cudaMemcpyAsync(angle_gpu[dev],lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy angle_gpu: " );
     cudasafe(cudaMemcpyAsync(epot_gpu[dev], lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy epot_gpu: " );
     cudasafe(cudaMemcpyAsync(rcore_gpu[dev], lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy rcore_gpu: " );
     cudasafe(cudaMemcpyAsync(rcut_gpu[dev], lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy rcut_gpu: " );
     cudasafe(cudaMemcpyAsync(anglecos_gpu[dev], lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy anglecos: " );
     cudasafe(cudaMemcpyAsync(anglesin_gpu[dev], lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy anglesin: " );
     cudasafe(cudaMemcpyAsync(frame_gpu[dev], frame, sizeof(grid_param), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy fame_gpu: " );
 
 
     lens_gpu[dev]->type = type_gpu[dev];
     lens_gpu[dev]->position_x = lens_x_gpu[dev];
     lens_gpu[dev]->position_y = lens_y_gpu[dev];
     lens_gpu[dev]->b0 = b0_gpu[dev];
     lens_gpu[dev]->ellipticity_angle = angle_gpu[dev];
     lens_gpu[dev]->ellipticity_potential = epot_gpu[dev];
     lens_gpu[dev]->rcore = rcore_gpu[dev];
     lens_gpu[dev]->rcut = rcut_gpu[dev];
     lens_gpu[dev]->anglecos = anglecos_gpu[dev];
     lens_gpu[dev]->anglesin = anglesin_gpu[dev];
 
     cudaMemcpyAsync(lens_kernel[dev], lens_gpu[dev], sizeof(Potential_SOA), cudaMemcpyHostToDevice,stream[dev]);
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
     int nBlocks_gpu = 0;
     cudaDeviceProp properties_gpu;
     cudaGetDeviceProperties(&properties_gpu, dev); // Get properties of 0th GPU in use
 
     if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
       {
       fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
       exit(-1);
       }
 
 
     if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){
     	gradient_grid_kernel_multiple<<<1,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]);
     }
     else{
     	gradient_grid_kernel_multiple<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]);
     }
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
     cudasafe(cudaMemcpyAsync( grid_grad_x + indexactual[dev], grid_grad_x_gpu[dev], Ndevice[dev] *sizeof(double),cudaMemcpyDeviceToHost ,stream[dev]),"Gradientgpu.cu : Copy source_x_gpu: " );
     cudasafe(cudaMemcpyAsync( grid_grad_y + indexactual[dev], grid_grad_y_gpu[dev], Ndevice[dev] *sizeof(double), cudaMemcpyDeviceToHost,stream[dev]),"Gradientgpu.cu : Copy source_y_gpu: " );
   }
 
   for (int dev = 0; dev < nDevices; dev++) {
 
 
     cudaSetDevice(dev);
       // Free GPU memory
     cudaFree(type_gpu[dev]);
     cudaFree(lens_x_gpu[dev]);
     cudaFree(lens_y_gpu[dev]);
     cudaFree(b0_gpu[dev]);
     cudaFree(angle_gpu[dev]);
     cudaFree(epot_gpu[dev]);
     cudaFree(rcore_gpu[dev]);
     cudaFree(rcut_gpu[dev]);
     cudaFree(anglecos_gpu[dev]);
     cudaFree(anglesin_gpu[dev]);
     cudaFree(grid_grad_x_gpu[dev]);
     cudaFree(grid_grad_y_gpu[dev]);
     cudaStreamDestroy(stream[dev]);
 
   }
 
 }
 
 #if 1
 void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells, int indexactual, int Ncells ){
 
 
 	// GPU Property query
 	int nBlocks_gpu = 0;
 	// Define the number of threads per block the GPU will use
 	cudaDeviceProp properties_gpu;
 
 	cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use
 
 	if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
 	{
 		fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
 		exit(-1);
 	}
 	else
 	{
 		nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;  // Get the maximum number of blocks with the chosen number of threads
 		// per Block that the GPU supports
 	}
 
 	grid_param *frame_gpu;
 	Potential_SOA *lens_gpu,*lens_kernel ;
 	int *type_gpu;
 	double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
 	double *grid_grad_x_gpu, *grid_grad_y_gpu ;
 
 	lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
 	lens_gpu->type = (int *) malloc(sizeof(int));
 
 	// Allocate variables on the GPU
 	cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
 	cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
 	cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
 
 	// Copy values to the GPU
 	cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " );
 	cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
 	cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
 	cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
 	cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
 	cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
 	cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
 	cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
 	cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " );
 	cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " );
 	cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
 	//
 	lens_gpu->type = type_gpu;
 	lens_gpu->position_x = lens_x_gpu;
 	lens_gpu->position_y = lens_y_gpu;
 	lens_gpu->b0 = b0_gpu;
 	lens_gpu->ellipticity_angle = angle_gpu;
 	lens_gpu->ellipticity_potential = epot_gpu;
 	lens_gpu->rcore = rcore_gpu;
 	lens_gpu->rcut = rcut_gpu;
 	lens_gpu->anglecos = anglecos_gpu;
 	lens_gpu->anglesin = anglesin_gpu;
 	//
 	cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
 	//
 	double time = -myseconds();
 	module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens, lens_kernel, nbgridcells, nhalos);
 	//
 	if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0)
 	{
 		gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
 	}
 	else
 	{
 		//gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel);
 		//gradient_grid_kernel_v2<<<dimGrid, dimBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel);
 		//gradient_grid_kernel_v2<<<dimGrid, dimBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel);
 
 
 	if (int((Ncells)/threadsPerBlock) == 0){
 	gradient_grid_kernel_multiple<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells);
 	}
 	else{
 	gradient_grid_kernel_multiple<<<(Ncells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells);
 	}
 #endif
 		module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens, lens_kernel, nbgridcells, nhalos);
 	cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " );
 	cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " );
 
 
 	//printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]);
 
 	// Free GPU memory
 	cudaFree(lens_gpu);
 	cudaFree(type_gpu);
 	cudaFree(lens_x_gpu);
 	cudaFree(lens_y_gpu);
 	cudaFree(b0_gpu);
 	cudaFree(angle_gpu);
 	cudaFree(epot_gpu);
 	cudaFree(rcore_gpu);
 	cudaFree(rcut_gpu);
 	cudaFree(anglecos_gpu);
 	cudaFree(anglesin_gpu);
 	cudaFree(grid_grad_x_gpu);
 	cudaFree(grid_grad_y_gpu);
 
 	/*
 	   for (int i = 0; i < nbgridcells; i++){
 	   for(int j = 0; j < nbgridcells; j++){
 	   printf(" %f",grid_grad_x[i*nbgridcells + j]);
 	   }
 	   printf("\n");
 	   }*/
 
 }
 
 
 __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) {
 
 	//*grad_x = *grad_y = 0.;
 
 	int bid=blockIdx.x; // index of the block (and of the set of images)
 	int tid=threadIdx.x; // index of the thread within the block
 
 	double dx,dy;        //pixelsize
 	int grid_dim, index;
 	struct point image_point, Grad;
 	dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 	dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	grid_dim = (nbgridcells);
 
 	index = bid * threadsPerBlock + tid ;
 
 	while(index < grid_dim*grid_dim){
 
 		grid_grad_x[index] = 0.;
 		grid_grad_y[index] = 0.;
 
 		image_point.x = frame->xmin + (index/grid_dim)*dx;
 		image_point.y = frame->ymin + (index % grid_dim)*dy;
 
 		Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens);
 
 		grid_grad_x[index] = Grad.x;
 		grid_grad_y[index] = Grad.y;
 
 		bid += gridDim.x;
 		index = bid * threadsPerBlock + tid;
 	}
 }
 
-__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) {
-
+__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) 
+{
 	//*grad_x = *grad_y = 0.;
 
-	int bid = blockIdx.x; // index of the block (and of the set of images)
+	int bid =  blockIdx.x; // index of the block (and of the set of images)
 	int tid = threadIdx.x; // index of the thread within the block
 
 	double dx,dy;        //pixelsize
 	int grid_dim, index;
 	struct point image_point, Grad;
 	//
 	dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 	dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	//
 	grid_dim = (nbgridcells);
 	//
 	int row = blockIdx.y * blockDim.y + threadIdx.y;
 	int col = blockIdx.x * blockDim.x + threadIdx.x;
 	//
 	index = col*nbgridcells + row ;
 	//
 	//while(index < grid_dim*grid_dim){
 
 	//grid_grad_x[index] = 0.;
 	//grid_grad_y[index] = 0.;
 
 	image_point.x = frame->xmin + col*dx;
 	image_point.y = frame->ymin + row*dy;
 
 	Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens);
 
 	grid_grad_x[index] = Grad.x;
 	grid_grad_y[index] = Grad.y;
 
 	bid += gridDim.x;
 	index = bid * threadsPerBlock + tid;
 	//}
 }
 
 
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_cur(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
 {
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad, image_point;
 	//
         grad.x = 0;
         grad.y = 0;
         //
         int col = blockIdx.x*blockDim.x + threadIdx.x;
         int row = blockIdx.y*blockDim.y + threadIdx.y;
         //
         if ((row < nbgridcells) && (col < nbgridcells))
         {
                 //
                 int index = row*nbgridcells + col;
                 //
                 //grid_grad_x[index] = 0.;
                 //grid_grad_y[index] = 0.;
                 //
                 double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
                 double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
                 //
                 image_point.x = frame->xmin + col*dx;
                 image_point.y = frame->ymin + row*dy;
 		//
                 for(int i = shalos; i < shalos + nhalos; i++)
                 {
                         struct point true_coord, true_coord_rot; //, result;
                         //double       R, angular_deviation;
                         complex      zis;
                         //
                         // positionning at the potential center
                         // Change the origin of the coordinate system to the center of the clump
                         //
 			//@@if ((row == Ny) && (col == Nx)) printf("image_x = %f, %f image_y = %f, %f\n",  image_point.x, frame->xmin, image_point.y,frame->ymin);
 			double true_coord_x = image_point.x - __ldg(&lens->position_x[i]);
                         double true_coord_y = image_point.y - __ldg(&lens->position_y[i]);
 			//if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n",  true_coord_x, true_coord_y);	
 			//
                         double cosi = __ldg(&lens->anglecos[i]);
                         double sinu = __ldg(&lens->anglesin[i]);
 			//
                         double x = true_coord_x*cosi + true_coord_y*sinu;
                         double y = true_coord_y*cosi - true_coord_x*sinu;
 			//
 			//if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n",  x, y);	
                         //
                         double eps = __ldg(&lens->ellipticity_potential[i]);
                         //
                         double sqe  = sqrt(eps);
                         //
                         double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
                         //
                         complex zci;
                         complex znum, zden, zres;
                         double norm;
                         //
 			zci.re  = 0;
                         zci.im  = -0.5*(1. - eps*eps)/sqe;
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zci.re, zci.im);
                         //
                         double rc  = __ldg(&lens->rcore[i]);
                         double cx1  = (1. - eps)/(1. + eps);
                         znum.re = cx1*x;
                         znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
                         //
                         zden.re = x;
                         zden.im = 2.*rc*sqe - y;
                         norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 			//@@if ((col == Nx) && (row == Ny)) printf("norm = %f\n", norm);
                         //
                         zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
                         zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 			//
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im);
                         //
                         norm    = zis.re;
                         //
                         zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
                         zis.im  = atan2(zis.im, norm);
 			//
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im);
                         //
                         zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
                         zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 			//
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zres: %f %f\n", row, col, zres.re, zres.im);
                         //
                         double b0  = __ldg(&lens->b0[i]);
                         grad.x += b0*(zres.re*cosi - zres.im*sinu);
                         grad.y += b0*(zres.im*cosi + zres.re*sinu);
 			//@@if ((col == Nx) && (row == Ny)) printf("grad: %f %f\n", grad.x, grad.y);
                 }
                 //IACA_END;
                 //
                 grid_grad_x[index] = grad.x;
                 grid_grad_y[index] = grad.y;
 		//if ((row == 0) && (col == 9)) 
 		//printf("%f %f: %f %f\n",  image_point.x, image_point.y, grid_grad_x[index], grid_grad_y[index]);
         }
 }
 
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_SM2(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
 {
 	//
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         double grad_x, grad_y;
 	double clumpgrad_x, clumpgrad_y;
 	double image_point_x, image_point_y;
 	//
 	SHARED double cosi	[200];
 	SHARED double sinu	[200];
 	SHARED double rc	[200];
 	SHARED double b0	[200];
 	SHARED double epsi	[200];
 	SHARED double position_x[200];
 	SHARED double position_y[200];
 	SHARED double rsqe	[200];
 	SHARED double sonepeps	[200];
 	SHARED double sonemeps	[200];
         //
         grad_x = 0;
         grad_y = 0;
         //
         int col = blockIdx.x*blockDim.x + threadIdx.x;
         int row = blockIdx.y*blockDim.y + threadIdx.y;
 	int ithread  = threadIdx.y*blockDim.x + threadIdx.x;
 	//
 	int index = row*nbgridcells + col;
 	//
 	//grid_grad_x[index] = 0.;
 	//grid_grad_y[index] = 0.;
 	//
 	double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 	double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	//
 	image_point_x = frame->xmin + col*dx;
 	image_point_y = frame->ymin + row*dy;
 	//
 	int i = ithread;
 	if (i < nhalos)
 	{
 		cosi[i]       = __ldg(&lens->anglecos		  [shalos + i]);
 		sinu[i]       = __ldg(&lens->anglesin		  [shalos + i]);
 		position_x[i] = __ldg(&lens->position_x		  [shalos + i]);
 		position_y[i] = __ldg(&lens->position_y		  [shalos + i]);
 		rc[i]         = __ldg(&lens->rcore		  [shalos + i]);
 		b0[i]         = __ldg(&lens->b0		          [shalos + i]);
 		epsi[i]       = __ldg(&lens->ellipticity_potential[shalos + i]);
-		sonemeps[i]   = 1 - epsi[i];
-		sonepeps[i]   = 1 + epsi[i];
+		//sonemeps[i]   = 1 - epsi[i];
+		//sonepeps[i]   = 1 + epsi[i];
 		rsqe[i]	      = sqrt(epsi[i]);
 	}
 	__syncthreads();
 	//
 	if ((row < nbgridcells) && (col < nbgridcells))
 	{
 		for(int i = 0; i < nhalos; i++)
 		{
 			//
 			double true_coord_x = image_point_x - position_x[i];
 			double true_coord_y = image_point_y - position_y[i];
 			//
 			double x = true_coord_x*cosi[i] + true_coord_y*sinu[i];
 			double y = true_coord_y*cosi[i] - true_coord_x*sinu[i];
 			//
 			double eps     = epsi[i]; 
 			//double onemeps = 1 - eps; 
 			//double onepeps = 1 + eps; 
 			//
 			//double eps     = epsi[i]; 
 			double onemeps = sonemeps[i]; 
 			double onepeps = sonepeps[i];
 			//
 			//double sqe  = sqrt(eps);
 			double sqe  = rsqe[i];
 			double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps);
 			//
 			complex      zis;
 			//
 			double znum_re, znum_im;
 			double zres_re, zres_im;
 			double norm;
 			double zden_re, zden_im;
 			double  zis_re,  zis_im;
 			//
 			double zci_im  = -0.5*(1. - eps*eps)/sqe;
 			//
 			double cx1  = onemeps/onepeps; 
 			//
 			znum_re = cx1*x;
 			znum_im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1;
 			//
 			zden_re = x;
 			zden_im = 2.*rc[i]*sqe - y;
 			//
 			norm    = (x*x + zden_im*zden_im);     // zis = znum/zden
 			zis.re  = (znum_re*x + znum_im*zden_im)/norm;
 			zis.im  = (znum_im*x - znum_re*zden_im)/norm;
 			//
 			norm    = zis.re;
 			//
 			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 			zis.im  = atan2(zis.im, norm);
 			//
 			zres_re = - zci_im*zis.im;   // Re( zci*ln(zis) )
 			zres_im =   zci_im*zis.re;   // Im( zci*ln(zis) )
 			//
 			grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]);
 			grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]);
 		}
 		//
 		grid_grad_x[index] = grad_x;
 		grid_grad_y[index] = grad_y;
 		//__syncthreads();
 	}
 }
 //
 //
 //
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA
  *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
 {
         //
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         double grad_x, grad_y;
         double clumpgrad_x, clumpgrad_y;
         double image_point_x, image_point_y;
         //
         SHARED double cosi      [200];
         SHARED double sinu      [200];
-        SHARED double rc        [200];
+        SHARED double rci       [200];
         SHARED double b0        [200];
         SHARED double epsi      [200];
         SHARED double position_x[200];
         SHARED double position_y[200];
         SHARED double rsqe      [200];
+	//SHARED double sgrad_x   [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y];
+	//SHARED double sgrad_y   [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; 
+
         //SHARED double sonepeps  [200];
         //SHARED double sonemeps  [200];
         //
         grad_x         = 0;
         grad_y 	       = 0;
         //
         int row        = blockIdx.x*blockDim.x + threadIdx.x;
-        int grid_size  = nbgridcells/blockDim.y; 
-        int col        =( blockIdx.y*blockDim.y + threadIdx.y)*grid_size;
+        int col        = blockIdx.y*blockDim.y + threadIdx.y;
+	//
+	//int loc_row    = threadIdx.x;
+	//int loc_col    = threadIdx.y*blockDim.x + threadIdx.x;
+	//
+        //int grid_size  = nbgridcells/blockDim.y; 
+	//
 	//if (threadIdx.x == 0) printf("%d %d %d: row = %d, col = %d, grid_size = %d\n", blockIdx.y, gridDim.y, threadIdx.y, row, col, grid_size);
         //
-        //
-        //grid_grad_x[index] = 0.;
-        //grid_grad_y[index] = 0.;
-        //
-        double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
-        double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
+	double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
+	double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	//if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy);
-        //
-        image_point_x = frame->xmin + col*dx;
-        image_point_y = frame->ymin + row*dy;
-        //
-        int i = row;
-	if (threadIdx.x == 0)
-	for (int i = 0; i < nhalos; i += 1)	
-        {
-                cosi[i]       = __ldg(&lens->anglecos             [shalos + i]);
-                sinu[i]       = __ldg(&lens->anglesin             [shalos + i]);
-                position_x[i] = __ldg(&lens->position_x           [shalos + i]);
-                position_y[i] = __ldg(&lens->position_y           [shalos + i]);
-                rc[i]         = __ldg(&lens->rcore                [shalos + i]);
-                b0[i]         = __ldg(&lens->b0                   [shalos + i]);
-                epsi[i]       = __ldg(&lens->ellipticity_potential[shalos + i]);
-                rsqe[i]       = sqrt(epsi[i]);
-        }
-	//if (threadIdx.x == 0) printf("%d %d\n", blockDim.y, gridDim.y);
-        __syncthreads();
-        //
+	//
+	image_point_x = frame->xmin + col*dx;
+	image_point_y = frame->ymin + row*dy;
+	//
+	//int iloc  = threadIdx.x*blockDim.y + threadIdx.y;
+	int iglob = row*nbgridcells + col;
+	int numThreads = blockDim.x*blockDim.y;
+	//
+	for (int i = 0; i < (nhalos + numThreads - 1)/numThreads; ++i)
+	{ 
+		int iloc  = threadIdx.x*blockDim.y + threadIdx.y + i*numThreads;
+		if (iloc < nhalos)
+		{
+			cosi[iloc]       = __ldg(&lens->anglecos             [shalos + iloc]);
+			sinu[iloc]       = __ldg(&lens->anglesin             [shalos + iloc]);
+			position_x[iloc] = __ldg(&lens->position_x           [shalos + iloc]);
+			position_y[iloc] = __ldg(&lens->position_y           [shalos + iloc]);
+			rci[iloc]         = __ldg(&lens->rcore                [shalos + iloc]);
+			b0[iloc]         = __ldg(&lens->b0                   [shalos + iloc]);
+			epsi[iloc]       = __ldg(&lens->ellipticity_potential[shalos + iloc]);
+			rsqe[iloc]       = sqrt(epsi[iloc]);
+		}
+	}
+	__syncthreads();
+	//
 	if ((row < nbgridcells) && (col < nbgridcells))
 	{
 		//
-		for(int icol = 0; icol < grid_size; ++icol)
+		for(int i = 0; i < nhalos; i++)
 		{
-			int index = row*nbgridcells + col + icol;
-			grad_x = grad_y = 0.;
+			//int index  = iloc; 
+#if 1
+			double rc      = rci[i];
+			double eps     = epsi[i];
+			double onemeps = 1 - eps;
+			double onepeps = 1 + eps;
+			//
+			double sqe = rsqe[i];
+			double cx1 = onemeps/onepeps;
+			//
+			//
+			//double x = true_coord_y*sinu[i];
+			//double y = true_coord_y*cosi[i];
+			double true_coord_y = image_point_y - position_y[i];
+			double true_coord_x = image_point_x - position_x[i] /*+ icol*dx*/;
+			//
+			double zci_im;
+			zci_im  = -0.5*(1. - eps*eps)/sqe;
+			double inv_onepeps = 1./(onepeps*onepeps);
+			double inv_onemeps = 1./(onemeps*onemeps);
+#endif
 			//
-			for(int i = 0; i < nhalos; i++)
 			{
-				//
-				double true_coord_x = image_point_x + icol*dx - position_x[i];
-				double true_coord_y = image_point_y           - position_y[i];
-				//if ((row == 1) && (col == 0)) printf("%d %d: %f %f\n", row, col, true_coord_x, true_coord_y);
-				//
-				double x = true_coord_x*cosi[i] + true_coord_y*sinu[i];
-				double y = true_coord_y*cosi[i] - true_coord_x*sinu[i];
-				//
-				double eps     = epsi[i];
-				double onemeps = 1 - eps;
-				double onepeps = 1 + eps;
-				//if ((row == 1) && (col == 0)) printf("i = %d, eps = %f\n", i, eps);
-				//
-				//double eps     = epsi[i];
-				//double onemeps = sonemeps[i];
-				//double onepeps = sonepeps[i];
-				//
-				//double sqe  = sqrt(eps);
-				double sqe  = rsqe[i];
-				double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps);
-				//
-				complex      zis;
-				//
-				double znum_re, znum_im;
-				double zres_re, zres_im;
-				double norm;
-				double zden_re, zden_im;
-				double  zis_re,  zis_im;
-				//
-				double zci_im  = -0.5*(1. - eps*eps)/sqe;
-				//
-				double cx1  = onemeps/onepeps;
-				//
-				znum_re = cx1*x;
-				znum_im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1;
-				//
-				zden_re = x;
-				zden_im = 2.*rc[i]*sqe - y;
-				//
-				norm    = (x*x + zden_im*zden_im);     // zis = znum/zden
-				zis.re  = (znum_re*x + znum_im*zden_im)/norm;
-				zis.im  = (znum_im*x - znum_re*zden_im)/norm;
-				//
-				norm    = zis.re;
-				//
-				zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
-				zis.im  = atan2(zis.im, norm);
-				//
-				zres_re = - zci_im*zis.im;   // Re( zci*ln(zis) )
-				zres_im =   zci_im*zis.re;   // Im( zci*ln(zis) )
-				//
-				grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]);
-				grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]);
+				KERNEL_8_reg(0);
+				//KERNEL_8;
 			}
-			//
-			grid_grad_x[index] += grad_x; 
-			grid_grad_y[index] += grad_y; 
-		}
+			/*{
+				KERNEL_8_reg(BLOCK_SIZE_X/2);
+				grid_grad_x[iglob + BLOCK_SIZE_X/2] += grad_x;
+				grid_grad_y[iglob + BLOCK_SIZE_X/2] += grad_y;
+			}*/
 		//
-		//grid_grad_x[index] = grad_x;
-		//grid_grad_y[index] = grad_y;
-		//__syncthreads();
+		}
+		grid_grad_x[iglob +  0] = grad_x;
+		grid_grad_y[iglob +  0] = grad_y;
 	}
 }
 //
 //
 //
 __global__
-	void
+void
+module_potentialDerivatives_totalGradient_8_SOA_GPU_SM4(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA
+		*lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos/*, double* dtimer*/)
+{
+	//
+	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
+	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
+	//
+	double grad_x, grad_y;
+	double clumpgrad_x, clumpgrad_y;
+	double image_point_x, image_point_y;
+	//
+	//
+	double* cosi       = &shared[0*nhalos];
+	double* sinu       = &shared[1*nhalos];
+	double* rc         = &shared[2*nhalos];
+	double* b0         = &shared[3*nhalos];
+	double* epsi       = &shared[4*nhalos];
+	double* position_x = &shared[5*nhalos];
+	double* position_y = &shared[6*nhalos];
+	double* rsqe       = &shared[7*nhalos];
+
+	//SHARED double sonepeps  [200];
+	//SHARED double sonemeps  [200];
+	//
+	grad_x         = 0;
+	grad_y         = 0;
+	//
+	int grid_size  =  nbgridcells/gridDim.y;
+	int row        =  blockIdx.x*blockDim.x + threadIdx.x;
+	int col        = (blockIdx.y*blockDim.y + threadIdx.y)/**grid_size*/;
+	//
+	//
+#if 1
+	//SHARED double sgrad_x [
+	if (/*(threadIdx.x == 0) &&*/ (blockIdx.x == 0))
+	{
+		if (threadIdx.x == 0) printf("blockDim.x = %d, blockIdx.x = %d grimdDim.x = %d threadIdx.x = %d\n", blockDim.x, blockIdx.x, gridDim.x, threadIdx.x);
+		if (threadIdx.x == 0) printf("blockDim.y = %d, blockIdx.y = %d grimdDim.y = %d threadIdx.y = %d\n", blockDim.y, blockIdx.y, gridDim.y, threadIdx.y);
+		if (threadIdx.x == 0) printf("row = %d, col = %d, grid_size = %d\n", row, col, grid_size);
+	}
+	__syncthreads();
+#endif
+	double* sgrad_x    = &shared[8*nhalos];
+	double* sgrad_y    = &shared[8*nhalos + (grid_size + 1)*blockDim.x];
+	//
+	//
+	//grid_grad_x[index] = 0.;
+	//grid_grad_y[index] = 0.;
+	//
+	double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
+	double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
+	//
+	//if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy);
+	//
+	image_point_x = frame->xmin + col*dx;
+	image_point_y = frame->ymin + row*dy;
+	return;
+	//
+	int i = 0;
+#if 0
+	for (; i < nhalos; i = i + blockDim.x)	
+	{
+		int pos = threadIdx.x + i;
+		/*if ((threadIdx.x == 0) && (blockIdx.x == 0))*/ printf("pos = %d\n"); 
+		__syncthreads();
+		//
+		cosi[pos]       = __ldg(&lens->anglecos             [shalos + pos]);
+		sinu[pos]       = __ldg(&lens->anglesin             [shalos + pos]);
+		position_x[pos] = __ldg(&lens->position_x           [shalos + pos]);
+		position_y[pos] = __ldg(&lens->position_y           [shalos + pos]);
+		rc[pos]         = __ldg(&lens->rcore                [shalos + pos]);
+		b0[pos]         = __ldg(&lens->b0                   [shalos + pos]);
+		epsi[pos]       = __ldg(&lens->ellipticity_potential[shalos + pos]);
+		rsqe[pos]       = sqrt(epsi[i]);
+		//
+	}
+#endif
+#if 0
+	if (threadIdx.x == 0)
+		for (; i < nhalos; i += 1)
+		{
+			cosi[i]       = __ldg(&lens->anglecos             [shalos + i]);
+			sinu[i]       = __ldg(&lens->anglesin             [shalos + i]);
+			position_x[i] = __ldg(&lens->position_x           [shalos + i]);
+			position_y[i] = __ldg(&lens->position_y           [shalos + i]);
+			rc[i]         = __ldg(&lens->rcore                [shalos + i]);
+			b0[i]         = __ldg(&lens->b0                   [shalos + i]);
+			epsi[i]       = __ldg(&lens->ellipticity_potential[shalos + i]);
+			rsqe[i]       = sqrt(epsi[i]);
+		}
+#endif
+	__syncthreads();
+	//if ((row == col == 0)) printf("shared mem done...\n");
+	//
+	if (row < nbgridcells)
+	{
+		//for(int icol = 0; icol < grid_size; ++icol){
+                //      if (col + icol < nbgridcells){
+                //grad_x = grad_y = 0.;
+		int  index  = row*nbgridcells + col;
+                //
+                for(int i = 0; i < nhalos; i++)
+                {
+                        int sindex  = threadIdx.x*grid_size; 
+#if 0
+                        double eps     = epsi[i];
+                        double onemeps = 1 - eps;
+                        double onepeps = 1 + eps;
+                        //
+                        double sqe = rsqe[i];
+                        double cx1 = onemeps/onepeps;
+                        //
+                        //
+                        //double x = true_coord_y*sinu[i];
+                        //double y = true_coord_y*cosi[i];
+                        double true_coord_y = image_point_y - position_y[i];
+                        double true_coord_x = image_point_x - position_x[i] /*+ icol*dx*/;
+                        //
+                        complex zci;
+                        zci.im  = -0.5*(1. - eps*eps)/sqe;
+                        double inv_onepeps = 1./(onepeps*onepeps);
+                        double inv_onemeps = 1./(onemeps*onemeps);
+#endif
+                        //
+                        for(int icol = 0; icol < grid_size; ++icol)
+                        {
+                                if (col + icol < nbgridcells)
+                                {
+#if 0
+                                        if ((row == 1) && (col == 1)) printf("%d %d: %f %f\n", row, col, true_coord_x, true_coord_y);
+
+                                        true_coord_x = image_point_x - position_x[i] + icol*dx;
+                                        //
+                                        //x += true_coord_x*cosi[i];
+                                        //y -= true_coord_x*sinu[i];
+                                        double x = true_coord_x*cosi[i] + true_coord_y*sinu[i];
+                                        double y = true_coord_y*cosi[i] - true_coord_x*sinu[i];
+                                        //
+                                        //if ((row == 1) && (col == 0)) printf("i = %d, eps = %f\n", i, eps);
+                                        //
+                                        //double eps     = epsi[i];
+                                        //double onemeps = sonemeps[i];
+                                        //double onepeps = sonepeps[i];
+                                        //
+                                        //double sqe  = sqrt(eps);
+                                        //double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps);
+                                        double rem2 = x*x*inv_onepeps + y*y*inv_onemeps;
+                                        //
+                                        //
+                                        //double znum_re, znum_im;
+                                        //double zres_re, zres_im;
+                                        //double zden_re, zden_im;
+                                        //double  zis_re,  zis_im;
+                                        double norm;
+                                        //
+                                        complex      zis;
+                                        complex      znum;
+                                        complex      zden;
+                                        complex      zres;
+                                        //
+                                        //double cx1  = onemeps/onepeps;
+                                        //
+                                        znum.re = cx1*x;
+                                        znum.im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1;
+                                        //
+                                        zden.re = x;
+                                        zden.im = 2.*rc[i]*sqe - y;
+                                        //
+                                        norm    = (x*x + zden.im*zden.im);     // zis = znum/zden
+                                        zis.re  = (znum.re*x + znum.im*zden.im)/norm;
+                                        zis.im  = (znum.im*x - znum.re*zden.im)/norm;
+                                        //
+                                        norm    = zis.re;
+                                        //
+                                        zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
+                                        zis.im  = atan2(zis.im, norm);
+                                        //
+                                        zres.re = - zci.im*zis.im;   // Re( zci*ln(zis) )
+                                        zres.im =   zci.im*zis.re;   // Im( zci*ln(zis) )
+                                        //
+                                        grid_grad_x[index] += b0[i]*(zres.re*cosi[i] - zres.im*sinu[i]);
+                                        grid_grad_y[index] += b0[i]*(zres.im*cosi[i] + zres.re*sinu[i]);
+#endif
+                                        //sgrad_x[sindex] += (float)  sindex;
+                                        sgrad_y[sindex] += (float) -sindex;
+                                        //sindex++;
+                                        //
+                                        //grid_grad_x[index] += grad_x;
+                                        //grid_grad_y[index] += grad_y;
+                                }
+                                //
+                        }
+                }
+                        __syncthreads();
+	return;
+                //
+#if 0
+		int sindex = threadIdx.x*grid_size;
+		for(int icol = 0; icol < grid_size; ++icol)
+		{
+			if (col + icol < nbgridcells)
+			{
+                		grid_grad_x[index + col] = sgrad_x[sindex];
+                		grid_grad_y[index + col] = sgrad_y[sindex];
+				sindex++;
+			}
+		}
+		__syncthreads();
+#endif
+        }
+}
+
+
+
+__global__
+void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int i, int nhalos)
 {
 	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	//
 	struct point grad, clumpgrad, image_point;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	int row = blockIdx.y * blockDim.y + threadIdx.y;
 	int col = blockIdx.x * blockDim.x + threadIdx.x;
 	//
 	if ((row < nbgridcells) && (col < nbgridcells))
 	{
 		//
 		int index = col*nbgridcells + row;
 		//
 		//grid_grad_x[index] = 0.;
 		//grid_grad_y[index] = 0.;
 		//
 		double dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 		double dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 		//
 #if 0
 		/*SHARED*/ double img_pt[2];
 		if ((row == 0) && (col == 0))
 		{
 			img_pt[0] = frame->xmin + col*dx;
 			img_pt[1] = frame->ymin + row*dy;
 		}
 		__syncthreads();
 #else
 		image_point.x = frame->xmin + col*dx;
 		image_point.y = frame->ymin + row*dy;
 #endif
 		//
 		//
 		//for(int i = shalos; i < shalos + nhalos; i++)
 		//{
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//double       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 #if 0
 		true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]);
 		true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]);
 #else
 		true_coord.x = image_point.x - __ldg(&lens->position_x[i]);
 		true_coord.y = image_point.y - __ldg(&lens->position_y[i]);
 #endif
 		double cosi = __ldg(&lens->anglecos[i]);
 		double sinu = __ldg(&lens->anglesin[i]);
 		// positionning at the potential center
 		// Change the origin of the coordinate system to the center of the clump
 		double x = true_coord.x*cosi + true_coord.y*sinu;
 		double y = true_coord.y*cosi - true_coord.x*sinu;
 		//
 		double eps = __ldg(&lens->ellipticity_potential[i]);
 		//
 		double sqe  = sqrt(eps);
 		//
 		double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
 		//
 		complex zci;
 		complex znum, zden, zres;
 		double norm;
 		//
 		zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//
 		double rc  = __ldg(&lens->rcore[i]);
 		double cx1  = (1. - eps)/(1. + eps);
 		znum.re = cx1*x;
 		znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
 		//
 		zden.re = x;
 		zden.im = 2.*rc*sqe - y;
 		norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 		//
 		zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 		zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 		//
 		double b0  = __ldg(&lens->b0[i]);
 		grad.x += b0*(zres.re*cosi - zres.im*sinu);
 		grad.y += b0*(zres.im*cosi + zres.re*sinu);
 		//
 		grid_grad_x[index] += grad.x;
 		grid_grad_y[index] += grad.y;
 	}
-	}
+}
 
 
 
 
 
 	/*
 	   typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 
 	   __constant__ halo_func_GPU_t halo_func_GPU[100] =
 	   {
 	   0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU,  0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0,  module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	   };
 	 */
 
 void
 module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos)
-		{
-			struct point grad, clumpgrad;
-			//
-			grad.x = clumpgrad.x = 0.;
-			grad.y = clumpgrad.y = 0.;
-			//
-			int shalos = 0;
-			//
-			int GRID_SIZE_X = (nbgridcells + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
-			int GRID_SIZE_Y = 1; 
-			//
-			{
-				//dim3 dimBlock(BLOCK_SIZE/1, BLOCK_SIZE);
-				//dim3 dimGrid (GRID_SIZE   , GRID_SIZE );	
-				dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
-				dim3 dimGrid (GRID_SIZE_X , GRID_SIZE_Y);	
-				//
-				int count = nhalos;	
-				printf("nhalos = %d\n", nhalos);
-				//
-				cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double));
-				cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double));
-				//
-				module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
-			}
-			//
-			/*
-			   {
-			   dim3 dimBlock(BLOCK_SIZE/1, BLOCK_SIZE/2);
-			   dim3 dimGrid(GRID_SIZE, GRID_SIZE);
-			//
-			int count = nhalos;
-			cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double));
-			cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double));
-			//
-			module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
-			}
-			 */
-			//
-			cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU");
-			//
-			//cudaDeviceSynchronize();
-			//module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
-			//grad.x += clumpgrad.x;
-			//grad.y += clumpgrad.y;
+{
+	struct point grad, clumpgrad;
+	//
+	grad.x = clumpgrad.x = 0.;
+	grad.y = clumpgrad.y = 0.;
+	//
+	int shalos = 0;
+	//
+	int GRID_SIZE_X = (nbgridcells + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
+	int GRID_SIZE_Y = (nbgridcells + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; 
+	printf("grid_size_x = %d, grid_size_y = %d\n", GRID_SIZE_X, GRID_SIZE_Y);
+	//
+	double* timer = (double *) malloc((int) nbgridcells*nbgridcells*sizeof(double)); 
+	double* dtimer;
+	//cudasafe(cudaMalloc( (void**)&(dtimer), (int) nbgridcells*nbgridcells*sizeof(double)),"Gradientgpu.cu : totalGradient_SOA_CPU_GPU: " );
+	//
+	{
+		//dim3 dimBlock(BLOCK_SIZE_X/1, BLOCK_SIZE_Y);
+		//dim3 dimGrid (GRID_SIZE_X   , GRID_SIZE_Y );	
+		dim3 threads(BLOCK_SIZE_Y, BLOCK_SIZE_X);
+		dim3 grid (GRID_SIZE_Y ,  GRID_SIZE_X);	
+		//
+		int count = nhalos;	
+		//printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (double) (8*nhalos + BLOCK_SIZE_X*nbgridcells/BLOCK_SIZE_Y)*sizeof(double));
+		printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (double) (8*nhalos + BLOCK_SIZE_X*BLOCK_SIZE_Y)*sizeof(double));
+		//
+		cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double));
+		cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double));
+		//
+		module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3<<<grid, threads>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
+		//module_potentialDerivatives_totalGradient_8_SOA_GPU_SM4<<<dimGrid, dimBlock, (8*nhalos + BLOCK_SIZE_X*nbgridcells/BLOCK_SIZE_Y)*sizeof(double)>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count/*, dtimer*/);
+		cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU_8_SOA_GPU_SM4");
+		
+	}
+	cudaDeviceSynchronize();
+	printf("GPU kernel done... ");fflush(stdout);
+	//cudasafe(cudaMemcpy( timer, dtimer, nbgridcells*nbgridcells*sizeof(double), cudaMemcpyDeviceToHost ),"Gradientgpu.cu : dtimer memcpy " );
+	
+	//
+	/*
+	   {
+	   dim3 dimBlock(BLOCK_SIZE/1, BLOCK_SIZE/2);
+	   dim3 dimGrid(GRID_SIZE, GRID_SIZE);
+	//
+	int count = nhalos;
+	cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double));
+	cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double));
+	//
+	module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
+	}
+	 */
+	//
+	cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU");
+	//
+	printf("\n");
+	//module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count);
+	//grad.x += clumpgrad.x;
+	//grad.y += clumpgrad.y;
 
-			//
-			//	
-			/*
-			   while (shalos < nhalos)
-			   {
-
-			   int lens_type = lens_cpu->type[shalos];
-			   int count     = 1;
-			   while (lens_cpu->type[shalos + count] == lens_type) count++;
-			//std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
-			//printf ("%d %d %d \n",lens_type,count,shalos);
-			//
-			clumpgrad = (*halo_func_GPU[lens_type]<<<dimGrid, dimBlock>>> )(lens_gpu, shalos, count);
-			//
-			grad.x += clumpgrad.x;
-			grad.y += clumpgrad.y;
-			shalos += count;
-			}
+	//
+	//	
+	/*
+	   while (shalos < nhalos)
+	   {
 
-			return(grad);
-			 */
-		}
+	   int lens_type = lens_cpu->type[shalos];
+	   int count     = 1;
+	   while (lens_cpu->type[shalos + count] == lens_type) count++;
+	//std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
+	//printf ("%d %d %d \n",lens_type,count,shalos);
+	//
+	clumpgrad = (*halo_func_GPU[lens_type]<<<dimGrid, dimBlock>>> )(lens_gpu, shalos, count);
+	//
+	grad.x += clumpgrad.x;
+	grad.y += clumpgrad.y;
+	shalos += count;
+	}
+
+	return(grad);
+	 */
+}
 
 
diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh
index 575df16..cae430e 100644
--- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh
+++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh
@@ -1,30 +1,81 @@
 /*
  * gradientgpu.cuh
  *
  *  Created on: Nov 29, 2016
  *      Author: cerschae
  */
 
 #ifndef GRID_GRADIENT_GPU_CUH_
 #define GRID_GRADIENT_GPU_CUH_
 
 #include "cudafunctions.cuh"
 #include "gradient_GPU.cuh"
 #include <structure_hpc.h>
 
 void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens,int nbgridcells);
 void gradient_grid_pinned(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell);
 void gradient_grid_pinned_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell);
 void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos,int nbgridcells, int indexactual, int Ncells );
 void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells);
 
 __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens);
 __global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens);
 __global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut);
 __global__ void gradient_grid_sis_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut);
 __global__ void gradient_grid_piemd_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, int Ndevice, int indexactual, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut);
 __global__ void gradient_grid_kernel_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int indexactual, int ncells);
 
 __device__ static double atomicAdd_double(double* address, double val);
 
 #endif /* GRADIENTGPU_CUH_ */
+
+
+#define KERNEL_8 \
+double x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; \
+double y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; \
+double rem2 = x*x*inv_onepeps + y*y*inv_onemeps; \
+double norm; \
+complex      zis; \
+complex      znum; \
+complex      zden; \
+complex      zres; \
+znum.re = cx1*x; \
+znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; \
+zden.re = x; \
+zden.im = 2.*rc*sqe - y; \
+norm    = (x*x + zden.im*zden.im); \ 
+zis.re  = (znum.re*x + znum.im*zden.im)/norm; \
+zis.im  = (znum.im*x - znum.re*zden.im)/norm; \
+norm    = zis.re; \
+zis.re  = log(sqrt(norm*norm + zis.im*zis.im)); \
+zis.im  = atan2(zis.im, norm); \
+zres.re = - zci_im*zis.im; \ 
+zres.im =   zci_im*zis.re; \ 
+grad_x += b0[i]*(zres.re*cosi[i] - zres.im*sinu[i]); \
+grad_y += b0[i]*(zres.im*cosi[i] + zres.re*sinu[i]); 
+
+
+#define KERNEL_8_reg(X) \
+double x = (true_coord_x + X*dx)*cosi[i] +          true_coord_y*sinu[i]; \
+double y =          true_coord_y*cosi[i] - (true_coord_x + X*dx)*sinu[i]; \
+double rem2 = x*x*inv_onepeps + y*y*inv_onemeps; \
+double norm; \
+double      zis_re, zis_im; \
+double      znum_re, znum_im; \
+double      /*zden_re,*/ zden_im; \
+double      zres_re, zres_im; \
+znum_re = cx1*x; \
+znum_im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; \
+/*zden_re = x;*/ \
+zden_im = 2.*rc*sqe - y; \
+norm    = (x*x + zden_im*zden_im); \
+zis_re  = (znum_re*x + znum_im*zden_im)/norm; \
+zis_im  = (znum_im*x - znum_re*zden_im)/norm; \
+norm    = zis_re; \
+zis_re  = log(sqrt(norm*norm + zis_im*zis_im)); \
+zis_im  = atan2(zis_im, norm); \
+zres_re = - zci_im*zis_im; \
+zres_im =   zci_im*zis_re; \
+grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]); \
+grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]); \
+\
diff --git a/Benchmarks/GridGradientBenchmark/main.cpp b/Benchmarks/GridGradientBenchmark/main.cpp
index 4910e34..6879e75 100644
--- a/Benchmarks/GridGradientBenchmark/main.cpp
+++ b/Benchmarks/GridGradientBenchmark/main.cpp
@@ -1,331 +1,331 @@
 /**
 * @file   main.cpp
 * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch)
 * @date   October 2016
 * @brief  Benchmark for gradhalo function
 */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <sys/stat.h>
 //
 #include <mm_malloc.h>
 //
 #include <cuda_runtime.h>
 #include <structure_hpc.h>
 #include "timer.h"
 #include "gradient.hpp"
 #include "chi_CPU.hpp"
 #include "module_cosmodistances.h"
 #include "module_readParameters.hpp"
 #include "grid_gradient_GPU.cuh"
 //#include<omp.h>
 
 int module_readCheckInput_readInput(int argc, char *argv[])
 {
 	/// check if there is a correct number of arguments, and store the name of the input file in infile
 
 	char* infile;
 	struct stat  file_stat;
 
 	// If we do not have 3 arguments, stop
 	if ( argc != 3 )
 	{
 		fprintf(stderr, "\nUnexpected number of arguments\n");
 		fprintf(stderr, "\nUSAGE:\n");
 		fprintf(stderr, "lenstool  input_file  output_directorypath  [-n]\n\n");
 		exit(-1);
 	}
 	else if ( argc == 3 )
 		infile=argv[1];
 	std::ifstream ifile(infile,std::ifstream::in); // Open the file
 
 
 	int ts = (int) time (NULL);
 	char buffer[10];
 	std::stringstream ss;
 	ss << ts;
 	std::string trimstamp = ss.str();
 	//
 	std::string outdir = argv[2];
 	outdir += "-";
 	outdir += trimstamp;
 	std::cout << outdir << std::endl;
 
 	// check whether the output directory already exists
 	if (stat(outdir.c_str(), &file_stat) < 0){
 		mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH );
 	}
 	else {
 		printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]);
 		exit(-1);
 	}
 
 	// check whether the input file exists. If it could not be opened (ifile = 0), it does not exist
 	if(ifile){
 		ifile.close();
 	}
 	else{
 		printf("The file %s does not exist, please specify a valid file name\n",infile);
 		exit(-1);
 	}
 
 	return 0;
 }
 
 int main(int argc, char *argv[])
 {
 
 
 	// Setting Up the problem
 	//===========================================================================================================
 
 	// This module function reads the terminal input when calling LENSTOOL and checks that it is correct
 	// Otherwise it exits LENSTOOL
 	module_readCheckInput_readInput(argc, argv);
 
 	// This module function reads the cosmology parameters from the parameter file
 	// Input: struct cosmologicalparameters cosmology, parameter file
 	// Output: Initialized cosmology struct
 	cosmo_param cosmology;  // Cosmology struct to store the cosmology data from the file
 	std::string inputFile = argv[1];   // Input file
 	module_readParameters_readCosmology(inputFile, cosmology);
 
 	// This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file.
 	// The runmode_param stores the information of what exactly the user wants to do with lenstool.
 	struct runmode_param runmode;
 	module_readParameters_readRunmode(inputFile, &runmode);
 
 	module_readParameters_debug_cosmology(runmode.debug, cosmology);
 	module_readParameters_debug_runmode(runmode.debug, runmode);
 
 
 	//=== Declaring variables
 	struct grid_param frame;
 	struct galaxy images[runmode.nimagestot];
 	struct galaxy sources[runmode.nsets];
 	struct Potential lenses[runmode.nhalos+runmode.npotfile-1];
 	struct Potential_SOA lenses_SOA_table[NTYPES];
 	struct Potential_SOA lenses_SOA;
 	struct cline_param cline;
 	struct potfile_param potfile;
 	struct Potential potfilepotentials[runmode.npotfile];
 	struct potentialoptimization host_potentialoptimization[runmode.nhalos];
 	int nImagesSet[runmode.nsets]; // Contains the number of images in each set of images
 
 	// This module function reads in the potential form and its parameters (e.g. NFW)
 	// Input: input file
 	// Output: Potentials and its parameters
 
 
 	module_readParameters_Potential(inputFile, lenses, runmode.nhalos);
 	//Converts to SOA
 	//module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.Nlens);
 	module_readParameters_PotentialSOA(inputFile, lenses, &lenses_SOA, runmode.nhalos);
 	//module_readParameters_PotentialSOA_nonsorted(inputFile, lenses, &lenses_SOA_nonsorted, runmode.nhalos);
 	module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos);
 	//std::cerr << lenses_SOA[1].b0[0] << " " << lenses[0].b0  << std::endl;
 	// This module function reads in the potfiles parameters
 	// Input: input file
 	// Output: Potentials from potfiles and its parameters
 
 	if (runmode.potfile == 1 )
 	{
 		module_readParameters_readpotfiles_param(inputFile, &potfile);
 		module_readParameters_debug_potfileparam(runmode.debug, &potfile);
 		module_readParameters_readpotfiles(&runmode,&potfile,lenses);
 		module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos+runmode.npotfile);
 
 	}
 
 
 
 	// This module function reads in the grid form and its parameters
 	// Input: input file
 	// Output: grid and its parameters
 
 
 	module_readParameters_Grid(inputFile, &frame);
 
 
 
 
 	if (runmode.image == 1 or runmode.inverse == 1 or runmode.time > 0){
 
 		// This module function reads in the strong lensing images
 		module_readParameters_readImages(&runmode, images, nImagesSet);
 		//runmode.nsets = runmode.nimagestot;
 		for(int i = 0; i < runmode.nimagestot; ++i){
 
 			images[i].dls = module_cosmodistances_objectObject(lenses[0].z, images[i].redshift, cosmology);
 			images[i].dos = module_cosmodistances_observerObject(images[i].redshift, cosmology);
 			images[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, images[i].redshift, cosmology);
 
 		}
 		module_readParameters_debug_image(runmode.debug, images, nImagesSet,runmode.nsets);
 
 	}
 
 	if (runmode.inverse == 1){
 
 		// This module function reads in the potential optimisation limits
 		module_readParameters_limit(inputFile,host_potentialoptimization,runmode.nhalos);
 		module_readParameters_debug_limit(runmode.debug, host_potentialoptimization[0]);
 	}
 
 
 	if (runmode.source == 1){
 		//Initialisation to default values.(Setting sources to z = 1.5 default value)
 		for(int i = 0; i < runmode.nsets; ++i){
 			sources[i].redshift = 1.5;
 		}
 		// This module function reads in the strong lensing sources
 		module_readParameters_readSources(&runmode, sources);
 		//Calculating cosmoratios
 		for(int i = 0; i < runmode.nsets; ++i){
 
 			sources[i].dls = module_cosmodistances_objectObject(lenses[0].z, sources[i].redshift, cosmology);
 			sources[i].dos = module_cosmodistances_observerObject(sources[i].redshift, cosmology);
 			sources[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, sources[i].redshift, cosmology);
 		}
 		module_readParameters_debug_source(runmode.debug, sources, runmode.nsets);
 
 	}
 
 	std::cout << "--------------------------" << std::endl << std::endl;
 
 	double t_1,t_2,t_3,t_4;
 
 	// Lenstool-CPU Grid-Gradient
 	//===========================================================================================================
 
 	//Setting Test:
 	double dx, dy;
 	int grid_dim = runmode.nbgridcells;
 
 	dx = (frame.xmax - frame.xmin)/(runmode.nbgridcells-1);
 	dy = (frame.ymax - frame.ymin)/(runmode.nbgridcells-1);
 
 	point test_point1_1,test_point2_2, test_result1_1,test_result2_2,test_pointN_N, test_resultN_N;
 	double dlsds = images[0].dr;
 
 	test_point1_1.x = frame.xmin;
 	test_point1_1.y = frame.ymin;
 	test_point2_2.x = frame.xmin + dx;
 	test_point2_2.y = frame.ymin + dy;
 	test_pointN_N.x = frame.xmin + ((runmode.nbgridcells*runmode.nbgridcells-1)/runmode.nbgridcells)*dx;
 	test_pointN_N.y = frame.ymin + ((runmode.nbgridcells*runmode.nbgridcells-1) % runmode.nbgridcells)*dy;
 
 	double* grid_gradient_x_cpu = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
         double* grid_gradient_y_cpu = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
 
 	memset(grid_gradient_x_cpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
 	memset(grid_gradient_y_cpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
 	
 
 	std::cout << " CPU Test... "; 
 	//
 	t_1 = -myseconds();
 	//test_result1_1 = module_potentialDerivatives_totalGradient_SOA(&test_point1_1, &lenses_SOA, runmode.nhalos);
 	//test_result2_2 = module_potentialDerivatives_totalGradient_SOA(&test_point2_2, &lenses_SOA, runmode.nhalos);
 	//test_resultN_N = module_potentialDerivatives_totalGradient_SOA(&test_pointN_N, &lenses_SOA, runmode.nhalos);
 	gradient_grid_CPU(grid_gradient_x_cpu, grid_gradient_y_cpu, &frame, &lenses_SOA, runmode.nhalos, grid_dim);	
 	t_1 += myseconds();
 	//
 	std::cout << " Time  " << std::setprecision(15) << t_1 << std::endl;
 
 	// GPU test
 
 	std::cout << " GPU Test... "; 
 	double *grid_gradient_x, *grid_gradient_y;
 
-	grid_gradient_x = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
-	grid_gradient_y = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
+	grid_gradient_x = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
+	grid_gradient_y = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double));
 
 
 #if 0
 t_1 = -myseconds();
 //Packaging the image to sourceplane conversion
 gradient_grid_CPU(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim);
 t_1 += myseconds();
 
 std::cout << " gradient_grid_CPU Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
 std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
 std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_1 << std::endl;
 #endif
 
-#if 1
 	t_2 = -myseconds();
 	gradient_grid_GPU_sorted(grid_gradient_x, grid_gradient_y, &frame, &lenses_SOA, runmode.nhalos, grid_dim);
 	t_2 += myseconds();
 
 	std::cout << " Time  " << std::setprecision(15) << t_2 << std::endl;
-#endif
+#if 1 
 	double norm_x = 0.;
 	double norm_y = 0.;
 
 	for (int ii = 0; ii < grid_dim*grid_dim; ++ii)
 	{
 		double g_x = grid_gradient_x[ii];
 		double g_y = grid_gradient_y[ii];
 		//
 		double c_x = grid_gradient_x_cpu[ii];
 		double c_y = grid_gradient_y_cpu[ii];
 		//
 		//if (!(ii%grid_dim)) std::cout << std::endl;
-		///std::cout << ii << " " << g_x << " " << c_x << " " << g_y << " " << c_y << std::endl;
+		//		std::cout << ii << " " << g_x << " " << c_x << " " << g_y << " " << c_y << std::endl;
 		//
 		norm_x += (grid_gradient_x[ii] - grid_gradient_x_cpu[ii])*(grid_gradient_x[ii] - grid_gradient_x_cpu[ii]);
 		norm_y += (grid_gradient_y[ii] - grid_gradient_y_cpu[ii])*(grid_gradient_y[ii] - grid_gradient_y_cpu[ii]);
 	}
 
 	std::cout << "  l2 difference norm = " << std::setprecision(15) << norm_x << " " << std::setprecision(15) << norm_y << std::endl;
+#endif
 
 #if 1
 t_3 = -myseconds();
 gradient_grid_GPU_multiple(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim);
 t_3 += myseconds();
 
 std::cout << " gradient_grid_GPU_multiple Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
 std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
 std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Test 3: " << std::endl;
 std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_3 << std::endl;
 #endif
 
 #if 1
 t_4 = -myseconds();
 gradient_grid_GPU_sub(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim,0,grid_dim*grid_dim);
 t_4 += myseconds();
 
 std::cout << " gradient_grid_GPU_sub Brute Force Benchmark " << std::endl;
 std::cout << " Test 1: " << std::endl;
 std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] <<  std::endl;
 std::cout << " Test 2: " << std::endl;
 std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] <<  std::endl;
 std::cout << " Test 3: " << std::endl;
 std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y <<  std::endl;
 std::cout << " Gradient  " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] <<  std::endl;
 std::cout << " Time  " << std::setprecision(15) << t_4 << std::endl;
 #endif
 
 
 
 }