diff --git a/Benchmarks/ChiBenchmark/main.cpp b/Benchmarks/ChiBenchmark/main.cpp
index 9eca507..96328e9 100644
--- a/Benchmarks/ChiBenchmark/main.cpp
+++ b/Benchmarks/ChiBenchmark/main.cpp
@@ -1,354 +1,354 @@
 /**
  * @file   main.cpp
  * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch)
  * @date   October 2016
  * @brief  Benchmark for gradhalo function
  */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <sys/stat.h>
 #include <unistd.h>
 //
 #include <mm_malloc.h>
 //
 #include <structure_hpc.hpp>
 #include "timer.h"
 #include "gradient.hpp"
 #include "chi_CPU.hpp"
 #include "module_cosmodistances.hpp"
 #include "module_readParameters.hpp"
 
 
 
 #include<omp.h>
 
 //#define __WITH_LENSTOOL 0
 #ifdef __WITH_LENSTOOL
 #warning "linking with libtool..."
 #include <fonction.h>
 #include <constant.h>
 #include <dimension.h>
 #include <structure.h>
 #include <setup.hpp>
 #endif
 
 #ifdef __WITH_LENSTOOL
 struct g_mode   M;
 struct g_pot    P[NPOTFILE];
 struct g_pixel  imFrame, wFrame, ps, PSF;
 struct g_cube   cubeFrame;
 struct g_dyn    Dy;      //   //TV
 
 
 struct g_source S;
 struct g_image  I;
 struct g_grille G;
 struct g_msgrid H;  // multi-scale grid
 struct g_frame  F;
 struct g_large  L;
 struct g_cosmo  C;
 struct g_cline  CL;
 struct g_observ O;
 struct pot      lens[NLMAX];
 struct pot      lmin[NLMAX], lmax[NLMAX], prec[NLMAX];
 struct g_cosmo  clmin, clmax;       /*cosmological limits*/
 struct galaxie  smin[NFMAX], smax[NFMAX];       // limits on source parameters
 struct ipot     ip;
 struct MCarlo   mc;
 struct vfield   vf;
 struct vfield   vfmin,vfmax; // limits on velocity field parameters
 struct cline    cl[NIMAX];
 lensdata *lens_table;
 
 int  block[NLMAX][NPAMAX];      /*switch for the lens optimisation*/
 int  cblock[NPAMAX];                /*switch for the cosmological optimisation*/
 int  sblock[NFMAX][NPAMAX];                /*switch for the source parameters*/
 int  vfblock[NPAMAX];                /*switch for the velocity field parameters*/
 double excu[NLMAX][NPAMAX];
 double excd[NLMAX][NPAMAX];
 /* supplments tableaux de valeurs pour fonctions g pour Einasto
  *  * Ce sont trois variables globales qu'on pourra utiliser dans toutes les fonctions du projet
  *  */
 
 #define CMAX 20
 #define LMAX 80
 
 float Tab1[LMAX][CMAX];
 float Tab2[LMAX][CMAX];
 float Tab3[LMAX][CMAX];
 
 
 int      nrline, ntline, flagr, flagt;
 long int  narclet;
 
 struct point    gimage[NGGMAX][NGGMAX], gsource_global[NGGMAX][NGGMAX];
 struct biline   radial[NMAX], tangent[NMAX];
 struct galaxie  arclet[NAMAX], source[NFMAX], image[NFMAX][NIMAX];
 struct galaxie  cimage[NFMAX];
 struct pointgal     gianti[NPMAX][NIMAX];
 
 struct point    SC;
 double elix;
 double alpha_e;
 
 double *v_xx;
 double *v_yy;
 double **map_p;
 double **tmp_p;
 double **map_axx;
 double **map_ayy;
 #endif
 
 
 void chi_bruteforce_SOA_CPU_grid_gradient(double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images);
 
 
 int module_readCheckInput_readInput(int argc, char *argv[])
 {
 /// check if there is a correct number of arguments, and store the name of the input file in infile
 
     char* infile;
     struct stat  file_stat;
 
     // If we do not have 3 arguments, stop
     if ( argc != 3 )
     {
         fprintf(stderr, "\nUnexpected number of arguments\n");
         fprintf(stderr, "\nUSAGE:\n");
         fprintf(stderr, "lenstool  input_file  output_directorypath  [-n]\n\n");
         exit(-1);
     }
     else if ( argc == 3 )
         infile=argv[1];
 	std::ifstream ifile(infile,std::ifstream::in); // Open the file
 
 		
 	int ts = (int) time (NULL);
 	char buffer[10];
 	std::stringstream ss;
 	ss << ts;
 	std::string trimstamp = ss.str();
 	//
 	std::string outdir = argv[2];
 	outdir += "-";
 	outdir += trimstamp;
 	std::cout << outdir << std::endl;
 
 	// check whether the output directory already exists
 	if (stat(outdir.c_str(), &file_stat) < 0){
 		mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH );
 	}
 	else {
 		printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]);
 		exit(-1);
 	}
 
     // check whether the input file exists. If it could not be opened (ifile = 0), it does not exist
     if(ifile){
 	ifile.close();
     }
     else{
 	printf("The file %s does not exist, please specify a valid file name\n",infile);
 	exit(-1);
         }
 
     return 0;
 }
 
 int main(int argc, char *argv[])
 {
 
 	double wallclock = myseconds();
 	printf("Reading parameter file at time %f s...\n", myseconds() - wallclock);
 	// Setting Up the problem
 	//===========================================================================================================
 
 	// This module function reads the terminal input when calling LENSTOOL and checks that it is correct
 	// Otherwise it exits LENSTOOL
 	module_readCheckInput_readInput(argc, argv);
 
 	// This module function reads the cosmology parameters from the parameter file
 	// Input: struct cosmologicalparameters cosmology, parameter file
 	// Output: Initialized cosmology struct
 	cosmo_param cosmology;  // Cosmology struct to store the cosmology data from the file
 	std::string inputFile = argv[1];   // Input file
 	module_readParameters_readCosmology(inputFile, cosmology);
 
 	// This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file.
 	// The runmode_param stores the information of what exactly the user wants to do with lenstool.
 	struct runmode_param runmode;
 	module_readParameters_readRunmode(inputFile, &runmode);
 
 	module_readParameters_debug_cosmology(runmode.debug, cosmology);
 	module_readParameters_debug_runmode(runmode.debug, runmode);
 
 
 	//=== Declaring variables
 	struct grid_param frame;
 	struct galaxy images[runmode.nimagestot];
 	struct galaxy sources[runmode.nsets];
 	struct Potential lenses[runmode.nhalos+runmode.npotfile-1];
 	struct Potential_SOA lenses_SOA_table[NTYPES];
 	struct Potential_SOA lenses_SOA;
 	struct cline_param cline;
 	struct potfile_param potfile;
 	struct Potential potfilepotentials[runmode.npotfile];
 	struct potentialoptimization host_potentialoptimization[runmode.nhalos];
 	int nImagesSet[runmode.nsets]; // Contains the number of images in each set of imagesnano
 
 	// This module function reads in the potential form and its parameters (e.g. NFW)
 	// Input: input file
 	// Output: Potentials and its parameters
 
 
 	module_readParameters_Potential(inputFile, lenses, runmode.nhalos);
 	//Converts to SOA
 	//module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.Nlens);
 	module_readParameters_PotentialSOA(inputFile, lenses, &lenses_SOA, runmode.nhalos);
 	//module_readParameters_PotentialSOA_nonsorted(inputFile, lenses, &lenses_SOA_nonsorted, runmode.nhalos);
 	module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos);
 	//std::cerr << lenses_SOA[1].b0[0] << " " << lenses[0].b0  << std::endl;
 	// This module function reads in the potfiles parameters
 	// Input: input file
 	// Output: Potentials from potfiles and its parameters
 
 	if (runmode.potfile == 1 ){
 		module_readParameters_readpotfiles_param(inputFile, &potfile,cosmology);
 		module_readParameters_debug_potfileparam(runmode.debug, &potfile);
 		module_readParameters_readpotfiles(&runmode,&potfile,lenses);
 		module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos+runmode.npotfile);
 
 	}
 
 
 
 	// This module function reads in the grid form and its parameters
 	// Input: input file
 	// Output: grid and its parameters
 
 #if 1
 	module_readParameters_Grid(inputFile, &frame);
 
 	if (runmode.image == 1 or runmode.inverse == 1 or runmode.time > 0){
 
 		// This module function reads in the strong lensing images
 		module_readParameters_readImages(&runmode, images, nImagesSet);
 		//runmode.nsets = runmode.nimagestot;
 		for(int i = 0; i < runmode.nimagestot; ++i){
 
 			images[i].dls = module_cosmodistances_objectObject(lenses[0].z, images[i].redshift, cosmology);
 			images[i].dos = module_cosmodistances_observerObject(images[i].redshift, cosmology);
 			images[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, images[i].redshift, cosmology);
 
 		}
 		module_readParameters_debug_image(runmode.debug, images, nImagesSet,runmode.nsets);
 
 	}
 
 	if (runmode.inverse == 1){
 
 		// This module function reads in the potential optimisation limits
 		module_readParameters_limit(inputFile,host_potentialoptimization,runmode.nhalos);
 		module_readParameters_debug_limit(runmode.debug, host_potentialoptimization[0]);
 	}
 
 
 	if (runmode.source == 1)
 	{
 		//Initialisation to default values.(Setting sources to z = 1.5 default value)
 		for(int i = 0; i < runmode.nsets; ++i)
 		{
 			sources[i].redshift = 1.5;
 		}
 		// This module function reads in the strong lensing sources
 		module_readParameters_readSources(&runmode, sources);
 		//Calculating cosmoratios
 		for(int i = 0; i < runmode.nsets; ++i)
 		{
 			sources[i].dls = module_cosmodistances_objectObject(lenses[0].z, sources[i].redshift, cosmology);
 			sources[i].dos = module_cosmodistances_observerObject(sources[i].redshift, cosmology);
 			sources[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, sources[i].redshift, cosmology);
 		}
 		module_readParameters_debug_source(runmode.debug, sources, runmode.nsets);
 	}
 	//
 	std::cout << "--------------------------" << std::endl << std::endl;
 #endif
-	//
+
 	// Lenstool Bruteforce
 	//===========================================================================================================
 #ifdef __WITH_LENSTOOLL
 	{
 		printf("Calling lenstool at time %f s\n", myseconds() - wallclock);
 		setup_lenstool();
 		//
 		double chi2;
 		double lhood0(0);
 		int error(0);
 		double time;
 
 		if ( M.ichi2 != 0 )
 		{
 			int error;
 		//NPRINTF(stdout, "INFO: compute chires.dat\n");
 		readConstraints();
 		o_chires("chires.dat");
 		time = -myseconds();
 		error = o_chi_lhood0(&chi2, &lhood0, NULL);
 		time += myseconds();
 		printf("INFO: chi2 %lf  Lhood %lf\n", chi2, -0.5 * ( chi2 + lhood0 )  );
 		o_global_free();
 	}
 
 	std::cout << "Lenstool 6.8.1 chi Benchmark: ";
 	std::cout << " Chi : " << std::setprecision(15) << chi2 ;
 	std::cout << " Time  " << std::setprecision(15) << time << std::endl;
 	o_global_free();
 	}
 #endif
 
 	// Lenstool-GPU Bruteforce
 	//===========================================================================================================
 #if 0
 	{
 		printf("Calling lenstoolhpc orig at time %f s\n", myseconds() - wallclock);
 		std::cout << "LenstoolHPC dist chi Benchmark:\n ";
 		double chi2;
 		double time;
 		int error;
 		time = -myseconds();
 		mychi_bruteforce_SOA_CPU_grid_gradient_orig(&chi2, &error, &runmode, &lenses_SOA, &frame, nImagesSet, images);
 		time += myseconds();
 
 		std::cout << " Chi : " << std::setprecision(15) << chi2;
 		std::cout << " Time  " << std::setprecision(15) << time << std::endl;
 	}
 #endif
 
 
 
 #if 0
 	{
 		//std::cout << "MylenstoolHPC chi Benchmark:\n "; 
 		printf("Calling lenstoolhpc at time %f s\n", myseconds() - wallclock);
 		double chi2;
 		double time;
 		int error;
 		time = -myseconds();
 		mychi_bruteforce_SOA_CPU_grid_gradient(&chi2, &error, &runmode, &lenses_SOA, &frame, nImagesSet, images);
 		time += myseconds();
 
 		std::cout << " Chi : " << std::setprecision(15) << chi2;
 		std::cout << " Time  " << std::setprecision(15) << time << std::endl;
 	}
 #endif
 
 		printf("Ending execution at time %f s\n", myseconds() - wallclock);
 
 
 }
diff --git a/Benchmarks/GradientBenchmark/main.cpp b/Benchmarks/GradientBenchmark/main.cpp
index 0b1b482..1c9fb35 100644
--- a/Benchmarks/GradientBenchmark/main.cpp
+++ b/Benchmarks/GradientBenchmark/main.cpp
@@ -1,249 +1,250 @@
 /**
  * @file   main.cpp
  * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch)
  * @date   October 2016
  * @brief  Benchmark for gradhalo function
  */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 //
 //
 #include <mm_malloc.h>
 
 //#define __WITH_LENSTOOL 0
 //
 #ifdef __WITH_LENSTOOL
 #warning "linking with libtool..."
 #include<fonction.h>
 #include<constant.h>
 #include<dimension.h>
 #include<structure.h>
 //#include <liblenstool>
 #endif
 //#include "../../../Projects/lenstool-6.8.1/include/structure.h"
 //#include "structure.h"
 #include "structure_hpc.hpp"
 #include "timer.h"
 #include "gradient.hpp"
 #include "gradient_avx.hpp"
 #include "gradient_avx512f.hpp"
 #include "setup.hpp"
 #include <type.h>
 //
 #define NN 100000
+
 //
 //
 //
 #ifdef __WITH_LENSTOOL
 struct g_mode   M;
 struct g_pot    P[NPOTFILE];
 struct g_pixel  imFrame, wFrame, ps, PSF;
 struct g_cube   cubeFrame;
 struct g_dyn    Dy;      //   //TV
 
 
 struct g_source S;
 struct g_image  I;
 struct g_grille G;
 struct g_msgrid H;  // multi-scale grid
 struct g_frame  F;
 struct g_large  L;
 struct g_cosmo  C;
 struct g_cline  CL;
 struct g_observ O;
 struct pot      lens[NLMAX];
 struct pot      lmin[NLMAX], lmax[NLMAX], prec[NLMAX];
 struct g_cosmo  clmin, clmax;       /*cosmological limits*/
 struct galaxie  smin[NFMAX], smax[NFMAX];       // limits on source parameters
 struct ipot     ip;
 struct MCarlo   mc;
 struct vfield   vf;
 struct vfield   vfmin,vfmax; // limits on velocity field parameters
 struct cline    cl[NIMAX];
 lensdata *lens_table;
 
 int  block[NLMAX][NPAMAX];      /*switch for the lens optimisation*/
 int  cblock[NPAMAX];                /*switch for the cosmological optimisation*/
 int  sblock[NFMAX][NPAMAX];                /*switch for the source parameters*/
 int  vfblock[NPAMAX];                /*switch for the velocity field parameters*/
 double excu[NLMAX][NPAMAX];
 double excd[NLMAX][NPAMAX];
 /* supplments tableaux de valeurs pour fonctions g pour Einasto
  *  * Ce sont trois variables globales qu'on pourra utiliser dans toutes les fonctions du projet
  *  */
 
 #define CMAX 20
 #define LMAX 80
 
 float Tab1[LMAX][CMAX];
 float Tab2[LMAX][CMAX];
 float Tab3[LMAX][CMAX];
 
 
 int      nrline, ntline, flagr, flagt;
 long int  narclet;
 
 struct point    gimage[NGGMAX][NGGMAX], gsource_global[NGGMAX][NGGMAX];
 struct biline   radial[NMAX], tangent[NMAX];
 struct galaxie  arclet[NAMAX], source[NFMAX], image[NFMAX][NIMAX];
 struct galaxie  cimage[NFMAX];
 struct pointgal gianti[NPMAX][NIMAX];
 
 struct point    SC;
 double elix;
 double alpha_e;
 
 double *v_xx;
 double *v_yy;
 double **map_p;
 double **tmp_p;
 double **map_axx;
 double **map_ayy;
 #endif
 //struct pot      lens_ref[NLMAX];
 //
 //#include <iitnotify.h>
 int main()
 {
 	double t0, t1, t2, t3;
 	//
 	//Variable creation
 	//
 	point image;
 	//
 	//Initialisation
 	//
         int nlenses;
         type_t x, y;
         type_t sol_grad_x, sol_grad_y;
 	point grad; // store the result
 	//pot* lens;
 #ifdef __WITH_LENSTOOL
 	setup_jauzac_LT(&nlenses, &image.x, &image.y, &sol_grad_x, &sol_grad_y);
 	//
 	struct point grad_lt, Grad;
 	//printf("Number lenses = %d, b0 = %f\n", nlenses, lens[0].b0);
 	//
 	t0 = -myseconds();	
 	for (int ii = 0; ii < NN; ++ii)
 	{
 		grad_lt.x = grad_lt.y = 0.;
 		for (long int jj = 0; jj < nlenses; ++jj)
 		{
 
 			//printf("%f\n", lens[ii].b0);
 			Grad = e_grad_pot(&image, jj);
 			//
 			grad_lt.x += Grad.x;
 			grad_lt.y += Grad.y;
 			//printf("%f %f\n", grad_lt.x, grad_lt.y);
 		}
 	}
 	t0 += myseconds();
 #endif
 	//
 	//
 	//setup_jauzac(Potential** lens, int* nlenses, double* x, double* y, double* sol_grad_x, double* sol_grad_y)
 	//
 	//
 	Potential*   lens_aos;
 	setup_jauzac(&lens_aos, &nlenses, &image.x, &image.y, &sol_grad_x, &sol_grad_y);
 	t1 = -myseconds();
 	for (int ii = 0; ii < NN; ++ii)
 		grad = module_potentialDerivatives_totalGradient(nlenses, &image, lens_aos);
 	t1 += myseconds();
 	//printf("---> grad = %f %f\n", grad.x, grad.y);
 	//
 	// Setting up the AOS potential 
 	//
 	point image_soa;
 	int nlenses_soa;
 	double x_soa, y_soa;
 	type_t sol_grad_x_soa, sol_grad_y_soa;
 	//
 	Potential_SOA   lens_soa;
 	setup_jauzac_SOA(&lens_soa, &nlenses_soa, &image_soa.x, &image_soa.y, &sol_grad_x_soa, &sol_grad_y_soa);
 	//
 	std::cout << "Benchmark for Gradient Calculation using  " << nlenses_soa << " lenses with type " << lens_soa.type[0] << ", image: " << image_soa.x << " " << image_soa.y << std::endl;
 	//
 	// AVX version
 	//
 #if 0
 	point grad_soa_novec;
 	double t21 = -myseconds();
         for (int ii = 0; ii < NN; ++ii)
         {
                 //grad_soa = module_potentialDerivatives_totalGradient_SOA_AVX512(&image_soa, &lens_soa, nlenses_soa);
                 //grad_soa = module_potentialDerivatives_totalGradient_8_SOA_AVX(&image_soa, &lens_soa, nlenses_soa);
                 //
                 //grad_soa_avx = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, 0, nlenses_soa);
                 grad_soa_novec = module_potentialDerivatives_totalGradient_SOA_novec(&image_soa, &lens_soa, nlenses_soa);
                 //grad_soa = module_potentialDerivatives_totalGradient_SOA(&image_soa, &lens_soa, 0, nlenses_soa);
         }
         t21 += myseconds();	
 	std::cout << "1" << std::endl;
 #endif
 	//
 	// no vectorization
 	//
 #if 1
 	point grad_soa; // store the result
 	t2 = -myseconds();
 	for (int ii = 0; ii < NN; ++ii)
 	{
 		//grad_soa = module_potentialDerivatives_totalGradient_SOA_AVX512(&image_soa, &lens_soa, nlenses_soa);
 		//grad_soa = module_potentialDerivatives_totalGradient_8_SOA_AVX(&image_soa, &lens_soa, nlenses_soa);
 		//
 		//grad_soa_avx = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, 0, nlenses_soa);
 		grad_soa = module_potentialDerivatives_totalGradient_SOA(&image_soa, &lens_soa, nlenses_soa);
 		//grad_soa = module_potentialDerivatives_totalGradient_SOA(&image_soa, &lens_soa, 0, nlenses_soa);
 	}	
 	t2 += myseconds();
 #endif
 	//__SSC_MARK(0x222);
 	//
 	// autovectorized version
 	//
 	point grad_soa_avx;
 	t3 = -myseconds();
 	for (int ii = 0; ii < NN; ++ii)
 	{
 		//grad_soa_avx = module_potentialDerivatives_totalGradient_SOA_AVX512(&image_soa, &lens_soa, nlenses_soa);
 		//                grad_soa = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, nlenses_soa);
 		//                                //grad_soa = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, nlenses_soa);
 		//grad_soa = module_potentialDerivatives_totalGradient_81_SOA(&image_soa, &lens_soa, 0, nlenses_soa);
 		//grad_soa = module_potentialDerivatives_totalGradient_8_SOA(&image_soa, &lens_soa, 0, nlenses_soa);
 #ifdef _double
 		grad_soa_avx = module_potentialDerivatives_totalGradient_SOA_AVX(&image_soa, &lens_soa, nlenses_soa);
 #endif
 	}
 	t3 += myseconds();
 	//
 #ifdef _double
 	std::cout << " Double calculation   = "  << std::endl;
 #else
 	std::cout << " Float  calculation   = "  << std::endl;
 #endif
 
 	std::cout << " ref sol   = " << std::setprecision(15) << sol_grad_x << " " << std::setprecision(15) << sol_grad_y << std::endl;
 	//
 #ifdef __WITH_LENSTOOL
 	std::cout << " Lenstool sol   = " << std::setprecision(15) << grad_lt.x << " " << std::setprecision(15) << grad_lt.y << ", time = " << t0 << " s." << std::endl;
 #endif
 	//
 	std::cout << " grad           = " << std::setprecision(15) << grad.x << " " << std::setprecision(15) << grad.y << ", time = " << t1 << " s., speedup = " << (double) t0/t1 << std::endl;
 	//
 	//std::cout << " grad novec     = " << std::setprecision(15) << grad.x << " " << std::setprecision(15) << grad.y << ", time = " << t21 << " s., speedup = " << (double) t0/t21 << std::endl;
 	//
 	std::cout << " grad SIMD      = " << std::setprecision(15) << grad_soa.x << " " << std::setprecision(15) << grad_soa.y << ", time = " << t2 << " s., speedup = " << (double) t0/t2 << std::endl;
 	//
 	std::cout << " grad handcoded = " << std::setprecision(15) << grad_soa_avx.x << " " << std::setprecision(15) << grad_soa_avx.y << ", time = " << t3 << " s. speedup = " << (double) t0/t3 << std::endl;
 	//
 }
 
diff --git a/Benchmarks/GradientBenchmark/setup.cpp b/Benchmarks/GradientBenchmark/setup.cpp
index 2aadf99..0c1c37a 100644
--- a/Benchmarks/GradientBenchmark/setup.cpp
+++ b/Benchmarks/GradientBenchmark/setup.cpp
@@ -1,294 +1,295 @@
 #include <iostream>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <immintrin.h>
 
 //#include "simd_math.h"
 
 //#define __WITH_LENSTOOL
 //#include "structure.h"
 #include "structure_hpc.hpp"
 #include <type.h>
 //#ifdef __WITH_LENSTOOL
 //#endif
 //#include "setup.hpp"
 
 extern int num_lenses = 95;
 //
 extern type_t x_c = -44.094528948812552;
 extern type_t y_c = -36.928385887965803;
 //
 //extern int lense_type[]  = {81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81};
 extern int lense_type[]  = {8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
 //extern int lense_type[]  = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
+
 //
 extern type_t pos_x[] = {0.000000000000000, 35.000000000000000, 65.000000000000000, 130.000000000000000, -19.399999999999999, 11.025931099982012, -2.671581400009952, 8.706480300003618, -46.071898399997828, -10.335049000010734, -25.638072299983587, -21.186195800000924, 47.770210000017265, 20.889859399994865, -9.947619900012000, 28.559020299985104, -11.757235600015839, 44.290179900007189, 49.190403899989448, 36.260348400000979, -47.958373599980469, 42.325706400014852, 37.455656899984845, 35.637785899984095, 48.099282799982589, 35.578290900001157, -3.050470599984273, 3.330580999983544, 43.051886999993762, 12.070367100002324, -0.012525300009348, 0.389991100006194, 18.864752299989611, 3.278487299995037, 10.076003800004774, 11.586436500008050, 37.425482399992134, 0.486777199988778, 25.632663699981624, 64.980204499991984, 18.813228000016544, 3.462950200001255, 28.896917700002792, -21.734176000018063, -0.928577399996435, -0.731873800008366, 78.370563400014419, 39.869047099991327, 10.636794000018757, 81.825827699993738, 11.692331899990036, 65.849002200017992, 12.142102700007115, 7.184091700006435, 92.001179599988561, 9.130061900012469, 1.379771499990539, 1.090836400019363, 63.261966300009220, -26.291947899982713, 64.747063800006757, 65.182885999997012, 69.098453600009847, 63.422517399987171, 83.228941699997549, 62.814757499999601, 58.726967500001948, 109.445168699990987, 65.110011700002232, 17.873548699988238, 108.654938499989214, 4.474934500009342, 3.973924000006243, 88.196061999997639, 61.916070000008830, 87.056832499986754, 108.613092799990099, 61.803057900019461, 79.079379299996546, 75.673077399983285, 129.107267799989302, 132.913239399982729, 124.503095399984701, 143.243164500019446, 128.260958600014533, 138.446558499998758, 130.452594800015163, 134.181991500016807, 131.942247000012628, 130.527746399989439, 132.704864599988241, 118.334119300008808, 147.068493300001194, 150.936237199997493, 142.130408800003835};
 
 extern type_t pos_y[] = {0.000000000000000, -10.000000000000000, 40.000000000000000, 80.000000000000000, -21.699999999999999, -59.119920000003390, -58.477319999988708, -50.961599999999407, -45.297360000006393, -42.065999999999804, -46.933559999993690, -46.580759999989141, -37.085399999989477, -38.415960000011751, -38.951640000004772, -33.509519999998361, -35.354879999999866, -22.304880000001504, -22.674599999999145, -9.754919999988942, -22.953600000010965, -21.260159999994244, -20.612880000001610, -18.947520000006079, -7.068240000003811, -1.161000000010404, -1.597679999997581, -1.477079999995112, 5.219999999994229, -3.917879999997353, 0.001799999995455, -2.880720000001702, -0.453960000010056, 2.791800000011335, 1.789919999993117, 0.972720000009986, 2.251080000010575, 20.615760000009686, 20.495160000007218, 19.365120000000502, 16.301160000006121, 17.766359999990300, 18.265679999993267, 21.366720000011696, 32.603399999999283, 21.648600000006013, 23.630759999994666, 22.818239999989487, 21.551399999995624, 22.820040000010522, 25.769519999997215, 33.740639999987820, 25.422120000004611, 23.979960000008305, 33.112080000009314, 33.039720000007833, 32.423399999993308, 33.522119999992128, 33.943680000007248, 39.039480000008098, 33.088679999991655, 36.841680000006249, 40.110120000011307, 36.187200000006214, 43.166519999994080, 37.808640000000082, 41.374079999997093, 42.308279999991782, 39.722400000007951, 43.650719999999410, 45.423719999999435, 47.048760000009793, 48.891600000007429, 45.961559999992119, 53.340119999990065, 47.024999999987926, 46.107720000011909, 52.768080000006989, 58.089600000010932, 62.472239999991075, 77.337719999999877, 72.065520000009542, 74.942999999998960, 76.529879999998229, 77.375160000002552, 85.891679999997450, 83.807999999999083, 87.973920000004568, 87.793919999998593, 95.139359999993189, 95.914439999995693, 101.069640000000049, 101.447279999987927, 107.222760000001927, 106.741440000004673};
 
 extern type_t theta[] = {1.047197551196598, 1.047197551196598, 1.047197551196598, 1.047197551196598, -0.698131700797732, -1.408480706359424, 0.359537825910832, 0.027925268031909, -0.991347015132779, -0.055850536063819, -0.991347015132779, 0.036651914291881, 0.687659725285766, 0.561996019142174, 0.907571211037051, 0.054105206811824, 0.202458193231342, -1.343903524035634, 0.420624349730633, 0.246091424531200, 0.150098315671512, -0.521853446346304, -0.040142572795870, 0.980875039620813, -0.884881930761125, -1.509709802975095, 0.666715774261834, -0.514872129338327, 1.174606586592184, -0.282743338823081, 1.457349925415265, 0.260054058547155, -1.164134611080218, -0.654498469497874, 1.038470904936626, -0.453785605518526, -0.886627260013119, -0.003490658503989, 1.549852375770965, 0.134390352403563, 0.293215314335047, -1.453859266911276, 0.109955742875643, 0.823795406941324, -0.349065850398866, 1.096066770252439, -0.527089434102287, -0.766199541625511, -1.186823891356144, 1.368338133563554, 0.748746249105567, 0.066322511575785, -0.651007810993885, -1.052433538952581, 0.256563400043166, 1.321214243759707, 1.110029404268394, 0.794124809657420, 0.134390352403563, -1.132718684544320, -0.961676417848876, -1.078613477732496, -1.530653753999027, -0.191986217719376, -0.127409035395586, -0.366519142918809, -0.815068760681352, -1.469567230179226, 0.026179938779915, 1.448623279155294, -0.146607657167524, 1.118756050528365, 0.837758040957278, -0.422369678982628, 1.024508270920671, -0.464257581030492, 0.226892802759263, 1.364847475059566, 0.249582083035189, -1.115265392024376, 1.418952681871390, -0.031415926535898, -1.127482696788337, 1.083849465488479, -0.047123889803847, -0.855211333477221, -0.867428638241182, 0.841248699461267, 0.289724655831059, 1.075122819228507, 0.099483767363677, -1.197295866868110, -0.534070751110265, 0.481710873550435, -0.174532925199433};
 //
 extern type_t  epot[] =  {0.075426688904937, 0.075426688904937, 0.075426688904937, 0.075426688904937, 0.116562483442831, 0.344322344322344,0.075409836065574, 0.034285714285715, 0.158045977011495, 0.064297800338409, 0.180914512922465, 0.085616438356165, 0.449781659388646, 0.198529411764706, 0.179611650485437, 0.160975609756097, 0.492772667542707, 0.216589861751152, 0.031496062992126, 0.043407043407043, 0.376311844077961, 0.157622739018088, 0.396428571428571, 0.373650107991361, 0.241081081081081, 0.364293085655315, 0.528824833702883, 0.317901234567901, 0.338345864661654, 0.129963898916967, 0.175792507204611, 0.151898734177215, 0.059829059829059, 0.208333333333333, 0.340163934426229, 0.590493601462523, 0.039062500000001, 0.015317286652079, 0.126005361930295, 0.123473541383989, 0.308584686774942, 0.200913242009132, 0.035587188612100, 0.096280087527352, 0.167197452229299, 0.109909909909910, 0.104109589041096, 0.051409618573797, 0.087431693989071, 0.176470588235294, 0.097178683385580, 0.045769764216366, 0.074235807860262, 0.131944444444444, 0.101449275362319, 0.285714285714286, 0.367619047619048, 0.239263803680982, 0.132132132132132, 0.318295739348371, 0.078817733990148, 0.108695652173913, 0.045143638850889, 0.381165919282511, 0.099236641221374,0.240437158469945, 0.334513274336283, 0.103594080338266, 0.186246418338109, 0.328918322295806, 0.098844672657253, 0.024911032028470, 0.102719033232628, 0.462857142857143, 0.039920159680638, 0.403565640194490, 0.023391812865496, 0.315315315315315, 0.274418604651163, 0.161987041036717, 0.137254901960784, 0.044673539518900, 0.057902973395931, 0.160283687943262, 0.122340425531915, 0.520395550061805, 0.021645021645023, 0.094555873925501, 0.142045454545455, 0.145907473309608, 0.194915254237288, 0.282904689863843, 0.450331125827815, 0.135245901639344, 0.340425531914894};
 //
 extern type_t rcore[] = {10.000000000000000, 5.000000000000000, 10.000000000000000, 10.000000000000000, 1.000000000000000, 0.034052339117993, 0.030349179184560, 0.046574365790645, 0.038383710030628, 0.068256961228370, 0.020804005013518, 0.028193367328818, 0.055481341027049, 0.039097316478764, 0.031633478431847, 0.039641214329569, 0.048322227956496, 0.041128886304460, 0.028585576136945, 0.080565099864725, 0.039097316478764, 0.027425019798032, 0.034367421532497, 0.025594517675956, 0.049905332073874, 0.073815395284787, 0.052984269343359, 0.048322227956496,0.067320440611066, 0.044683475951999, 0.067320440611066, 0.022811137889776, 0.036320092822069, 0.039277781492078, 0.037857065785625, 0.021288592541707, 0.024668738981743, 0.069206510145232, 0.053474526751449, 0.048322227956496, 0.025359865727459, 0.029117021675657, 0.031199450836488, 0.051067776588639, 0.099116726380952, 0.029386438160114, 0.050833141468429, 0.034367421532497, 0.021784467563449, 0.027934888983489, 0.044273815382088, 0.052498506642202, 0.035821762466628, 0.013432193817836, 0.052257297914513, 0.037337647370553, 0.023342477548558, 0.043266020305102, 0.039641214329569, 0.061965070730691, 0.021093417552129, 0.028717521161380, 0.051067776588639, 0.032669835654398, 0.052498506642202, 0.017707080013958, 0.037509990120305, 0.030349179184560, 0.013936282772780, 0.028454237344829, 0.057829168758607, 0.036825355643309, 0.040192678553178, 0.029522079795690, 0.056512816167846, 0.034685419368686, 0.023886193697910, 0.035821762466628, 0.038738870126742, 0.034685419368686, 0.083588580432868, 0.032221588704875, 0.042672388251826, 0.052017197440259, 0.033124318340140, 0.048100207486392, 0.026677611870563, 0.023996447358569, 0.016000993535772, 0.057563467863253, 0.034526054342700, 0.045935341535597, 0.039459079494046, 0.031488135800734, 0.039277781492078};
 //
 extern type_t rcut[] = {500.000000000000000, 500.000000000000000, 500.000000000000000, 500.000000000000000, 150.000000000000000, 5.107850867699009, 4.552376877684008, 6.986154868596703, 5.757556504594200, 10.238544184255536, 3.120600752027715, 4.229005099322641, 8.322201154057367, 5.864597471814614, 4.745021764777110, 5.946182149435385, 7.248334193474441, 6.169332945669035, 4.287836420541705, 12.084764979708812, 5.864597471814614, 4.113752969704730, 5.155113229874521, 3.839177651393335, 7.485799811081068, 11.072309292717980, 7.947640401503818, 7.248334193474441, 10.098066091659968, 6.702521392799788, 10.098066091659968, 3.421670683466420, 5.448013923310371, 5.891667223811720, 5.678559867843796, 3.193288881256013, 3.700310847261404, 10.380976521784758, 8.021179012717386, 7.248334193474441, 3.803979859118900, 4.367553251348570, 4.679917625473273, 7.660166488295854, 14.867508957142872, 4.407965724017149, 7.624971220264309, 5.155113229874521, 3.267670134517328, 4.190233347523285, 6.641072307313197, 7.874775996330366, 5.373264369994223, 2.014829072675421, 7.838594687177010, 5.600647105583010, 3.501371632283681, 6.489903045765340, 5.946182149435385, 9.294760609603639, 3.164012632819346, 4.307628174206992, 7.660166488295854, 4.900475348159637, 7.874775996330366, 2.656062002093733, 5.626498518045686, 4.552376877684008, 2.090442415917007, 4.268135601724389, 8.674375313791064, 5.523803346496345, 6.028901782976731, 4.428311969353570, 8.476922425176864, 5.202812905302965, 3.582929054686518, 5.373264369994223, 5.810830519011283, 5.202812905302965, 12.538287064930271, 4.833238305731319, 6.400858237773873, 7.802579616038900, 4.968647751020976, 7.215031122958762, 4.001641780584524, 3.599467103785322, 2.400149030365801, 8.634520179487971, 5.178908151405005, 6.890301230339541, 5.918861924106924, 4.723220370110043, 5.891667223811720};
 //
 type_t b0[] = {27.686284800000006, 15.573535200000000, 21.197311800000001, 43.259820000000005, 1.401618168000000, 0.701887044377384, 0.625557486765266, 0.959991142907221, 0.791165290944736, 1.406912946824478, 0.428812292146906, 0.581121878203467, 1.143581776765294, 0.805874151883638, 0.652029471542907, 0.817084978065710, 0.996018089699853, 0.847748882881212, 0.589206088811179, 1.660608383702515, 0.805874151883638, 0.565284693698478, 0.708381525237709, 0.527554371568815, 1.028649042482203, 1.521483427216619, 1.092112118320076, 0.996018089699853, 1.387609377521935, 0.921016108754891, 1.387609377521935, 0.470183328577265, 0.748629999074688, 0.809593897959512, 0.780310096202241, 0.438800613557777, 0.508472214857728, 1.426485055525682, 1.102217307333858, 0.996018089699853, 0.522717724018210, 0.600160247852116, 0.643083291809730, 1.052609356688346, 2.042994635018428, 0.605713462253246, 1.047773056002485, 0.708381525237709, 0.449021592862168, 0.575794120800079, 0.912572182315177, 1.082099574236042, 0.738358410415477, 0.276864469886694, 1.077127778308336, 0.769603840315186, 0.481135305220354, 0.891799503367263, 0.817084978065710, 1.277224457300750, 0.434777665351592, 0.591925740547885, 1.052609356688346, 0.673390873628514, 1.082099574236042, 0.364978453094982, 0.773156170239529, 0.625557486765266, 0.287254754837836, 0.586498932739152, 1.191975217859029, 0.759044479765867, 0.828451762375228, 0.608509308563808, 1.164842549348754, 0.714936098789562, 0.492342386170850, 0.738358410415477, 0.798485853249674, 0.714936098789562, 1.722928385636979, 0.664151604471083, 0.879563555467234, 1.072178825707933, 0.682758673823120, 0.991441802267353, 0.549879116438807, 0.494614935370580, 0.329812587059260, 1.186498589205643, 0.711651265795296, 0.946819570637053, 0.813330813603161, 0.649033668246664, 0.80959389795951};
 //
 extern type_t   grad_x = -77.124413458712169;
 extern type_t   grad_y = -46.092652444451353;
 
 //
 //
 //
 
 /**** usefull functions for PIEMD profile : see old lenstool ****/
 
 /**  I*w,v=0.5 Kassiola & Kovner, 1993 PIEMD, paragraph 4.1
  *
  * Global variables used :
  * - none
  */
 
 /** @brief This module function calculates profile depended information like the impactparameter b0 and the potential ellipticity epot
  * 
  * @param lens: mass distribution for which to calculate parameters
  */
 void module_readParameters_calculatePotentialparameter(Potential *lens)
 {
         //std::cout << "module_readParameters_calculatePotentialparameter..." << std::endl;
         switch (lens->type)
         {
 
                 case(5): /*Elliptical Isothermal Sphere*/
                         //impact parameter b0
                         lens->b0 = 4* pi_c2 * lens->vdisp * lens->vdisp ;
                         //ellipticity_potential 
                         lens->ellipticity_potential = lens->ellipticity/3 ;
                         break;
 
                 case(8): /* PIEMD */
                         //impact parameter b0
                         lens->b0 = 6.*pi_c2 * lens->vdisp * lens->vdisp;
                         //ellipticity_parameter
                         if ( lens->ellipticity == 0. && lens->ellipticity_potential != 0. ){
                                 // emass is (a2-b2)/(a2+b2)
                                 lens->ellipticity = 2.*lens->ellipticity_potential / (1. + lens->ellipticity_potential * lens->ellipticity_potential);
                                 //printf("1 : %f %f \n",lens->ellipticity,lens->ellipticity_potential);
                         }
                         else if ( lens->ellipticity == 0. && lens->ellipticity_potential == 0. ){
                                 lens->ellipticity_potential = 0.00001;
                                 //printf("2 : %f %f \n",lens->ellipticity,lens->ellipticity_potential);
                         }
                         else{
                                 // epot is (a-b)/(a+b)
                                 lens->ellipticity_potential = (1. - sqrt(1 - lens->ellipticity * lens->ellipticity)) / lens->ellipticity;
                                 //printf("3 : %f %f \n",lens->ellipticity,lens->ellipticity_potential);
                         }
                         break;
 
                 default:
                         std::cout << "ERROR: LENSPARA profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl;
                         //printf( "ERROR: LENSPARA profil type of clump %s unknown : %d\n",lens->name, lens->type);
                         break;
         };
 }
 //
 //
 //
 void module_readParameters_calculatePotentialparameter_SOA(Potential_SOA *lens, int ii)
 {
         //std::cout << "module_readParameters_calculatePotentialparameter..." << std::endl;
         switch (lens->type[ii])
         {
 
                 case(5): /*Elliptical Isothermal Sphere*/
                         //impact parameter b0
                         lens->b0[ii] = 4* pi_c2 * lens->vdisp[ii] * lens->vdisp[ii] ;
                         //ellipticity_potential 
                         lens->ellipticity_potential[ii] = lens->ellipticity[ii]/3 ;
                         break;
 
                 case(8): /* PIEMD */
                         //impact parameter b0
                         lens->b0[ii] = 6.*pi_c2 * lens->vdisp[ii] * lens->vdisp[ii];
                         //ellipticity_parameter
                         if ( lens->ellipticity[ii] == 0. && lens->ellipticity_potential[ii] != 0. ){
                                 // emass is (a2-b2)/(a2+b2)
                                 lens->ellipticity[ii] = 2.*lens->ellipticity_potential[ii] / (1. + lens->ellipticity_potential[ii] * lens->ellipticity_potential[ii]);
                                 //printf("1 : %f %f \n",lens->ellipticity[ii],lens->ellipticity_potential[ii]);
                         }
                         else if ( lens->ellipticity[ii] == 0. && lens->ellipticity_potential[ii] == 0. ){
                                 lens->ellipticity_potential[ii] = 0.00001;
                                 //printf("2 : %f %f \n",lens->ellipticity[ii],lens->ellipticity_potential[ii]);
                         }
                         else{
                                 // epot is (a-b)/(a+b)
                                 lens->ellipticity_potential[ii] = (1. - sqrt(1 - lens->ellipticity[ii] * lens->ellipticity[ii])) / lens->ellipticity[ii];
                                 //printf("3 : %f %f \n",lens->ellipticity[ii],lens->ellipticity_potential[ii]);
                         }
                         break;
 
                 default:
                         std::cout << "ERROR: LENSPARA profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl;
                         //printf( "ERROR: LENSPARA profil type of clump %s unknown : %d\n",lens->name, lens->type);
                         break;
         };
 
 }
 //
 //
 //
 void 
 setup_jauzac_SOA(Potential_SOA *lens_soa, int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y)
 {
 	*nlenses    = num_lenses;
 	*sol_grad_x = grad_x; 
 	*sol_grad_y = grad_y; 
 	*x	    = x_c;
 	*y	    = y_c;
 	//
         //Potential_SOA lens_soa
         //
         for (int i = 0; i < *nlenses; ++i)
         {
                 lens_soa->type       = (int*) malloc(*nlenses*sizeof(int));
                 lens_soa->vdisp      = (type_t*) malloc(*nlenses*sizeof(type_t));
                 //
                 lens_soa->position_x = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->position_y = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->weight = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->b0 = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->vdisp = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->ellipticity_angle = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->anglecos = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->anglesin = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->ellipticity = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->ellipticity_potential = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->rcore = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->rcut = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->rscale = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->exponent = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->alpha = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->einasto_kappacritic = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->z = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->mag = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->lum = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->theta = (type_t*) malloc(*nlenses*sizeof(type_t));
                 lens_soa->sigma = (type_t*) malloc(*nlenses*sizeof(type_t));
         }
         //
         for (int i = 0; i < *nlenses; ++i)
         {
                 //ilens = &lens[i];
                 //
                 lens_soa->vdisp[i]                 = 1.;
                 lens_soa->position_x[i]            = pos_x[i];
                 lens_soa->position_y[i]            = pos_y[i];
                 lens_soa->type[i]                  = lense_type[i];
                 lens_soa->ellipticity[i]           = 0.11;
                 lens_soa->ellipticity_potential[i] = epot[i];
                 lens_soa->ellipticity_angle[i]     = theta[i];
 		lens_soa->anglecos[i]		   = cos(theta[i]);
 		lens_soa->anglesin[i]		   = sin(theta[i]);
                 lens_soa->rcut[i]                  = rcut[i];
                 lens_soa->rcore[i]                 = rcore[i];
                 lens_soa->b0[i]                    = b0[i];
                 lens_soa->weight[i]                = 0;
                 lens_soa->rscale[i]                = 0;
                 lens_soa->exponent[i]              = 0;
                 lens_soa->alpha[i]                 = 0.;
                 lens_soa->einasto_kappacritic[i]   = 0;
                 lens_soa->z[i]                     = 0.4;
         }
 }
 
 void
 setup_jauzac(Potential** lens, int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y)
 {
         *nlenses    = num_lenses;
         *sol_grad_x = grad_x;
         *sol_grad_y = grad_y;
         *x          = x_c;
         *y          = y_c;
         //
        	*lens = (Potential*) malloc(sizeof(Potential)*(*nlenses)); 
         //
         for (int i = 0; i < *nlenses; ++i)
         {
                 //ilens = &lens[i];
                 //
 		(*lens)[i].position.x            = pos_x[i];
                 (*lens)[i].position.y            = pos_y[i];
 		//
                 (*lens)[i].vdisp                 = 1.;
                 (*lens)[i].type                  = lense_type[i];
                 (*lens)[i].ellipticity           = 0.11;
                 (*lens)[i].ellipticity_potential = epot[i];
                 (*lens)[i].ellipticity_angle     = theta[i];
                 (*lens)[i].rcut                  = rcut[i];
                 (*lens)[i].rcore                 = rcore[i];
                 (*lens)[i].b0                    = b0[i];
                 (*lens)[i].weight                = 0;
                 (*lens)[i].rscale                = 0;
                 (*lens)[i].exponent              = 0;
                 (*lens)[i].alpha                 = 0.;
                 (*lens)[i].einasto_kappacritic   = 0;
                 (*lens)[i].z                     = 0.4;
         }
 	std::cout << "Setup done..." << std::endl;
 }
 
 #ifdef __WITH_LENSTOOL
 extern struct pot lens[NLMAX];
 void
 //setup_jauzac_LT(struct pot** lens, int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y)
 setup_jauzac_LT( int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y)
 {
         *nlenses    = num_lenses;
         *sol_grad_x = grad_x;
         *sol_grad_y = grad_y;
         *x          = x_c;
         *y          = y_c;
         //
         //*lens = (struct pot*) malloc(sizeof(struct pot)*(*nlenses));
         //
         for (int i = 0; i < *nlenses; ++i)
         {
                 //
                 lens[i].C.x            = pos_x[i];
                 lens[i].C.y            = pos_y[i];
                 //
                 lens[i].sigma                 = 1.; 			// disp
 		if (lense_type[i] == 5)
 			lens[i].type                  = 1;
 		else
 			lens[i].type                  = lense_type[i];
                 lens[i].emass                 = 0.11;
                 lens[i].epot                  = epot[i];
                 lens[i].theta                 = theta[i];
                 lens[i].rcut                  = rcut[i];
                 lens[i].rc                    = rcore[i];
                 lens[i].b0                    = b0[i];
                 lens[i].masse                 = 0;			// weight
                 //lens[i].rc  	                 = 0;			// rscale
 //               (&lens)[i].exponent              = 0;
                 lens[i].alpha                 = 0.;
 //                (&lens)[i].einasto_kappacritic   = 0;
                 lens[i].z                     = 0.4;
         }
         std::cout << "Setup done..." << std::endl;
 }
 #endif
 
 
 
 
 
 
 
 
diff --git a/src/gradient.cpp b/src/gradient.cpp
index cfccd6b..5cd12be 100644
--- a/src/gradient.cpp
+++ b/src/gradient.cpp
@@ -1,967 +1,965 @@
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <immintrin.h>
 #include <map>
 /*
 #ifdef __AVX__
 #include "simd_math_avx.h"
 #endif
 #ifdef __AVX512F__
 #include "simd_math_avx512f.h"
 #endif
 */
 #include "structure_hpc.hpp"
 #include "gradient.hpp"
 #include "utils.hpp"
 //#include "iacaMarks.h"
 //
 //
 //
 /**@brief Return the gradient of the projected lens potential for one clump
  * !!! You have to multiply by dlsds to obtain the true gradient
  * for the expressions, see the papers : JP Kneib & P Natarajan, Cluster Lenses, The Astronomy and Astrophysics Review (2011) for 1 and 2
  * and JP Kneib PhD (1993) for 3
  * 
  * @param pImage 	point where the result is computed in the lens plane
  * @param lens		mass distribution
  */
 
 
 //
 /// Useful functions
 //
 complex
 piemd_1derivatives_ci05(type_t x, type_t y, type_t eps, type_t rc)
 {
         type_t  sqe, cx1, cxro, cyro, rem2;
         complex zci, znum, zden, zis, zres;
         type_t norm;
         //
         //std::cout << "piemd_lderivatives" << std::endl;
         sqe  = sqrt(eps);
         cx1  = (1. - eps) / (1. + eps);
         cxro = (1. + eps) * (1. + eps);
         cyro = (1. - eps) * (1. - eps);
         //
         rem2 = x * x / cxro + y * y / cyro;
         /*zci=cpx(0.,-0.5*(1.-eps*eps)/sqe);
           znum=cpx(cx1*x,(2.*sqe*sqrt(rc*rc+rem2)-y/cx1));
           zden=cpx(x,(2.*rc*sqe-y));
           zis=pcpx(zci,lncpx(dcpx(znum,zden)));
           zres=pcpxflt(zis,b0);*/
 
         // --> optimized code
         zci.re  = 0;
         zci.im  = -0.5*(1. - eps*eps)/sqe;
         znum.re = cx1 * x;
         znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
         zden.re = x;
         zden.im = 2.*rc * sqe - y;
         norm = zden.re * zden.re + zden.im * zden.im;     // zis = znum/zden
         zis.re = (znum.re * zden.re + znum.im * zden.im) / norm;
         zis.im = (znum.im * zden.re - znum.re * zden.im) / norm;
         norm = zis.re;
         zis.re = log(sqrt(norm * norm + zis.im * zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
         zis.im = atan2(zis.im, norm);
         //  norm = zis.re;
         zres.re = zci.re * zis.re - zci.im * zis.im;   // Re( zci*ln(zis) )
         zres.im = zci.im * zis.re + zis.im * zci.re;   // Im( zci*ln(zis) )
         //
         //zres.re = zis.re*b0;
         //zres.im = zis.im*b0;
         //
         return(zres);
 }
 //
 //// changes the coordinates of point P into a new basis (rotation of angle theta)
 ////      y'      y    x'
 ////       *      |   /
 ////         *    |  /  theta
 ////           *  | /
 ////             *|--------->x
 /*
 inline
 struct point rotateCoordinateSystem(struct point P, double theta)
 {
 	struct  point   Q;
 
 	Q.x = P.x*cos(theta) + P.y*sin(theta);
 	Q.y = P.y*cos(theta) - P.x*sin(theta);
 
 	return(Q);
 }
 */
 //
 //
 struct point
 grad_halo(const struct point *pImage, const struct Potential *lens)
 {
 	struct point true_coord, true_coord_rot, result;
 	type_t X, Y, R, angular_deviation, u;
 	complex zis;
 	//
 	result.x = result.y = 0.;
 	//
 	/*positionning at the potential center*/
 	// Change the origin of the coordinate system to the center of the clump
 	true_coord.x   = pImage->x - lens->position.x;  
 	true_coord.y   = pImage->y - lens->position.y;
 	//printf("x, y = %f, %f\n", lens->position.x, lens->position.y);
 	//
 	true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle);
 	//
 	switch (lens->type)
 	{
 		case(5): /*Elliptical Isothermal Sphere*/
 			/*rotation of the coordiante axes to match the potential axes*/
 			//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle);
 			//
 			R=sqrt(true_coord_rot.x*true_coord_rot.x*(1 - lens->ellipticity_potential) + true_coord_rot.y*true_coord_rot.y*(1 + lens->ellipticity_potential));	//ellippot = ellipmass/3
 			result.x = (1 - lens->ellipticity_potential)*lens->b0*true_coord_rot.x/(R);
 			result.y = (1 + lens->ellipticity_potential)*lens->b0*true_coord_rot.y/(R);
 			break;
 		case(8): /* PIEMD */
 			/*rotation of the coordiante axes to match the potential axes*/
 			/*Doing something....*/
 			complex zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore);
 			//
 			result.x = lens->b0*zis.re;
 			result.y = lens->b0*zis.im;
 			break;
 		case(81): //PIEMD Kassiola & Kovner,1993 I0.5c-I0.5cut
 			type_t t05;
 			if ( lens->ellipticity_potential > 2E-4 )
 			{
 				//printf("1 ");
 				t05 = lens->b0*lens->rcut/(lens->rcut - lens->rcore);
 				complex zis     = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore);
 				complex zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcut);
 				result.x = t05 * (zis.re - zis_cut.re);
 				result.y = t05 * (zis.im - zis_cut.im);
 				//printf("	g   = %f %f\n", result.x, result.y);
 			}
 			else if (( u = true_coord_rot.x*true_coord_rot.x + true_coord_rot.y*true_coord_rot.y) > 0. )
 			{
 				//printf("2 ");
 				// Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05
 				X = lens->rcore;
 				Y = lens->rcut;
 				t05  = sqrt(u + X * X) - X - sqrt(u + Y * Y) + Y;  // Faster and equiv to Elliasdottir (see Golse PhD)
 				t05 *= lens->b0 * Y / (Y - X) / u; // 1/u because t05/sqrt(u) and normalised Q/sqrt(u)
 				result.x = t05*true_coord_rot.x;
 				result.y = t05*true_coord_rot.y;
 			}
 			else
 			{
 				//printf("3 ");
 				result.x = 0.;
 				result.y = 0.;
 			}
 			break;
 
 
 		default:
 			std::cout << "ERROR: Grad 1 profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl;
 			break;
 	};
 	result = rotateCoordinateSystem(result, -lens->ellipticity_angle); 
 	//printf("	rot grad = %f %f\n", result.x, result.y); 
 	return result;
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient(const int nhalos, const struct point* pImage, const struct Potential* lens)
 {
 	struct point grad, clumpgrad;
 	//
 	grad.x = 0;
 	grad.y = 0;
 	//
 	for(int i = 0; i < nhalos; i++)
 	{
 		clumpgrad = grad_halo(pImage, &lens[i]);  //compute gradient for each clump separately
 		//std::cout << clumpgrad.x << " " << clumpgrad.y << std::endl;
 		//nan check
 		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
 		{ 
 			// add the gradients
 			grad.x += clumpgrad.x;
 			grad.y += clumpgrad.y;
 		}  
 	}
 	//	
 	return(grad);
 }
 //
 // SOA versions, vectorizable
 //
 struct point module_potentialDerivatives_totalGradient_5_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA begins\n");
 	//
 	struct point grad, result;
         grad.x = 0;
         grad.y = 0;
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//
 		struct point true_coord, true_coord_rotation;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//
 		true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		type_t ell_pot = lens->ellipticity_potential[i];
                 //
 		type_t R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - ell_pot) + true_coord_rotation.y*true_coord_rotation.y*(1 + ell_pot));
 		//
 		result.x = (1 - ell_pot)*lens->b0[i]*true_coord_rotation.x/R;
 		result.y = (1 + ell_pot)*lens->b0[i]*true_coord_rotation.y/R;
 		//
 		result = rotateCoordinateSystem(result, -lens->ellipticity_angle[i]);
 		//
 		grad.x += result.x;
 		grad.y += result.y;
 		
 	}
 	return grad;
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_5_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
         //
         struct point grad, result;
         grad.x = 0;
         grad.y = 0;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
-		//
 			struct point true_coord, true_coord_rotation;
 			//
 			true_coord.x = pImage->x - lens->position_x[i];
 			true_coord.y = pImage->y - lens->position_y[i];
 			//
 			//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 			type_t cose = lens->anglecos[i];
 			type_t sine = lens->anglesin[i];
 			//
 			type_t x = true_coord.x*cose + true_coord.y*sine;
 			type_t y = true_coord.y*cose - true_coord.x*sine;
 			//
 			type_t ell_pot = lens->ellipticity_potential[i];
 			//
 			type_t R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot));
 			//
 			result.x = (1 - ell_pot)*lens->b0[i]*x/R;
 			result.y = (1 + ell_pot)*lens->b0[i]*y/R;
 			//
 			grad.x += result.x*cose - result.y*sine;
 			grad.y += result.y*cose + result.x*sine;
 
 			//grad.x = x/R;
 			//grad.y = y/R;
         }
         return grad;
 }
 
 struct point module_potentialDerivatives_totalGradient_5_SOA_print(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int index)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
         //
         struct point grad, result;
     	std::ofstream myfile;
         grad.x = 0;
         grad.y = 0;
 
 #ifdef _double
         std::string name = "Double_";
 #else
         std::string name = "Float_";
 #endif
 
         for(int i = shalos; i < shalos + nhalos; i++)
         {
 		//
 			struct point true_coord, true_coord_rotation;
 			//
 			true_coord.x = pImage->x - lens->position_x[i];
 			true_coord.y = pImage->y - lens->position_y[i];
 			////
 			/*
         	myfile.open (name + "pImage->x_1.txt", std::ios_base::app);
         	myfile << index << " " << pImage->x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	/**
         	myfile.open (name + "pImage->y_1.txt", std::ios_base::app);
         	myfile << index << " " << pImage->y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	*/
 			/*
         	myfile.open (name + "true_coord.x_2.txt", std::ios_base::app);
         	myfile << index << " " << true_coord.x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	/*
         	myfile.open (name + "true_coord.y_2.txt", std::ios_base::app);
         	myfile << index << " " << true_coord.y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 			*///
 			type_t cose = lens->anglecos[i];
 			type_t sine = lens->anglesin[i];
 			////
 
         	myfile.open (name + "lens->anglecos[i]_3.txt", std::ios_base::app);
         	myfile << index << " " << lens->anglecos[i] << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
         	myfile.open (name + "lens->anglesin[i]_3.txt", std::ios_base::app);
         	myfile << index << " " << lens->anglesin[i] << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 			////
 			type_t x = true_coord.x*cose + true_coord.y*sine;
 			type_t y = true_coord.y*cose - true_coord.x*sine;
 			////
 			/*
         	myfile.open (name + "x_4.txt", std::ios_base::app);
         	myfile << index << " " << x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	/*
         	myfile.open (name + "y_4.txt", std::ios_base::app);
         	myfile << index << " " << y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	*////
 			type_t ell_pot = lens->ellipticity_potential[i];
 			//
 			type_t R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot));
 			//
 			result.x = (1 - ell_pot)*lens->b0[i]*x/R;
 			result.y = (1 + ell_pot)*lens->b0[i]*y/R;
 			////
 			/*
         	myfile.open (name + "ell_pot_5.txt", std::ios_base::app);
         	myfile << index << " " << ell_pot << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
         	myfile.open (name + "R_5.txt", std::ios_base::app);
         	myfile << index << " " << R << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	myfile.open (name + "x_6.txt", std::ios_base::app);
         	myfile << index << " " << x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	/*
         	myfile.open (name + "y_6.txt", std::ios_base::app);
         	myfile << index << " " << y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	*///
 			grad.x += result.x*cose - result.y*sine;
 			grad.y += result.y*cose + result.x*sine;
 			////
 			/*
         	myfile.open (name + "grad.x_7.txt", std::ios_base::app);
         	myfile << index << " " << grad.x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	/*
         	myfile.open (name + "grad.y_7.txt", std::ios_base::app);
         	myfile << index << " " << grad.y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 			*////
         }
         return grad;
 }
-
-
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_5_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
         //
         struct point grad, result;
         grad.x = 0;
         grad.y = 0;
 #pragma novec
 	for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //
                 struct point true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 //
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //:
                 type_t R = sqrt(x*x*(1 - lens->ellipticity[i]/3.) + y*y*(1 + lens->ellipticity[i]/3.));
+
                 result.x = (1 - lens->ellipticity[i]/3.)*lens->b0[i]*x/R;
                 result.y = (1 + lens->ellipticity[i]/3.)*lens->b0[i]*y/R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
         }
         return grad;
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_8_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
 	asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	// 
 	struct point grad, clumpgrad;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//type_t       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 		//@@printf("image_x = %f image_y = %f\n",  pImage->x, pImage->y);	
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//printf("x = %f y = %f\n",  true_coord.x, true_coord.y);	
 		/*positionning at the potential center*/
 		// Change the origin of the coordinate system to the center of the clump
 		true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		//
 		type_t x   = true_coord_rot.x;
 		type_t y   = true_coord_rot.y;
 		//@@printf("x = %f y = %f\n",  x, y);	
 		type_t eps = lens->ellipticity_potential[i];
 		type_t rc  = lens->rcore[i];
 		//
 		//std::cout << "piemd_lderivatives" << std::endl;
 		//
 		type_t sqe  = sqrt(eps);
 		//
 		type_t cx1  = (1. - eps)/(1. + eps);
 		type_t cxro = (1. + eps)*(1. + eps);
 		type_t cyro = (1. - eps)*(1. - eps);
 		//
 		type_t rem2 = x*x/cxro + y*y/cyro;
 		//
 		complex zci, znum, zden, zres;
 		type_t norm;
 		//	
 		zci.re  = 0;
 		zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//@@printf("zci = %f %f\n", zci.re, zci.im);	
 		//
 		znum.re = cx1*x;
 		znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
 		//
 		zden.re = x;
 		zden.im = 2.*rc*sqe - y;
 		norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 		//@@printf("norm = %f\n", norm);
 		//
 		zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 		zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 		//@@printf("zis = %f %f\n", zis.re, zis.im);	
 		norm    = zis.re;
 		zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 		zis.im  = atan2(zis.im, norm);
 		//@@printf("y,x = %f %f\n", zis.im, norm);	
 		//  norm = zis.re;
 		zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
 		zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 		//
 		//@@printf("zres: %f %f\n", zres.re, zres.im); 
 		//
 		zis.re  = zres.re;
 		zis.im  = zres.im;
 		//
 		//zres.re = zis.re*b0;
 		//zres.im = zis.im*b0;
 		// rotation
 		clumpgrad.x = zis.re;
 		clumpgrad.y = zis.im;
 		clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]);
 		//
 		clumpgrad.x = lens->b0[i]*clumpgrad.x;
 		clumpgrad.y = lens->b0[i]*clumpgrad.y;
 		//
 		//clumpgrad.x = lens->b0[i]*zis.re;
 		//clumpgrad.y = lens->b0[i]*zis.im;
 		//nan check
 		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
 		//{
 			// add the gradients
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		//@@printf("grad: %f %f\n", grad.x, grad.y); 
 		//@@std::cout << "grad.x = " << grad.x << " grad.y = " << grad.y << std::endl;
 		//}
 	}
 	//IACA_END;
 	//
 	return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_8_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl;
 	//
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
 	//printf("%d %d\n", shalos, nhalos);
         //
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 complex      zis;
                 type_t b0   = lens->b0[i];
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 //
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
 		//
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //
                 type_t eps = lens->ellipticity_potential[i];
                 type_t rc  = lens->rcore[i];
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = (1. - eps)/(1. + eps);
                 type_t cxro = (1. + eps)*(1. + eps);
                 type_t cyro = (1. - eps)*(1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres;
                 type_t norm;
                 //
                 zci.re  = 0;
                 zci.im  = -0.5*(1. - eps*eps)/sqe;
                 //
 		KERNEL(rc, zres)
 		//
                 grad.x += b0*(zres.re*cose - zres.im*sine);
                 grad.y += b0*(zres.im*cose + zres.re*sine);
                 //
         }
         //IACA_END;
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_8_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens
 , int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl;
         //
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
         //printf("%d %d\n", shalos, nhalos);
         //
 #pragma novector
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 complex      zis;
                 type_t b0   = lens->b0[i];
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 //
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 //
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //
                 type_t eps = lens->ellipticity_potential[i];
                 type_t rc  = lens->rcore[i];
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = (1. - eps)/(1. + eps);
                 type_t cxro = (1. + eps)*(1. + eps);
                 type_t cyro = (1. - eps)*(1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres;
                 type_t norm;
                 //
                 zci.re  = 0;
                 zci.im  = -0.5*(1. - eps*eps)/sqe;
                 //
                 KERNEL(rc, zres)
                 //
                 grad.x += b0*(zres.re*cose - zres.im*sine);
                 grad.y += b0*(zres.im*cose + zres.re*sine);
                 //
         }
         //IACA_END;
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_81_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
 	//std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl;
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         // 
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 //type_t       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 /*positionning at the potential center*/
                 // Change the origin of the coordinate system to the center of the clump
                 true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 //
                 type_t x    = true_coord_rot.x;
                 type_t y    = true_coord_rot.y;
                 type_t eps  = lens->ellipticity_potential[i];
                 type_t rc   = lens->rcore[i];
                 type_t rcut = lens->rcut[i];
 		type_t b0   = lens->b0[i];
 		type_t t05  = b0*rcut/(rcut - rc);
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = (1. - eps)/(1. + eps);
                 type_t cxro = (1. + eps)*(1. + eps);
                 type_t cyro = (1. - eps)*(1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres_rc, zres_rcut;
                 type_t norm;
                 //      
                 zci.re  = 0;
                 zci.im  = -0.5*(1. - eps*eps)/sqe;
 		// step 1
 		{
 			KERNEL(rc, zres_rc)
 		}
 		// step 2
 		{
 			KERNEL(rcut, zres_rcut)
                 }
 		zis.re  = t05*(zres_rc.re - zres_rcut.re);
 		zis.im  = t05*(zres_rc.im - zres_rcut.im); 
                 // rotation
                 clumpgrad.x = zis.re;
                 clumpgrad.y = zis.im;
                 clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]);
                 //
                 grad.x += clumpgrad.x;
                 grad.y += clumpgrad.y;
                 //}
         }
         //IACA_END;
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_81_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl;
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 //type_t       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 /*positionning at the potential center*/
                 // Change the origin of the coordinate system to the center of the clump
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
 		//
                 type_t eps  = lens->ellipticity_potential[i];
                 type_t rc   = lens->rcore[i];
                 type_t rcut = lens->rcut[i];
                 type_t b0   = lens->b0[i];
                 type_t t05  = b0*rcut/(rcut - rc);
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = (1. - eps)/(1. + eps);
                 type_t cxro = (1. + eps)*(1. + eps);
                 type_t cyro = (1. - eps)*(1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres_rc, zres_rcut;
                 type_t norm;
                 //
                 zci.re  = 0;
                 zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//
                 // step 1
                 {
 			KERNEL(rc, zres_rc)
                 }
                 // step 2
                 {
 			KERNEL(rcut, zres_rcut)
                 }
                 zis.re  = t05*(zres_rc.re - zres_rcut.re);
                 zis.im  = t05*(zres_rc.im - zres_rcut.im);
                 // rotation
 		grad.x += (zis.re*cose - zis.im*sine);
                 grad.y += (zis.im*cose + zis.re*sine);
                 //
         }
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_81_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl;
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
 #pragma novector
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 //type_t       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 /*positionning at the potential center*/
                 // Change the origin of the coordinate system to the center of the clump
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //
                 type_t eps  = lens->ellipticity_potential[i];
                 type_t rc   = lens->rcore[i];
                 type_t rcut = lens->rcut[i];
                 type_t b0   = lens->b0[i];
                 type_t t05  = b0*rcut/(rcut - rc);
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = (1. - eps)/(1. + eps);
                 type_t cxro = (1. + eps)*(1. + eps);
                 type_t cyro = (1. - eps)*(1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres_rc, zres_rcut;
                 type_t norm;
                 //
                 zci.re  = 0;
                 zci.im  = -0.5*(1. - eps*eps)/sqe;
                 //
                 // step 1
                 {
                         KERNEL(rc, zres_rc)
                 }
                 // step 2
                 {
                         KERNEL(rcut, zres_rcut)
                 }
                 zis.re  = t05*(zres_rc.re - zres_rcut.re);
                 zis.im  = t05*(zres_rc.im - zres_rcut.im);
                 // rotation
                 grad.x += (zis.re*cose - zis.im*sine);
                 grad.y += (zis.im*cose + zis.re*sine);
                 //
         }
         //
         return(grad);
 }
 //
 //
 //
 typedef struct point (*halo_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); 
 halo_func_t halo_func[100] = 
 { 
 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2,  0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0,  module_potentialDerivatives_totalGradient_81_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos)
 {
         struct point grad, clumpgrad;
         //
         grad.x = clumpgrad.x = 0;
         grad.y = clumpgrad.y = 0;
 	//
 	int shalos = 0;
 	//
 	//module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
 	//return;
 	/*
 	int* p_type = &(lens->type)[0];
 	int* lens_type = (int*) malloc(nhalos*sizeof(int));
 	memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
 	*/
 	//quicksort(lens_type, nhalos);
 	//
 	while (shalos < nhalos)
 	{
 		int lens_type = lens->type[shalos];
 		int count     = 1;
 		while (lens->type[shalos + count] == lens_type) count++;
 		//std::cerr << "type = " << lens_type << " " << count << " " << shalos << " " << " func = " << halo_func[lens_type] << std::endl;
 		//	
 		clumpgrad = (*halo_func[lens_type])(pImage, lens, shalos, count);
 		//
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		shalos += count;
 	}	
 
         return(grad);
 }
 
 
 
 typedef struct point (*halo_func_t_novec) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 halo_func_t_novec halo_func_novec[100] =
 {
 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2_novec, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2_novec,  0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0,  module_potentialDerivatives_totalGradient_81_SOA_v2_novec, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_SOA_novec(const struct point *pImage, const struct Potential_SOA *lens, int
  nhalos)
 {
         struct point grad, clumpgrad;
         //
         grad.x = clumpgrad.x = 0;
         grad.y = clumpgrad.y = 0;
         //
         int shalos = 0;
         //
         //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
         //return;
         /*
         int* p_type = &(lens->type)[0];
         int* lens_type = (int*) malloc(nhalos*sizeof(int));
         memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
         */
         //quicksort(lens_type, nhalos);
         //
         while (shalos < nhalos)
         {
                 int lens_type = lens->type[shalos];
                 int count     = 1;
                 while (lens->type[shalos + count] == lens_type) count++;
                 //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
                 //
                 clumpgrad = (*halo_func_novec[lens_type])(pImage, lens, shalos, count);
                 //
                 grad.x += clumpgrad.x;
                 grad.y += clumpgrad.y;
                 shalos += count;
         }
 
         return(grad);
 }
 
 
 
diff --git a/src/gradient_GPU.cu b/src/gradient_GPU.cu
index a21248d..ddfecc5 100644
--- a/src/gradient_GPU.cu
+++ b/src/gradient_GPU.cu
@@ -1,946 +1,949 @@
 #include <fstream>
 #include "grid_gradient_GPU.cuh"
 #include "gradient_GPU.cuh"
 #include "gradient.hpp"
 
 #define BLOCK_SIZE_X 8
 #define BLOCK_SIZE_Y 16
 
 //#define ROT
 
 #define _SHARED_MEM
 
 #ifdef _SHARED_MEM
 #define SHARED __shared__
 #warning "shared memory"
 extern __shared__ type_t shared[];
 #else
 #define SHARED 
 #endif
 
 #define Nx 1
 #define Ny 0
 
 extern "C" {
 double myseconds();
 }
 void
 module_potentialDerivatives_totalGradient_SOA_CPU_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
 
 void
 module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
 
 void calculate_cossin_values(type_t *theta_cos, type_t *theta_sin, type_t *angles, int nhalos ){
 	for(int i = 0 ; i < nhalos; i++)
 	{
 		theta_cos[i] = cos(angles[i]);
 		theta_sin[i] = sin(angles[i]);
 	}
 }
 
 
 
 
 
 
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_cur(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
 {
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad, image_point;
 	//
         grad.x = 0;
         grad.y = 0;
         //
         int col = blockIdx.x*blockDim.x + threadIdx.x;
         int row = blockIdx.y*blockDim.y + threadIdx.y;
         //
         if ((row < nbgridcells) && (col < nbgridcells))
         {
                 //
                 int index = row*nbgridcells + col;
                 //
                 //grid_grad_x[index] = 0.;
                 //grid_grad_y[index] = 0.;
                 //
                 type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
                 type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
                 //
                 image_point.x = frame->xmin + col*dx;
                 image_point.y = frame->ymin + row*dy;
 		//
                 for(int i = shalos; i < shalos + nhalos; i++)
                 {
                         struct point true_coord, true_coord_rot; //, result;
                         //type_t       R, angular_deviation;
                         complex      zis;
                         //
                         // positionning at the potential center
                         // Change the origin of the coordinate system to the center of the clump
                         //
 			//@@if ((row == Ny) && (col == Nx)) printf("image_x = %f, %f image_y = %f, %f\n",  image_point.x, frame->xmin, image_point.y,frame->ymin);
 			type_t true_coord_x = image_point.x - __ldg(&lens->position_x[i]);
                         type_t true_coord_y = image_point.y - __ldg(&lens->position_y[i]);
 			//if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n",  true_coord_x, true_coord_y);	
 			//
                         type_t cosi = __ldg(&lens->anglecos[i]);
                         type_t sinu = __ldg(&lens->anglesin[i]);
 			//
                         type_t x = true_coord_x*cosi + true_coord_y*sinu;
                         type_t y = true_coord_y*cosi - true_coord_x*sinu;
 			//
 			//if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n",  x, y);	
                         //
                         type_t eps = __ldg(&lens->ellipticity_potential[i]);
                         //
                         type_t sqe  = sqrt(eps);
                         //
                         type_t rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
                         //
                         complex zci;
                         complex znum, zden, zres;
                         type_t norm;
                         //
 			zci.re  = 0;
                         zci.im  = -0.5*(1. - eps*eps)/sqe;
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zci.re, zci.im);
                         //
                         type_t rc  = __ldg(&lens->rcore[i]);
                         type_t cx1  = (1. - eps)/(1. + eps);
                         znum.re = cx1*x;
                         znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
                         //
                         zden.re = x;
                         zden.im = 2.*rc*sqe - y;
                         norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 			//@@if ((col == Nx) && (row == Ny)) printf("norm = %f\n", norm);
                         //
                         zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
                         zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 			//
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im);
                         //
                         norm    = zis.re;
                         //
                         zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
                         zis.im  = atan2(zis.im, norm);
 			//
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im);
                         //
                         zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
                         zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 			//
 			//@@if ((col == Nx) && (row == Ny)) printf("%d %d, zres: %f %f\n", row, col, zres.re, zres.im);
                         //
                         type_t b0  = __ldg(&lens->b0[i]);
                         grad.x += b0*(zres.re*cosi - zres.im*sinu);
                         grad.y += b0*(zres.im*cosi + zres.re*sinu);
 			//@@if ((col == Nx) && (row == Ny)) printf("grad: %f %f\n", grad.x, grad.y);
                 }
                 //IACA_END;
                 //
                 grid_grad_x[index] = grad.x;
                 grid_grad_y[index] = grad.y;
 		//if ((row == 0) && (col == 9)) 
 		//printf("%f %f: %f %f\n",  image_point.x, image_point.y, grid_grad_x[index], grid_grad_y[index]);
         }
 }
 
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_SM2(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
 {
 	//
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         type_t grad_x, grad_y;
 	type_t clumpgrad_x, clumpgrad_y;
 	type_t image_point_x, image_point_y;
 	//
 	SHARED type_t cosi	[200];
 	SHARED type_t sinu	[200];
 	SHARED type_t rc	[200];
 	SHARED type_t b0	[200];
 	SHARED type_t epsi	[200];
 	SHARED type_t position_x[200];
 	SHARED type_t position_y[200];
 	SHARED type_t rsqe	[200];
 	SHARED type_t sonepeps	[200];
 	SHARED type_t sonemeps	[200];
         //
         grad_x = 0;
         grad_y = 0;
         //
         int col = blockIdx.x*blockDim.x + threadIdx.x;
         int row = blockIdx.y*blockDim.y + threadIdx.y;
 	int ithread  = threadIdx.y*blockDim.x + threadIdx.x;
 	//
 	int index = row*nbgridcells + col;
 	//
 	//grid_grad_x[index] = 0.;
 	//grid_grad_y[index] = 0.;
 	//
 	type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 	type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	//
 	image_point_x = frame->xmin + col*dx;
 	image_point_y = frame->ymin + row*dy;
 	//
 	int i = ithread;
 	if (i < nhalos)
 	{
 		cosi[i]       = __ldg(&lens->anglecos		  [shalos + i]);
 		sinu[i]       = __ldg(&lens->anglesin		  [shalos + i]);
 		position_x[i] = __ldg(&lens->position_x		  [shalos + i]);
 		position_y[i] = __ldg(&lens->position_y		  [shalos + i]);
 		rc[i]         = __ldg(&lens->rcore		  [shalos + i]);
 		b0[i]         = __ldg(&lens->b0		          [shalos + i]);
 		epsi[i]       = __ldg(&lens->ellipticity_potential[shalos + i]);
 		//sonemeps[i]   = 1 - epsi[i];
 		//sonepeps[i]   = 1 + epsi[i];
 		rsqe[i]	      = sqrt(epsi[i]);
 	}
 	__syncthreads();
 	//
 	if ((row < nbgridcells) && (col < nbgridcells))
 	{
 		for(int i = 0; i < nhalos; i++)
 		{
 			//
 			type_t true_coord_x = image_point_x - position_x[i];
 			type_t true_coord_y = image_point_y - position_y[i];
 			//
 			type_t x = true_coord_x*cosi[i] + true_coord_y*sinu[i];
 			type_t y = true_coord_y*cosi[i] - true_coord_x*sinu[i];
 			//
 			type_t eps     = epsi[i];
 			//type_t onemeps = 1 - eps;
 			//type_t onepeps = 1 + eps;
 			//
 			//type_t eps     = epsi[i];
 			type_t onemeps = sonemeps[i];
 			type_t onepeps = sonepeps[i];
 			//
 			//type_t sqe  = sqrt(eps);
 			type_t sqe  = rsqe[i];
 			type_t rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps);
 			//
 			complex      zis;
 			//
 			type_t znum_re, znum_im;
 			type_t zres_re, zres_im;
 			type_t norm;
 			type_t zden_re, zden_im;
 			type_t  zis_re,  zis_im;
 			//
 			type_t zci_im  = -0.5*(1. - eps*eps)/sqe;
 			//
 			type_t cx1  = onemeps/onepeps;
 			//
 			znum_re = cx1*x;
 			znum_im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1;
 			//
 			zden_re = x;
 			zden_im = 2.*rc[i]*sqe - y;
 			//
 			norm    = (x*x + zden_im*zden_im);     // zis = znum/zden
 			zis.re  = (znum_re*x + znum_im*zden_im)/norm;
 			zis.im  = (znum_im*x - znum_re*zden_im)/norm;
 			//
 			norm    = zis.re;
 			//
 			zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 			zis.im  = atan2(zis.im, norm);
 			//
 			zres_re = - zci_im*zis.im;   // Re( zci*ln(zis) )
 			zres_im =   zci_im*zis.re;   // Im( zci*ln(zis) )
 			//
 			grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]);
 			grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]);
 		}
 		//
 		grid_grad_x[index] = grad_x;
 		grid_grad_y[index] = grad_y;
 		//__syncthreads();
 	}
 }
 //
 //
 //
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA
  *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos)
 {
         //
         //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         type_t grad_x, grad_y;
         type_t clumpgrad_x, clumpgrad_y;
         type_t image_point_x, image_point_y;
         //
         SHARED type_t cosi      [200];
         SHARED type_t sinu      [200];
         SHARED type_t rci       [200];
         SHARED type_t b0        [200];
         SHARED type_t epsi      [200];
         SHARED type_t position_x[200];
         SHARED type_t position_y[200];
         SHARED type_t rsqe      [200];
 	//SHARED type_t sgrad_x   [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y];
 	//SHARED type_t sgrad_y   [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y];
 
         //SHARED type_t sonepeps  [200];
         //SHARED type_t sonemeps  [200];
         //
         grad_x         = 0;
         grad_y 	       = 0;
         //
         int row        = blockIdx.y*blockDim.y + threadIdx.y;
         int col        = blockIdx.x*blockDim.x + threadIdx.x;
 	//
 	//int loc_row    = threadIdx.x;
 	//int loc_col    = threadIdx.y*blockDim.x + threadIdx.x;
 	//
         //int grid_size  = nbgridcells/blockDim.y; 
 	//
 	//if (threadIdx.x == 0) printf("%d %d %d: row = %d, col = %d, grid_size = %d\n", blockIdx.y, gridDim.y, threadIdx.y, row, col, grid_size);
         //
 	type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 	type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	//if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy);
 	//
 	image_point_x = frame->xmin + col*dx;
 	image_point_y = frame->ymin + row*dy;
 	//
 	//int iloc  = threadIdx.x*blockDim.y + threadIdx.y;
 	int iglob = row*nbgridcells + col;
 	int numThreads = blockDim.x*blockDim.y;
 	//
 	for (int i = 0; i < (nhalos + numThreads - 1)/numThreads; ++i)
 	{ 
 		int iloc  = threadIdx.y*blockDim.x + threadIdx.x + i*numThreads;
 		if (iloc < nhalos)
 		{
 			cosi[iloc]       = __ldg(&lens->anglecos             [shalos + iloc]);
 			sinu[iloc]       = __ldg(&lens->anglesin             [shalos + iloc]);
 			position_x[iloc] = __ldg(&lens->position_x           [shalos + iloc]);
 			position_y[iloc] = __ldg(&lens->position_y           [shalos + iloc]);
 			rci[iloc]        = __ldg(&lens->rcore                [shalos + iloc]);
 			b0[iloc]         = __ldg(&lens->b0                   [shalos + iloc]);
 			epsi[iloc]       = __ldg(&lens->ellipticity_potential[shalos + iloc]);
 			rsqe[iloc]       = sqrt(epsi[iloc]);
 		}
 	}
 	__syncthreads();
 	//
 	if ((row < nbgridcells) && (col < nbgridcells))
 	{
 		//
 		for(int i = 0; i < nhalos; i++)
 		{
 			//int index  = iloc; 
 #if 1
 			type_t rc      = rci[i];
 			type_t eps     = epsi[i];
 			type_t onemeps = 1 - eps;
 			type_t onepeps = 1 + eps;
 			//
 			type_t sqe = rsqe[i];
 			type_t cx1 = onemeps/onepeps;
 			//
 			//
 			//type_t zci_im = 1;
 			type_t zci_im  = -0.5*(1. - eps*eps)/sqe;
 			type_t inv_onepeps = 1./(onepeps*onepeps);
 			type_t inv_onemeps = 1./(onemeps*onemeps);
 #endif
 			//
 			{
 				//KERNEL_8;
 				type_t grad_x = grad_y = 0.;
 				type_t true_coord_y = image_point_y - position_y[i];
 				type_t true_coord_x = image_point_x - position_x[i];
 				KERNEL_8_reg(0);
 				grid_grad_x[iglob +  0] += grad_x;
 				grid_grad_y[iglob +  0] += grad_y;
 			}
 			/*
 			{
 				//KERNEL_8;
 				type_t grad_x = grad_y = 0.;
 				type_t true_coord_y = image_point_y - position_y[i];
 				type_t true_coord_x = image_point_x - position_x[i] + BLOCK_SIZE_X/2*dx;
 				KERNEL_8_reg(0);
 				grid_grad_x[iglob + BLOCK_SIZE_X/2] += grad_x;
 				grid_grad_y[iglob + BLOCK_SIZE_X/2] += grad_y;
 			}
 			*/
 		//
 		}
 	}
 }
 //
 //
 //
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_SM4(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA
 		lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos/*, type_t* dtimer*/)
 {
 	//
 	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	//
 	type_t grad_x, grad_y;
 	type_t clumpgrad_x, clumpgrad_y;
 	type_t image_point_x, image_point_y;
 	//
 	//
 	type_t* cosi       = &shared[0*nhalos];
 	type_t* sinu       = &shared[1*nhalos];
 	type_t* rc         = &shared[2*nhalos];
 	type_t* b0         = &shared[3*nhalos];
 	type_t* epsi       = &shared[4*nhalos];
 	type_t* position_x = &shared[5*nhalos];
 	type_t* position_y = &shared[6*nhalos];
 	type_t* rsqe       = &shared[7*nhalos];
 
 	//SHARED type_t sonepeps  [200];
 	//SHARED type_t sonemeps  [200];
 	//
 	grad_x         = 0;
 	grad_y         = 0;
 	//
 	int grid_size  =  nbgridcells/gridDim.y;
 	int row        =  blockIdx.x*blockDim.x + threadIdx.x;
 	int col        = (blockIdx.y*blockDim.y + threadIdx.y)/**grid_size*/;
 	//
 	//
 #if 1
 	//SHARED type_t sgrad_x [
 	if (/*(threadIdx.x == 0) &&*/ (blockIdx.x == 0))
 	{
 		if (threadIdx.x == 0) printf("blockDim.x = %d, blockIdx.x = %d grimdDim.x = %d threadIdx.x = %d\n", blockDim.x, blockIdx.x, gridDim.x, threadIdx.x);
 		if (threadIdx.x == 0) printf("blockDim.y = %d, blockIdx.y = %d grimdDim.y = %d threadIdx.y = %d\n", blockDim.y, blockIdx.y, gridDim.y, threadIdx.y);
 		if (threadIdx.x == 0) printf("row = %d, col = %d, grid_size = %d\n", row, col, grid_size);
 	}
 	__syncthreads();
 #endif
 	type_t* sgrad_x    = &shared[8*nhalos];
 	type_t* sgrad_y    = &shared[8*nhalos + (grid_size + 1)*blockDim.x];
 	//
 	//
 	//grid_grad_x[index] = 0.;
 	//grid_grad_y[index] = 0.;
 	//
 	type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 	type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 	//
 	//if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy);
 	//
 	image_point_x = frame->xmin + col*dx;
 	image_point_y = frame->ymin + row*dy;
 	return;
 	//
 	int i = 0;
 #if 0
 	for (; i < nhalos; i = i + blockDim.x)	
 	{
 		int pos = threadIdx.x + i;
 		/*if ((threadIdx.x == 0) && (blockIdx.x == 0))*/ printf("pos = %d\n"); 
 		__syncthreads();
 		//
 		cosi[pos]       = __ldg(&lens->anglecos             [shalos + pos]);
 		sinu[pos]       = __ldg(&lens->anglesin             [shalos + pos]);
 		position_x[pos] = __ldg(&lens->position_x           [shalos + pos]);
 		position_y[pos] = __ldg(&lens->position_y           [shalos + pos]);
 		rc[pos]         = __ldg(&lens->rcore                [shalos + pos]);
 		b0[pos]         = __ldg(&lens->b0                   [shalos + pos]);
 		epsi[pos]       = __ldg(&lens->ellipticity_potential[shalos + pos]);
 		rsqe[pos]       = sqrt(epsi[i]);
 		//
 	}
 #endif
 #if 0
 	if (threadIdx.x == 0)
 		for (; i < nhalos; i += 1)
 		{
 			cosi[i]       = __ldg(&lens->anglecos             [shalos + i]);
 			sinu[i]       = __ldg(&lens->anglesin             [shalos + i]);
 			position_x[i] = __ldg(&lens->position_x           [shalos + i]);
 			position_y[i] = __ldg(&lens->position_y           [shalos + i]);
 			rc[i]         = __ldg(&lens->rcore                [shalos + i]);
 			b0[i]         = __ldg(&lens->b0                   [shalos + i]);
 			epsi[i]       = __ldg(&lens->ellipticity_potential[shalos + i]);
 			rsqe[i]       = sqrt(epsi[i]);
 		}
 #endif
 	__syncthreads();
 	//if ((row == col == 0)) printf("shared mem done...\n");
 	//
 	if (row < nbgridcells)
 	{
 		//for(int icol = 0; icol < grid_size; ++icol){
                 //      if (col + icol < nbgridcells){
                 //grad_x = grad_y = 0.;
 		int  index  = row*nbgridcells + col;
                 //
                 for(int i = 0; i < nhalos; i++)
                 {
                         int sindex  = threadIdx.x*grid_size; 
 #if 0
                         type_t eps     = epsi[i];
                         type_t onemeps = 1 - eps;
                         type_t onepeps = 1 + eps;
                         //
                         type_t sqe = rsqe[i];
                         type_t cx1 = onemeps/onepeps;
                         //
                         //
                         //type_t x = true_coord_y*sinu[i];
                         //type_t y = true_coord_y*cosi[i];
                         type_t true_coord_y = image_point_y - position_y[i];
                         type_t true_coord_x = image_point_x - position_x[i] /*+ icol*dx*/;
                         //
                         complex zci;
                         zci.im  = -0.5*(1. - eps*eps)/sqe;
                         type_t inv_onepeps = 1./(onepeps*onepeps);
                         type_t inv_onemeps = 1./(onemeps*onemeps);
 #endif
                         //
                         for(int icol = 0; icol < grid_size; ++icol)
                         {
                                 if (col + icol < nbgridcells)
                                 {
 #if 0
                                         if ((row == 1) && (col == 1)) printf("%d %d: %f %f\n", row, col, true_coord_x, true_coord_y);
 
                                         true_coord_x = image_point_x - position_x[i] + icol*dx;
                                         //
                                         //x += true_coord_x*cosi[i];
                                         //y -= true_coord_x*sinu[i];
                                         type_t x = true_coord_x*cosi[i] + true_coord_y*sinu[i];
                                         type_t y = true_coord_y*cosi[i] - true_coord_x*sinu[i];
                                         //
                                         //if ((row == 1) && (col == 0)) printf("i = %d, eps = %f\n", i, eps);
                                         //
                                         //double eps     = epsi[i];
                                         //double onemeps = sonemeps[i];
                                         //double onepeps = sonepeps[i];
                                         //
                                         //double sqe  = sqrt(eps);
                                         //double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps);
                                         type_t rem2 = x*x*inv_onepeps + y*y*inv_onemeps;
                                         //
                                         //
                                         //double znum_re, znum_im;
                                         //double zres_re, zres_im;
                                         //double zden_re, zden_im;
                                         //double  zis_re,  zis_im;
                                         type_t norm;
                                         //
                                         complex      zis;
                                         complex      znum;
                                         complex      zden;
                                         complex      zres;
                                         //
                                         //double cx1  = onemeps/onepeps;
                                         //
                                         znum.re = cx1*x;
                                         znum.im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1;
                                         //
                                         zden.re = x;
                                         zden.im = 2.*rc[i]*sqe - y;
                                         //
                                         norm    = (x*x + zden.im*zden.im);     // zis = znum/zden
                                         zis.re  = (znum.re*x + znum.im*zden.im)/norm;
                                         zis.im  = (znum.im*x - znum.re*zden.im)/norm;
                                         //
                                         norm    = zis.re;
                                         //
                                         zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
                                         zis.im  = atan2(zis.im, norm);
                                         //
                                         zres.re = - zci.im*zis.im;   // Re( zci*ln(zis) )
                                         zres.im =   zci.im*zis.re;   // Im( zci*ln(zis) )
                                         //
                                         grid_grad_x[index] += b0[i]*(zres.re*cosi[i] - zres.im*sinu[i]);
                                         grid_grad_y[index] += b0[i]*(zres.im*cosi[i] + zres.re*sinu[i]);
 #endif
                                         //sgrad_x[sindex] += (float)  sindex;
                                         sgrad_y[sindex] += (float) -sindex;
                                         //sindex++;
                                         //
                                         //grid_grad_x[index] += grad_x;
                                         //grid_grad_y[index] += grad_y;
                                 }
                                 //
                         }
                 }
                         __syncthreads();
 	return;
                 //
 #if 0
 		int sindex = threadIdx.x*grid_size;
 		for(int icol = 0; icol < grid_size; ++icol)
 		{
 			if (col + icol < nbgridcells)
 			{
                 		grid_grad_x[index + col] = sgrad_x[sindex];
                 		grid_grad_y[index + col] = sgrad_y[sindex];
 				sindex++;
 			}
 		}
 		__syncthreads();
 #endif
         }
 }
 
 
 
 __global__
 void
 module_potentialDerivatives_totalGradient_8_SOA_GPU_v2(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int i, int nhalos)
 {
 	//asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	//
 	struct point grad, clumpgrad, image_point;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	int row = blockIdx.y * blockDim.y + threadIdx.y;
 	int col = blockIdx.x * blockDim.x + threadIdx.x;
 	//
 	if ((row < nbgridcells) && (col < nbgridcells))
 	{
 		//
 		int index = col*nbgridcells + row;
 		//
 		//grid_grad_x[index] = 0.;
 		//grid_grad_y[index] = 0.;
 		//
 		type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
 		type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 		//
 #if 0
 		/*SHARED*/ type_t img_pt[2];
 		if ((row == 0) && (col == 0))
 		{
 			img_pt[0] = frame->xmin + col*dx;
 			img_pt[1] = frame->ymin + row*dy;
 		}
 		__syncthreads();
 #else
 		image_point.x = frame->xmin + col*dx;
 		image_point.y = frame->ymin + row*dy;
 #endif
 		//
 		//
 		//for(int i = shalos; i < shalos + nhalos; i++)
 		//{
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//type_t       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 #if 0
 		true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]);
 		true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]);
 #else
 		true_coord.x = image_point.x - __ldg(&lens->position_x[i]);
 		true_coord.y = image_point.y - __ldg(&lens->position_y[i]);
 #endif
 		type_t cosi = __ldg(&lens->anglecos[i]);
 		type_t sinu = __ldg(&lens->anglesin[i]);
 		// positionning at the potential center
 		// Change the origin of the coordinate system to the center of the clump
 		type_t x = true_coord.x*cosi + true_coord.y*sinu;
 		type_t y = true_coord.y*cosi - true_coord.x*sinu;
 		//
 		type_t eps = __ldg(&lens->ellipticity_potential[i]);
 		//
 		type_t sqe  = sqrt(eps);
 		//
 		type_t rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
 		//
 		complex zci;
 		complex znum, zden, zres;
 		type_t norm;
 		//
 		zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//
 		type_t rc  = __ldg(&lens->rcore[i]);
 		type_t cx1  = (1. - eps)/(1. + eps);
 		znum.re = cx1*x;
 		znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
 		//
 		zden.re = x;
 		zden.im = 2.*rc*sqe - y;
 		norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 		//
 		zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 		zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 		//
 		type_t b0  = __ldg(&lens->b0[i]);
 		grad.x += b0*(zres.re*cosi - zres.im*sinu);
 		grad.y += b0*(zres.im*cosi + zres.re*sinu);
 		//
 		grid_grad_x[index] += grad.x;
 		grid_grad_y[index] += grad.y;
 	}
 }
 
 #if 1
 typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 
 __constant__ halo_func_GPU_t halo_func_GPU[100] =
 {
 	0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU,  0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0,  module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0,
 	   0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	   };
 #endif
 
 __device__ point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos){
     //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
     //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
     //
     struct point grad, result;
     grad.x = 0;
     grad.y = 0;
     for(int i = shalos; i < shalos + nhalos; i++)
     {
             //
 		struct point true_coord, true_coord_rotation;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//
 		//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		type_t cose = lens->anglecos[i];
 		type_t sine = lens->anglesin[i];
 		//
 		type_t x = true_coord.x*cose + true_coord.y*sine;
 		type_t y = true_coord.y*cose - true_coord.x*sine;
 		//
 		type_t ell_pot = lens->ellipticity_potential[i];
 		//
 		type_t b0_inv_R = lens->b0[i]/sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot));
 		//
 		result.x = (1 - ell_pot)*x*b0_inv_R;
 		result.y = (1 + ell_pot)*y*b0_inv_R;
 		//
 		grad.x += result.x*cose - result.y*sine;
 		grad.y += result.y*cose + result.x*sine;
     }
     return grad;
 
 }
 __device__ point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *image_point, const struct Potential_SOA *lens, int shalos, int nhalos){
 	struct point grad;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
         complex      zis;
         // positioning at the potential center
         // Change the origin of the coordinate system to the center of the clump
         //@@if ((row == Ny) && (col == Nx)) printf("image_x = %f, %f image_y = %f, %f\n",  image_point.x, frame->xmin, image_point.y,frame->ymin);
+
         type_t true_coord_x = image_point->x - __ldg(&lens->position_x[i]);
         type_t true_coord_y = image_point->y - __ldg(&lens->position_y[i]);
         //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n",  true_coord_x, true_coord_y);
         //
         type_t cosi = __ldg(&lens->anglecos[i]);
         type_t sinu = __ldg(&lens->anglesin[i]);
         //
         type_t x = true_coord_x*cosi + true_coord_y*sinu;
         type_t y = true_coord_y*cosi - true_coord_x*sinu;
         //
         //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n",  x, y);
         //
         type_t eps = __ldg(&lens->ellipticity_potential[i]);
         //
         type_t sqe  = sqrt(eps);
         //
         type_t rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps));
         //
         complex zci;
         complex znum, zden, zres;
         type_t norm;
         //
         zci.re  = 0;
         zci.im  = -0.5*(1. - eps*eps)/sqe;
         //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zci.re, zci.im);
         //
         type_t rc  = __ldg(&lens->rcore[i]);
         type_t cx1  = (1. - eps)/(1. + eps);
+
         znum.re = cx1*x;
         znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
         //
         zden.re = x;
         zden.im = 2.*rc*sqe - y;
         norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
         //@@if ((col == Nx) && (row == Ny)) printf("norm = %f\n", norm);
         //
         zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
         zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
         //
         //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im);
         //
         norm    = zis.re;
         //
         zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
         zis.im  = atan2(zis.im, norm);
         //
         //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im);
         //
         zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
         zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
         //
         //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zres: %f %f\n", row, col, zres.re, zres.im);
         //
         type_t b0  = __ldg(&lens->b0[i]);
+
         grad.x += b0*(zres.re*cosi - zres.im*sinu);
         grad.y += b0*(zres.im*cosi + zres.re*sinu);
         //@@if ((col == Nx) && (row == Ny)) printf("grad: %f %f\n", grad.x, grad.y);
 	}
 	//IACA_END;
 	//
 	return(grad);
 
 }
 __device__ point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos){
     //asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
     //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl;
     // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
     //
     struct point grad, clumpgrad;
     grad.x = 0;
     grad.y = 0;
     for(int i = shalos; i < shalos + nhalos; i++)
     {
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//type_t       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		/*positionning at the potential center*/
 		// Change the origin of the coordinate system to the center of the clump
 		type_t cose = lens->anglecos[i];
 		type_t sine = lens->anglesin[i];
 		type_t x = true_coord.x*cose + true_coord.y*sine;
 		type_t y = true_coord.y*cose - true_coord.x*sine;
 		//
 		type_t eps  = lens->ellipticity_potential[i];
 		type_t rc   = lens->rcore[i];
 		type_t rcut = lens->rcut[i];
 		type_t b0   = lens->b0[i];
 		type_t t05  = b0*rcut/(rcut - rc);
 		//
 		type_t sqe  = sqrt(eps);
 		//
 		type_t cx1  = (1. - eps)/(1. + eps);
 		type_t cxro = (1. + eps)*(1. + eps);
 		type_t cyro = (1. - eps)*(1. - eps);
 		//
 		type_t rem2 = x*x/cxro + y*y/cyro;
 		//
 		complex zci, znum, zden, zres_rc, zres_rcut;
 		type_t norm;
 		//
 		zci.re  = 0;
 		zci.im  = -0.5*(1. - eps*eps)/sqe;
 		//
 		// step 1
 		{
 		KERNEL(rc, zres_rc)
 		}
 		// step 2
 		{
 		KERNEL(rcut, zres_rcut)
 		}
 		zis.re  = t05*(zres_rc.re - zres_rcut.re);
 		zis.im  = t05*(zres_rc.im - zres_rcut.im);
 		// rotation
 		grad.x += (zis.re*cose - zis.im*sine);
 		grad.y += (zis.im*cose + zis.re*sine);
             //
     }
     //
     return(grad);
 
 }
 
 __global__
 void
 module_potentialDerivatives_totalGradient_SOA_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int nhalos)
 {
     struct point grad, clumpgrad, image_point;
     //
     int col = blockIdx.x*blockDim.x + threadIdx.x;
     int row = blockIdx.y*blockDim.y + threadIdx.y;
 
     //
     if ((row < nbgridcells) && (col < nbgridcells))
     {
         //
         type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
         type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
 		//
 		int index = row*nbgridcells + col;
 		// Create temp grad variable to minimise writing to global memory grid_grad
 		grad.x = 0;
 		grad.y = 0;
 		//
 		image_point.x = frame->xmin + col*dx;
 		image_point.y = frame->ymin + row*dy;
 		//
 		int shalos = 0;
 		while (shalos < nhalos)
 		{
 			int lens_type = lens->type[shalos];
 			int count     = 1;
 			while (lens->type[shalos + count] == lens_type) count++;
 			//
 			//if(row == 0 && col == 0) printf("type = %d, count %d , shalos %d \n", lens_type,count,shalos );
 			//
 			clumpgrad = (*halo_func_GPU[lens_type])(&image_point, lens, shalos, count);
 			//
 			grad.x += clumpgrad.x;
 			grad.y += clumpgrad.y;
 			shalos += count;
 		}
 		// Write to global memory
 		grid_grad_x[index] = grad.x;
 		grid_grad_y[index] = grad.y;
 	//if ((row == 0) && (col == 9))
 	//printf("%f %f: %f %f\n",  image_point.x, image_point.y, grid_grad_x[index], grid_grad_y[index]);
     }
 
 
 
 
 }