diff --git a/src/chi_CPU.cpp b/src/chi_CPU.cpp
index 3863483..23b9c0f 100644
--- a/src/chi_CPU.cpp
+++ b/src/chi_CPU.cpp
@@ -1,541 +1,540 @@
 #include <iostream>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <unistd.h>
 #include <assert.h>
 //
 #ifndef __xlC__
 #warning "gnu compilers"
-#include <immintrin.h>
 #endif
 //
 //#include "simd_math.h"
 #include "chi_CPU.hpp"
 #ifdef __WITH_GPU
 #include <cuda_runtime.h>
 #include "cudafunctions.cuh"
 #include "grid_gradient_GPU.cuh"
 #endif
 ////#endif
 //#include "gradient_GPU.cuh"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #ifdef __WITH_MPI
 #warning "MPI enabled"
 #include <mpi.h>
 #include "mpi_check.h"
 #endif
 #include "delense_CPU_utils.hpp"
 #include "delense_CPU.hpp"
 //
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define MAX(a,b) (((a)>(b))?(a):(b))
 //
 extern 
 double myseconds();
 #if 0
 {
 	struct timeval  tp;
 	struct timezone tzp;
 	int i;
 
 	i = gettimeofday(&tp,&tzp);
 	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
 }
 #endif
 //
 void mychi_bruteforce_SOA_CPU_grid_gradient(double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images)
 {
 	//
 	//double dx, dy; //, x_pos, y_pos;        //pixelsize
 	//	
 	//	double im_dist[MAXIMPERSOURCE]; // distance to a real image for an "theoretical" image found from a theoretical source
 	//	int im_index;       		// index of the closest real image for an image found from a theoretical source
 	//	int second_closest_id;   	// index of the second closest real image for an image found from a theoretical source
 	//	int thread_found_image = 0;   	// at each iteration in the lens plane, turn to 1 whether the thread find an image
 	// 	struct point im_position, temp; // position of the image found from a theoretical source + temp variable for comparison
 	//	struct triplet Tsup, Tinf, Tsupsource, Tinfsource;// triangles for the computation of the images created by the theoretical sources
 	unsigned int nsets      = runmode->nsets;
 	unsigned int nimagestot = runmode->nimagestot;
 	//
         int MAXIMPERSOURCE;
         int numImages = 0;
         //
         int maxImagesPerSource = 0;
         for( int source_id = 0; source_id < nsets; source_id ++)
         {
                 numImages += nimages_strongLensing[source_id];
                 maxImagesPerSource = MAX(nimages_strongLensing[source_id], maxImagesPerSource);
         }
         MAXIMPERSOURCE = maxImagesPerSource;
 	//	
 	//
 	struct galaxy sources  [nsets][nimagestot]; // theoretical sources (common for a set)
 	int    nimagesfound    [nsets][nimagestot][MAXIMPERSOURCE]; // number of images found from the theoretical sources
 	int    locimagesfound  [nsets][nimagestot];
 	struct point image_pos [nsets][nimagestot][MAXIMPERSOURCE];
 	//double image_dist    [nsets][nimagestot][MAXIMPERSOURCE];
 	struct point tim       [nimagestot][MAXIMPERSOURCE]; // theoretical images (computed from sources)
 	//
 	int world_size = 1;
 	int world_rank = 0;
 #ifdef __WITH_MPI
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 	MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	unsigned int verbose = (world_rank == 0);
 	//
 	int grid_size     = runmode->nbgridcells;
 	int loc_grid_size = runmode->nbgridcells/world_size;
 	//
 	double y_len      = fabs(frame->ymax - frame->ymin);
 	int    y_len_loc  = runmode->nbgridcells/world_size;
 	int    y_pos_loc  = (int) world_rank*y_len_loc;
 	int    y_bound    = y_len_loc;
 	if ((world_rank + 1) != world_size) y_bound++;
 	//
 	//
 	const double dx   = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
 	const double dy   = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);	
 	//
 	double *grid_gradient_x, *grid_gradient_y;
 	//
 	//grid_gradient_x   = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double));
 	//grid_gradient_y   = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double));
 #ifdef __WITH_GPU
 	cudasafe(cudaMallocHost(&grid_gradient_x, grid_size*y_bound*sizeof(double)), "mychi_bruteforce_SOA_CPU_grid_gradient: allocating grid_gradient_x");
 	cudasafe(cudaMallocHost(&grid_gradient_y, grid_size*y_bound*sizeof(double)), "mychi_bruteforce_SOA_CPU_grid_gradient: allocating grid_gradient_x");
 #else
 	grid_gradient_x   = (double *)malloc((int) grid_size*y_bound*sizeof(double));
 	grid_gradient_y   = (double *)malloc((int) grid_size*y_bound*sizeof(double));
 #endif
 	//
 	//uais
 	//if (verbose) printf("@@%d: nsets = %d nimagestot = %d maximgpersource = %d, grid_size = %d, loc_grid_size = %d, y_pos_loc = %d\n", world_rank, nsets, nimagestot, MAXIMPERSOURCE, grid_size, loc_grid_size, y_pos_loc);
 	//
 	const int grid_dim = runmode->nbgridcells;
 	//Packaging the image to sourceplane conversion
 	double time      = -myseconds();
 	double grad_time = -myseconds();
 	//gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_dim);
 	int zero = 0;
 #ifdef __WITH_GPU
 #warning "Using GPUs..."
 #ifdef __WITH_UM
 #warning "Chi computation using UNIFIED MEMORY"
 	//gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc);
 	gradient_grid_GPU_UM(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc);
 #else
 	gradient_grid_GPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc);
 #endif
 	cudasafe(cudaGetLastError(), "after gradient_grid_GPU");
 	//gradient_grid_GPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_size);
 #else
 	gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc);
 #endif
 	//
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	grad_time += myseconds();
 	//
 	int index           = 0;       // index tracks the image within the total image array in the image plane
 	*chi  		    = 0;
 	//
 	double chi2_time    = 0.;
 	double delense_time = 0.;
 	double image_time   = 0.;
 	//
 	int images_found    = 0;
 	//
 	long int images_total   = 0;
 	//
 	printf("@@nsets = %d nx = %d ny = %d, xmin = %f, dx = %f, ymin = %f, dy = %f\n", nsets, runmode->nbgridcells, runmode->nbgridcells, frame->xmin, dx, frame->ymin, dy );
 	//
 	int numsets = 0;
 	//
 	for( int  source_id = 0; source_id < nsets; source_id ++)
 		numsets += nimages_strongLensing[source_id];
 	printf("@@Total numsets = %d\n", numsets);
 	//
 	//
 	// nsets     : number of images in the source plane
 	// nimagestot: number of images in the image plane
 	//
 	image_time -= myseconds();
 	//
 	for( int  source_id = 0; source_id < nsets; source_id ++)
 	{
 		// number of images in the image plane for the specific image (1,3,5...)
 		unsigned short int nimages = nimages_strongLensing[source_id];
 		//@@printf("@@ source_id = %d, nimages = %d\n",  source_id, nimages_strongLensing[source_id]);
 		//____________________________ image (constrains) loop ________________________________
 		for(unsigned short int image_id = 0; image_id < nimages; image_id++)
 		{
 			//printf("@@  nimages = %d\n",  nimages_strongLensing[source_id]);
 			//________ computation of theoretical sources _________
 			// output: sources[source_id].center
 			//printf("Image = %f %f\n", images[index + image_id].center.x, images[index + image_id].center.y);
 			mychi_transformImageToSourcePlane_SOA(runmode->nhalos, 
 					&images[index + image_id].center,
 					images[index + image_id].dr, 
 					lens, 
 					&sources[source_id][image_id].center);
 			//
 			// void mychi_transformImageToSourcePlane_SOA(const int Nlens, const struct point *image_point, double dlsds, const struct Potential_SOA *lens, struct point *source_point)
 			//
 			struct point Grad = module_potentialDerivatives_totalGradient_SOA(&images[index + image_id].center, lens, runmode->nhalos);
 			//printf("	image %d, %d (%d) = (%.15f, %.15f) -> (%.15f, %.15f)\n", source_id, image_id, nimages, images[index + image_id].center.x, images[index + image_id].center.y, sources[source_id][image_id].center.x, sources[source_id][image_id].center.y );
 			//printf("Grad.x = %f, %f\n", Grad.x, grid_gradient_x[images[index + image_id].center.x/dx]); 
 			//
 			// find the position of the constrain in the source plane
 			sources[source_id][image_id].center.x = images[index + image_id].center.x - images[index + image_id].dr*Grad.x;
 			sources[source_id][image_id].center.y = images[index + image_id].center.y - images[index + image_id].dr*Grad.y;
 			//
 			//time += myseconds(); 
 			//
 			sources[source_id][image_id].redshift = images[index + image_id].redshift;
 			//
 			sources[source_id][image_id].dr       = images[index + image_id].dr;
 			sources[source_id][image_id].dls      = images[index + image_id].dls;
 			sources[source_id][image_id].dos      = images[index + image_id].dos;
 //#if 1
 		}
 		index += nimages; 
 	}
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	image_time += myseconds();
 	//
 	// main loop
 	//
 	//double y_len     = fabs(frame->ymax - frame->ymin);
 	//int    y_len_loc = runmode->nbgridcells/world_size; 
 	//int    y_pos_loc = (int) world_rank*y_len_loc;
 	//printf("%d out of %d: y_id = %d to %d\n", world_rank, world_size, y_pos_loc, y_pos_loc + y_len_loc - 1);
 	//fflush(stdout);
 	//MPI_Barrier(MPI_COMM_WORLD);
 	//
 	delense_time    -= myseconds();
 	index 	      = 0;
 	int numimg    = 0;
 	//
 	delense(/*nsets, nimagestot,*/ &image_pos[0][0][0], MAXIMPERSOURCE, &locimagesfound[0][0], &numimg, runmode, lens, frame, nimages_strongLensing, &sources[0][0], grid_gradient_x, grid_gradient_y);
 	printf("%d: numimg = %d\n", world_rank, numimg);
 //
 // local delensing done, moving on to the reduction
 //
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	delense_time += myseconds();
 	//
 	double comm_time = -myseconds();
 	//
 	int          numimagesfound    [nsets][nimagestot];
 	memset(&numimagesfound, 0, nsets*nimagestot*sizeof(int));
 	struct point imagesposition    [nsets][nimagestot][MAXIMPERSOURCE];
 	memset(&imagesposition, 0, nsets*nimagestot*MAXIMPERSOURCE*sizeof(point));
 	//
 	int          numimagesfound_tmp[nsets][nimagestot];
 	struct point imagesposition_tmp[nsets][nimagestot][MAXIMPERSOURCE];
 	//
 	memset(numimagesfound_tmp, 0, nsets*nimagestot*sizeof(int));
 	memset(imagesposition_tmp, 0, nsets*nimagestot*sizeof(int));
 	//
 #ifdef __WITH_MPI
 	int total = 0;
 	//MPI_Reduce(&locimagesfound, &imagesfound, nsets*nimagestot, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 	//MPI_Reduce(&locimagesfound, &imagesfound, nsets*nimagestot, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 	//
 	if (!verbose)
 	{
 		MPI_CHECK(MPI_Send( &numimg        , 1                                                , MPI_INT   , 0, 666 + world_rank, MPI_COMM_WORLD ));
 		if (numimg != 0)
 		{
 			MPI_CHECK(MPI_Send( &locimagesfound, nsets*nimagestot               , MPI_INT   , 0, 666 + world_rank, MPI_COMM_WORLD ));
 			//MPI_CHECK(MPI_Send( &image_pos,      nsets*nimagestot*MAXIMPERSOURCE, MPI_points, 0, 667 + world_rank, MPI_COMM_WORLD ));
 			MPI_CHECK(MPI_Send( &image_pos,      nsets*nimagestot*MAXIMPERSOURCE*2, MPI_DOUBLE, 0, 667 + world_rank, MPI_COMM_WORLD ));
 		}
 	}
 	//
 	if (verbose)
 	{
 		int image_sum = 0;
 		//
 		for (int ipe = 0; ipe < world_size; ++ipe)
 		{
 			MPI_Status status;
 			//
 			if (ipe == 0)
 			{
 				memcpy(&numimagesfound_tmp, &locimagesfound, nsets*nimagestot*sizeof(int));
 				memcpy(&imagesposition_tmp, &image_pos,      nsets*nimagestot*MAXIMPERSOURCE*sizeof(point)); 
 			}
 			else
 			{
 				MPI_CHECK(MPI_Recv(&numimg            , 1                                                  , MPI_INT   , ipe, 666 + ipe, MPI_COMM_WORLD, &status));	
 				if (numimg != 0)
 				{
 					MPI_CHECK(MPI_Recv(&numimagesfound_tmp, nsets*nimagestot                 , MPI_INT   , ipe, 666 + ipe, MPI_COMM_WORLD, &status));
 					MPI_CHECK(MPI_Recv(&imagesposition_tmp, nsets*nimagestot*MAXIMPERSOURCE*2, MPI_DOUBLE, ipe, 667 + ipe, MPI_COMM_WORLD, &status));
 				}
 			}
 			//
 			//MPI_Reduce(&imagesfound_tmp, &total, ipe, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
 			//
 			if (numimg != 0)
 				for (int jj = 0; jj < nimagestot; ++jj)
 				{
 					for (int ii = 0; ii < nsets; ++ii)
 					{
 						//int img_len = numimagesfound[ii][jj];
 						int img_len = numimagesfound_tmp[ii][jj];
 						//printf("%d: %d %d, img_len = %d\n", ipe, ii, jj, img_len);
 						image_sum  += img_len;
 						if (img_len != 0)
 						{
 							//int loc_length = numimagesfound[ii][jj];
 							int loc_length = numimagesfound[ii][jj];
 							memcpy(&imagesposition[ii][jj][loc_length], &imagesposition_tmp[ii][jj], img_len*sizeof(point));
 							numimagesfound[ii][jj] += img_len;
 							numimg += img_len;
 
 						}
 					}
 				}
 		}
 	}
 	//MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	comm_time += myseconds();
 	//
 	// image extraction to compute 
 	//
 	chi2_time = -myseconds();
 	index     =  0;
 	//
 	if (verbose)
 		for( int  source_id = 0; source_id < nsets; source_id ++)
 		{
 			unsigned short int nimages = nimages_strongLensing[source_id];
 			//
 			//______________________Initialisation______________________
 			//
 			for (unsigned short int i = 0; i < nimages; ++i)
 				for (unsigned short int j = 0; j < nimages; ++j)
 					nimagesfound[source_id][i][j] = 0;
 			//
 			for( unsigned short int image_id = 0; image_id < nimages; image_id++)
 			{
 //#endif
 				//
 				double image_dist[MAXIMPERSOURCE];
 				//
 				//printf("	Image %d, number of sources found %d\n", image_id, loc_images_found);
 				//
 				struct point image_position;
 				int	     image_index;
 				//
 				for (int ii = 0; ii < /*loc*/numimagesfound[source_id][image_id]; ++ii)
 				{	
 					//
 					//
 					int image_index = 0;
 					//
 					image_position = imagesposition[source_id][image_id][ii]; 
 					image_dist[0] = mychi_dist(image_position, images[index + 0].center);  // get the distance to the real image
 					for(int i = 1; i < nimages_strongLensing[source_id]; i++)
 					{  // get the distance to each real image and keep the index of the closest real image
 
 						image_dist[i] = mychi_dist(image_position, images[index + i].center);
 						//printf("	*** image %d = %f %f, distance = %f\n", index + i, images[index + i].center.x, images[index + i].center.y, image_dist[i]);
 						if (image_dist[i] < image_dist[image_index])
 						{
 							image_index = i;
 						}
 					}
 					//
 					// we should exit loops here
 					//
 					// p1_time += myseconds();
 					//
 					int skip_image = 0;
 					// Sometimes due to the numerical errors at the centerpoint, 
 					// for SIE potentials an additional image will appear at the center of the Potential.
 					// This is due to the fact that it is not possible to simulate an infinity value 
 					// at the center correctly, Check that sis correspond to Nlens[0]
 					/*
 					   for (int iterator = 0; iterator < runmode->Nlens[0]; ++iterator)
 					   {
 					   if ( fabs(image_position.x - lens[0].position_x[iterator]) <= dx/2. and fabs(image_position.y  - lens[0].position_y[iterator]) <= dx/2.)
 					   {
 					   skip_image = 1;
 					   printf("WARNING: You are using SIE potentials. An image to close to one of the potential centers has been classified as numerical error and removed \n");
 					   }
 					   }
 					   */
 					//printf("%d %d %d %d\n", x_id, y_id,thread_found_image, skip_image);
 					if (!skip_image)
 					{
 						//#pragma omp atomic
 						images_found++;
 						struct point temp;
 						//printf("                        source %d, image %d, index %d, Images found: %d\n", source_id, image_id, image_index, nimagesfound[source_id][image_id][image_index]);
 						//checking whether a closest image has already been found
 						if (nimagesfound[source_id][image_id][image_index] == 0)
 						{ // if no image found up to now
 
 							//image position is allocated to theoretical image
 							//#pragma omp critical
 							tim[image_id][image_index] = image_position;  
 							//#pragma omp atomic
 							nimagesfound[source_id][image_id][image_index]++;
 						}
 						else if (nimagesfound[source_id][image_id][image_index] > 0)
 						{ // if we have already found an image
 							// If the new image we found is closer than the previous image
 							//printf("	tim2: %f %f\n", image_dist[image_index], mychi_dist(images[index + image_index].center, tim[image_id][image_index]));
 							if (image_dist[image_index] < mychi_dist(images[index + image_index].center, tim[image_id][image_index]))
 							{
 								temp = tim[image_id][image_index]; // we store the position of the old image in temp
 								//#pragma omp critical
 								tim[image_id][image_index] = image_position; // we link the observed image with the image we just found
 								//printf("tim2 %d %d = %f %f\n", image_id, image_index, image_position.x, image_position.y);
 							}
 							else
 							{
 								temp = image_position; // we store the position of the image we just found in temp
 							}
 							// initialising second_closest_id to the highest value
 							// Loop over all images in the set except the closest one
 							// and initialize to the furthest away image
 							int second_closest_id = 0;
 							for (int i = 1; i < nimages_strongLensing[source_id] && (i != image_index); i++)
 							{
 								if(image_dist[i] > image_dist[second_closest_id]) second_closest_id=i;
 							}
 							///////////////////////////////////////////////////////////////
 							// Loop over all images in the set that are not yet allocated to a theoretical image
 							// and allocate the closest one
 							// we search for an observed image not already linked (nimagesfound=0)
 							for(int i = 0; i < nimages_strongLensing[source_id] && nimagesfound[source_id][image_id][i] == 0; i++) 
 							{
 								if(image_dist[i] < image_dist[second_closest_id])
 								{
 									second_closest_id = i;
 									// im_index value changes only if we found a not linked yet image
 									image_index = i; 
 									//printf("tim3 %d %d = %f %f\n", image_id, image_index, temp.x, temp.y);
 									//#pragma omp critical
 									tim[image_id][image_index] = temp; // if we found an observed and not already linked image, we allocate the theoretical image temp
 								}
 							}
 							// increasing the total number of images found (If we find more than 1 theoretical image linked to 1 real image, 
 							// these theoretical
 							//#pragma omp atomic
 							nimagesfound[source_id][image_id][image_index]++; 
 							// images are included in this number)
 						}
 					}
 					//#pragma omp atomic
 					//loc_images_found++;
 					//thread_found_image  = 0; // for next iteration
 				}
 				//
 			}
 			//#pragma omp barrier
 			//____________________________ end of image loop
 			//
 			//____________________________ computing the local chi square
 			//
 			double chiimage;
 			//
 			int _nimages = nimages_strongLensing[source_id];
 			//
 			for( int iter = 0; iter < _nimages*_nimages; iter++)
 			{
 				int i=iter/nimages_strongLensing[source_id];
 				int j=iter%nimages_strongLensing[source_id];
 
 				//printf("nimagesfound %d %d = %d\n", i, j, nimagesfound[i][j]);
 				if(i != j)
 				{
 					// In the current method, we get the source in the source plane by ray tracing image in nimagesfound[i][i]. If we ray trace back,
 					// we arrive again at the same position and thus the chi2 from it is 0. Thus we do not calculate the chi2 (-> if i!=j)
 					if(nimagesfound[source_id][i][j] > 0)
 					{
 						double pow1 = images[index + j].center.x - tim[i][j].x;
 						double pow2 = images[index + j].center.y - tim[i][j].y;
 						//
 						//chiimage = pow(images[index + j].center.x - tim[i][j].x, 2) + pow(images[index + j].center.y - tim[i][j].y, 2);  // compute the chi2
 						chiimage = pow1*pow1 + pow2*pow2;  // compute the chi2
 						//printf("%d %d = %.15f\n", i, j, chiimage);
 						*chi    += chiimage;
 					}
 					else 
 						if(nimagesfound[source_id][i][j] == 0)
 						{
 							// If we do not find a correpsonding image, we add a big value to the chi2 to disfavor the model
 							*chi += 100.*nimages_strongLensing[source_id];
 						}
 				}
 			}
 			//
 			//____________________________ end of computing the local chi square
 			//
 			//printf("%d: chi = %.15f\n", source_id, *chi);
 			/*
 			   for (int i=0; i < nimages_strongLensing[source_id]; ++i){
 			   for (int j=0; j < nimages_strongLensing[source_id]; ++j){
 			   printf(" %d",nimagesfound[i][j]);
 			   }
 			   printf("\n");
 			   }*/
 
 			//Incrementing Index: Images already treated by previous source_id
 			index += nimages_strongLensing[source_id];
 		}
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	//
 	chi2_time += myseconds();
 	time      += myseconds();
 	//
 	if (verbose)
 	{
 		//
 		//		int nthreads = 1;
 		//
 		//#pragma omp parallel
 		//		nthreads = omp_get_num_threads();
 		//
 		printf("	overall time  = %f s.\n", time);
 		printf("		- grad    time = %f s.\n", grad_time);
 		printf("		- image   time = %f s.\n", image_time);
 		printf("		- delense time = %f s.\n", delense_time);
 		printf("		- comm    time = %f s.\n", comm_time);
 		printf("		- chi2    time = %f s.\n", chi2_time);
 		//
 		printf("	images found: %d out of %ld\n", images_found, images_total);
 	}
 	//
 #ifdef __WITH_GPU
 #ifndef __WITH_UM
 	cudasafe(cudaFree(grid_gradient_x), "deallocating grid_gradient_x");
 	cudasafe(cudaFree(grid_gradient_y), "deallocating grid_gradient_y");
 #endif
 #else
 	free(grid_gradient_x);
 	free(grid_gradient_y);
 #endif
 }
 
 
diff --git a/src/chi_CPU_barycenter.cpp b/src/chi_CPU_barycenter.cpp
index bbf74d7..60d1047 100644
--- a/src/chi_CPU_barycenter.cpp
+++ b/src/chi_CPU_barycenter.cpp
@@ -1,283 +1,282 @@
 #include <iostream>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <unistd.h>
 #include <assert.h>
 
 
 #ifndef __xlC__
 #warning "gnu compilers"
-#include <immintrin.h>
 #endif
 
 //#include "simd_math.h"
 #include "chi_CPU.hpp"
 //#ifdef __WITH_GPU
 #include "grid_gradient_GPU.cuh"
 ////#endif
 //#include "gradient_GPU.cuh"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #ifdef __WITH_MPI
 #warning "MPI enabled"
 #include <mpi.h>
 #include "mpi_check.h"
 #endif
 
 #include "delense_CPU_utils.hpp"
 #include "delense_CPU.hpp"
 #include "delense_GPU.cuh"
 #include "chi_comm.hpp"
 #include "chi_computation.hpp"
 #ifdef __WITH_GPU
 #include <cuda_runtime.h>
 #include "cudafunctions.cuh"
 #endif
 
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define MAX(a,b) (((a)>(b))?(a):(b))
 
 
 //
 //
 //
 double myseconds()
 {
 	struct timeval  tp;
 	struct timezone tzp;
 	int i;
 
 	i = gettimeofday(&tp,&tzp);
 	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
 }
 
 void mychi_bruteforce_SOA_CPU_grid_gradient_barycentersource(double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images)
 {
 	//
 	//double dx, dy; //, x_pos, y_pos;        //pixelsize
 	//
 	//	double im_dist[MAXIMPERSOURCE]; // distance to a real image for an "theoretical" image found from a theoretical source
 	//	int im_index;       		// index of the closest real image for an image found from a theoretical source
 	//	int second_closest_id;   	// index of the second closest real image for an image found from a theoretical source
 	//	int thread_found_image = 0;   	// at each iteration in the lens plane, turn to 1 whether the thread find an image
 	// 	struct point im_position, temp; // position of the image found from a theoretical source + temp variable for comparison
 	//	struct triplet Tsup, Tinf, Tsupsource, Tinfsource;// triangles for the computation of the images created by the theoretical sources
 	//	
 	unsigned int nsets = runmode->nsets;
         unsigned int nimagestot = runmode->nimagestot;
         //
         int MAXIMPERSOURCE;
         int numImages = 0;
         //
         int maxImagesPerSource = 0;
         for( int source_id = 0; source_id < nsets; source_id ++)
         {
                 numImages += nimages_strongLensing[source_id];
                 maxImagesPerSource = MAX(nimages_strongLensing[source_id], maxImagesPerSource);
         }
         MAXIMPERSOURCE = maxImagesPerSource;
 
 	//
 	struct galaxy sources[nsets]; // theoretical sources (common for a set)
 	int    nimagesfound  [nsets][MAXIMPERSOURCE]; // number of images found from the theoretical sources
 	int    locimagesfound[nsets];
 	//int*    locimagesfound = (int*) malloc(nsets*sizeof(int));
 	struct point image_pos [nsets][MAXIMPERSOURCE];
 	//struct point* image_pos = (struct point*) malloc(nsets*MAXIMPERSOURCE*sizeof(struct point));	
 	//double image_dist    [nsets][runmode->nimagestot][MAXIMPERSOURCE];
 	struct point tim     [MAXIMPERSOURCE]; // theoretical images (computed from sources)
 	//
 	int world_size = 1;
 	int world_rank = 0;
 #ifdef __WITH_MPI
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 	MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	unsigned int verbose = (world_rank == 0);
 	//
 	int grid_size     = runmode->nbgridcells;
 	int loc_grid_size = runmode->nbgridcells/world_size;
 	//
 	double y_len      = fabs(frame->ymax - frame->ymin);
 	int    y_len_loc  = runmode->nbgridcells/world_size;
 	int    y_pos_loc  = (int) world_rank*y_len_loc;
 	int    y_bound    = y_len_loc;
 	//
 	if ((world_rank + 1) != world_size) y_bound++;
 	printf("@@nsets = %d nx = %d ny = %d, xmin = %f, ymin = %f\n", nsets, runmode->nbgridcells, y_bound, frame->xmin, frame->ymin );
 	//
 	//
 	const double dx   = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
 	const double dy   = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);
 	//
 	double *grid_gradient_x, *grid_gradient_y;
 	//
 	//grid_gradient_x   = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double));
 	//grid_gradient_y   = (double *)malloc((int) grid_size*loc_grid_size*sizeof(double));
 	//grid_gradient_x   = (double *)malloc((int) grid_size*y_bound*sizeof(double));
 	//grid_gradient_y   = (double *)malloc((int) grid_size*y_bound*sizeof(double));
         grid_gradient_x   = (double *)malloc((int) grid_size*y_bound*sizeof(double));
         grid_gradient_y   = (double *)malloc((int) grid_size*y_bound*sizeof(double));
 	//
 	const int grid_dim = runmode->nbgridcells;
 	//Packaging the image to sourceplane conversion
 	double time = -myseconds();
 	double grad_time = -myseconds();
 	//gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_dim);
 	int zero = 0;
 	gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, zero, y_pos_loc);
 	//gradient_grid_CPU(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, grid_size, loc_grid_size);
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	grad_time += myseconds();
 	//
 	int index             = 0;       // index tracks the image within the total image array in the image plane
 	*chi  		      = 0;
 	//
 	double chi2_time      = 0.;
 	double delense_time   = 0.;
 	double image_time     = 0.;
 	//
 	int images_found      = 0;
 	long int images_total = 0;
 	//
 	//printf("@@nsets = %d nx = %d ny = %d, xmin = %f, dx = %f, ymin = %f, dy = %f\n", nsets, runmode->nbgridcells, runmode->nbgridcells, frame->xmin, dx, frame->ymin, dy );
 	//
 	int numsets = 0;
 	//
 	for( int  source_id = 0; source_id < nsets; source_id ++)
 		numsets += nimages_strongLensing[source_id];
 	//
 	time = -myseconds();
 	//
 	// nsets     : number of images in the source plane
 	// nimagestot: number of images in the image plane
 	//
 	//
 	// Image Lensing
 	//
 	image_time -= myseconds();
 	//
 	//unsigned int nsets = nsets;
 	//
 	for( int  source_id = 0; source_id < nsets; source_id ++)
 	{
 		// number of images in the image plane for the specific image (1,3,5...)
 		unsigned short int nimages = nimages_strongLensing[source_id];
 		//@@printf("@@ source_id = %d, nimages = %d\n",  source_id, nimages_strongLensing[source_id]);
 		//Init sources
 		sources[source_id].center.x = sources[source_id].center.y = 0.;
 		//____________________________ image (constrains) loop ________________________________
 		for(unsigned short int image_id = 0; image_id < nimages; image_id++)
 		{
 			//
 			struct galaxy sources_temp;
 			struct point Grad = module_potentialDerivatives_totalGradient_SOA(&images[index + image_id].center, lens, runmode->nhalos);
 			//
 			// find the position of the constrain in the source plane
 			sources_temp.center.x = images[index + image_id].center.x - images[index + image_id].dr*Grad.x;
 			sources_temp.center.y = images[index + image_id].center.y - images[index + image_id].dr*Grad.y;
 
 			//________ Adding up for barycenter comp _________
 			sources[source_id].center.x += sources_temp.center.x;
 			sources[source_id].center.y += sources_temp.center.y;
 			//time += myseconds();
 			sources[source_id].redshift = 0.;
 			sources[source_id].shape.a = sources[source_id].shape.b = sources[source_id].shape.theta = (type_t) 0.;
 			sources[source_id].mag = 0.;
 			//
 		}
 		//________ Dividing by nimages for final barycenter result _________
 		sources[source_id].center.x /= nimages;
 		sources[source_id].center.y /= nimages;
 
 		sources[source_id].redshift = images[index].redshift;
 		sources[source_id].dr       = images[index].dr;
 		sources[source_id].dls      = images[index].dls;
 		sources[source_id].dos      = images[index].dos;
 //#if 1
 		index += nimages;
 	}
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	image_time += myseconds();
 	//
 	// Delensing
 	//
 	delense_time -= myseconds();
 	index 	      = 0;
 	int numimg    = 0;
 #ifndef __WITH_GPU
 	delense_barycenter(&image_pos[0][0], MAXIMPERSOURCE, &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y);	
 #else
 	//delense_barycenter_GPU(&image_pos[0][0], &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y);
 	//struct point* ip[MAXIMPERSOURCE] = image_pos;	
 	delense_barycenter_GPU(&image_pos[0][0], MAXIMPERSOURCE, &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y);
 	printf("done. numimg = %d\n", numimg);fflush(stdout);
 #endif
 	//
 	// Communications
 	//
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	delense_time += myseconds();
 	//
 	double comm_time = -myseconds();
 	//
 	int          numimagesfound    [nsets];
 	struct point imagesposition    [nsets][MAXIMPERSOURCE];
 	//
 	memset(&numimagesfound, 0, nsets*sizeof(int));
 	memset(&imagesposition, 0, nsets*MAXIMPERSOURCE*sizeof(point));
 	//
 	delense_comm(&numimagesfound[0], MAXIMPERSOURCE, &imagesposition[0][0],  &numimg, nimages_strongLensing, &locimagesfound[0], &image_pos[0][0], nsets, world_rank, world_size);
 	//
 	comm_time += myseconds();
 	//
 	// image extraction to compute
 	//
 	chi2_time = -myseconds();
 	if (verbose)
 		chi_computation(chi, &images_found, &numimagesfound[0], &imagesposition[0][0], MAXIMPERSOURCE, nimages_strongLensing, images, nsets);
 
 #ifdef __WITH_GPU
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	//
 	chi2_time += myseconds();
 	time      += myseconds();
 	//
 	if (verbose)
 	{
 		//
 		//		int nthreads = 1;
 		//
 		//#pragma omp parallel
 		//		nthreads = omp_get_num_threads();
 		//
 		printf("	overall time  = %f s.\n", time);
 		printf("		- grad    time = %f s.\n", grad_time);
 		printf("		- image   time = %f s.\n", image_time);
 		printf("		- delense time = %f s.\n", delense_time);
 		printf("		- comm    time = %f s.\n", comm_time);
 		printf("		- chi2    time = %f s.\n", chi2_time);
 		//
 		printf("	images found: %d out of %ld\n", images_found, images_total);
 	}
 	//
 #ifdef __WITH_GPU
         cudaFree(grid_gradient_x);
         cudaFree(grid_gradient_y);
 #else
         free(grid_gradient_x);
         free(grid_gradient_y);
 #endif
 }
diff --git a/src/chi_barycenter_GPU.cu b/src/chi_barycenter_GPU.cu
index 4028cce..5b3e4a5 100644
--- a/src/chi_barycenter_GPU.cu
+++ b/src/chi_barycenter_GPU.cu
@@ -1,252 +1,251 @@
 #include <iostream>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <unistd.h>
 #include <assert.h>
 
 
 #ifndef __xlC__
 #warning "gnu compilers"
-#include <immintrin.h>
 #endif
 
 //#include "simd_math.h"
 #include "chi_CPU.hpp"
 //#ifdef __WITH_GPU
 #include "grid_gradient_GPU.cuh"
 ////#endif
 //#include "gradient_GPU.cuh"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 #ifdef __WITH_MPI
 #warning "MPI enabled"
 #include <mpi.h>
 #include "mpi_check.h"
 #include "chi_comm.hpp"
 #endif
 
 #include "delense_CPU_utils.hpp"
 #include "delense_CPU.hpp"
 #include "delense_GPU.cuh"
 #include "chi_comm.hpp"
 #include "chi_computation.hpp"
 
 #ifdef __WITH_GPU
 #include <cuda_runtime.h>
 #include "nvml_GPU.cuh"
 #include "cudafunctions.cuh"
 #endif
 
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define MAX(a,b) (((a)>(b))?(a):(b))
 
 int MAXIMPERSOURCE;
 //
 //
 //
 double myseconds()
 {
 	struct timeval  tp;
 	struct timezone tzp;
 	int i;
 
 	i = gettimeofday(&tp,&tzp);
 	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
 }
 
 void mychi_bruteforce_SOA_GPU_grid_gradient_barycentersource(type_t* grid_gradient_x, type_t* grid_gradient_y, double *chi, int *error, runmode_param *runmode, const  struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images, double dx, double dy, const int y_pos_loc, const int y_bound)
 {
 	//
 	PUSH_RANGE("chi barycenter computation", 7);
 	unsigned int nsets = runmode->nsets;
 	//
         int numImages = 0;
         //
         int maxImagesPerSource = 0;
         for( int source_id = 0; source_id < nsets; source_id ++)
         {
                 numImages += nimages_strongLensing[source_id];
                 maxImagesPerSource = MAX(nimages_strongLensing[source_id], maxImagesPerSource);
         }
 	MAXIMPERSOURCE = maxImagesPerSource + 2;
 	struct galaxy* sources;
 	cudaMallocManaged(&sources, nsets*sizeof(struct galaxy)); // theoretical sources (common for a set)
 	cudaMemPrefetchAsync(sources, nsets*sizeof(struct galaxy), cudaCpuDeviceId);	
 	//
 	int    nimagesfound  [nsets][MAXIMPERSOURCE]; // number of images found from the theoretical sources
 	int    locimagesfound[nsets];
 	//int*    locimagesfound = (int*) malloc(nsets*sizeof(int));
 	struct point image_pos [nsets][MAXIMPERSOURCE];
 	//struct point* image_pos = (struct point*) malloc(nsets*MAXIMPERSOURCE*sizeof(struct point));	
 	//double image_dist    [nsets][runmode->nimagestot][MAXIMPERSOURCE];
 	struct point tim     [MAXIMPERSOURCE]; // theoretical images (computed from sources)
 	int grid_size     = runmode->nbgridcells;
 	int world_size = 1;
 	int world_rank = 0;
 #ifdef __WITH_MPI
 #warning "using MPI..."
 	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 	MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	unsigned int verbose = (world_rank == 0);
 	//
 	//const double dx   = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
 	//const double dy   = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);
 	//
 	const int grid_dim = runmode->nbgridcells;
 	double time = -myseconds();
 	double grad_time = -myseconds();
 	PUSH_RANGE("gradient_grid_GPU_UM", 1);
 	gradient_grid_GPU_UM(grid_gradient_x, grid_gradient_y, frame, lens, runmode->nhalos, dx, dy, grid_size, y_bound, 0, y_pos_loc);
 //#ifdef __WITH_MPI
 //	MPI_Barrier(MPI_COMM_WORLD);
 //#endif
 	//
 	int index             = 0;       // index tracks the image within the total image array in the image plane
 	*chi  		      = 0;
 	//
 	double chi2_time      = 0.;
 	double delense_time   = 0.;
 	double image_time     = 0.;
 	//
 	int images_found      = 0;
 	long int images_total = 0;
 	//
 	//printf("@@nsets = %d nx = %d ny = %d, xmin = %f, dx = %f, ymin = %f, dy = %f\n", nsets, runmode->nbgridcells, runmode->nbgridcells, frame->xmin, dx, frame->ymin, dy );
 	//
 	//
 	time = -myseconds();
 	//
 	// nsets     : number of images in the source plane
 	// nimagestot: number of images in the image plane
 	//
 	//
 	// Image Lensing
 	//
 	PUSH_RANGE("image construction", 2);
 	image_time -= myseconds();
 	//
 	//unsigned int nsets = nsets;
 	//
 	for( int  source_id = 0; source_id < nsets; source_id ++)
 	{
 		// number of images in the image plane for the specific image (1,3,5...)
 		unsigned short int nimages = nimages_strongLensing[source_id];
 		//printf("@@ source_id = %d, nimages = %d\n",  source_id, nimages_strongLensing[source_id]);
 		//Init sources
 		sources[source_id].center.x = sources[source_id].center.y = 0.;
 		//____________________________ image (constrains) loop ________________________________
 		for(unsigned short int image_id = 0; image_id < nimages; image_id++)
 		{
 			//
 			struct galaxy sources_temp;
 			struct point Grad = module_potentialDerivatives_totalGradient_SOA(&images[index + image_id].center, lens, runmode->nhalos);
 			//
 			// find the position of the constrain in the source plane
 			sources_temp.center.x = images[index + image_id].center.x - images[index + image_id].dr*Grad.x;
 			sources_temp.center.y = images[index + image_id].center.y - images[index + image_id].dr*Grad.y;
 			//std::cerr << "Image " << index + image_id << "x: " << images[index + image_id].center.x << " y: "<<images[index + image_id].center.y  << "Grad.x: " << Grad.x << " Grad.y: "<<Grad.y << std::endl;
 
 			//________ Adding up for barycenter comp _________
 			sources[source_id].center.x += sources_temp.center.x;
 			sources[source_id].center.y += sources_temp.center.y;
 			//time += myseconds();
 			sources[source_id].redshift = 0.;
 			sources[source_id].shape.a = sources[source_id].shape.b = sources[source_id].shape.theta = (type_t) 0.;
 			sources[source_id].mag = 0.;
 			//
 		}
 		//________ Dividing by nimages for final barycenter result _________
 		sources[source_id].center.x /= nimages;
 		sources[source_id].center.y /= nimages;
 
 		sources[source_id].redshift = images[index].redshift;
 		sources[source_id].dr       = images[index].dr;
 		sources[source_id].dls      = images[index].dls;
 		sources[source_id].dos      = images[index].dos;
 		index += nimages;
 	}
 	cudaMemPrefetchAsync(&sources[0], MAXIMPERSOURCE*sizeof(struct galaxy), 0);
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	image_time += myseconds();
 	POP_RANGE; // image construction
 	cudaDeviceSynchronize();
 	grad_time += myseconds();
 	POP_RANGE; // gradient computation
 	//
 	// Delensing
 	//
 	PUSH_RANGE("delense barycenter", 3);
 	delense_time -= myseconds();
 	index 	      = 0;
 	int numimg    = 0;
 	delense_barycenter_GPU(&image_pos[0][0], MAXIMPERSOURCE, &locimagesfound[0], &numimg, runmode, lens, frame, nimages_strongLensing, sources, grid_gradient_x, grid_gradient_y, y_pos_loc, y_bound);
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	delense_time += myseconds();
 	POP_RANGE;
 	//
 	// Communications
 	//
 	PUSH_RANGE("delense communication", 4);
 	double comm_time = -myseconds();
 	//
 	int          numimagesfound    [nsets];
 	struct point imagesposition    [nsets][MAXIMPERSOURCE];
 	//
 	memset(&numimagesfound, 0, nsets*sizeof(int));
 	memset(&imagesposition, 0, nsets*MAXIMPERSOURCE*sizeof(point));
 	//
 	delense_comm(&numimagesfound[0], MAXIMPERSOURCE, &imagesposition[0][0],  &numimg, nimages_strongLensing, &locimagesfound[0], &image_pos[0][0], nsets, world_rank, world_size);
 	//
 	comm_time += myseconds();
 	POP_RANGE;
 	//
 	// image extraction to compute
 	//
 	PUSH_RANGE("chi computation", 5);
 	chi2_time = -myseconds();
 	if (verbose)
 		chi_computation(chi, &images_found, &numimagesfound[0], &imagesposition[0][0], MAXIMPERSOURCE, nimages_strongLensing, images, nsets);
 
 #ifdef __WITH_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	//
 	chi2_time += myseconds();
 	POP_RANGE;
 	//
 	// All done
 	//
 	time      += myseconds();
 	//
 	if (verbose)
 	{
 		//
 		//		int nthreads = 1;
 		//
 		//#pragma omp parallel
 		//		nthreads = omp_get_num_threads();
 		//
 		printf("	overall time  = %f s.\n", time);
 		printf("		- grad    time = %f s.\n", grad_time);
 		printf("		- image   time = %f s.\n", image_time);
 		printf("		- delense time = %f s.\n", delense_time);
 		printf("		- comm    time = %f s.\n", comm_time);
 		printf("		- chi2    time = %f s.\n", chi2_time);
 		//
 		printf("	images found: %d out of %ld\n", images_found, images_total);
 	}
 	//
 	PUSH_RANGE("Memory deallocation", 6);
 	POP_RANGE;
 	POP_RANGE;
 }
diff --git a/src/gradient.cpp b/src/gradient.cpp
index 75a29ab..c51e745 100644
--- a/src/gradient.cpp
+++ b/src/gradient.cpp
@@ -1,993 +1,992 @@
 /**
 Lenstool-HPC: HPC based massmodeling software and Lens-map generation
 Copyright (C) 2017  Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch), Gilles Fourestey (gilles.fourestey@epfl.ch)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 @brief: Function for single order computation on CPUs
 
 */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
-#include <immintrin.h>
 #include <map>
 /*
 #ifdef __AVX__
 #include "simd_math_avx.h"
 #endif
 #ifdef __AVX512F__
 #include "simd_math_avx512f.h"
 #endif
 */
 #include "structure_hpc.hpp"
 #include "gradient.hpp"
 #include "utils.hpp"
 
 #ifdef __WITH_MARLA
 #warning "using marla"
 #include <exp.h>
 #include <log.h>
 #include <atan.h>
 #define exp marla_exp
 #define log marla_log
 #define atan2 marla_atan2
 #endif
 
 //#include "iacaMarks.h"
 //
 //
 //
 /**@brief Return the gradient of the projected lens potential for one clump
  * !!! You have to multiply by dlsds to obtain the true gradient
  * for the expressions, see the papers : JP Kneib & P Natarajan, Cluster Lenses, The Astronomy and Astrophysics Review (2011) for 1 and 2
  * and JP Kneib PhD (1993) for 3
  * 
  * @param pImage 	point where the result is computed in the lens plane
  * @param lens		mass distribution
  */
 
 
 //
 /// Useful functions
 //
 complex
 piemd_1derivatives_ci05(type_t x, type_t y, type_t eps, type_t rc)
 {
         type_t  sqe, cx1, cxro, cyro, rem2;
         complex zci, znum, zden, zis, zres;
         type_t norm;
         //
         //std::cout << "piemd_lderivatives" << std::endl;
         sqe  = sqrt(eps);
         cx1  = ((type_t) 1. - eps) / ((type_t) 1. + eps);
         cxro = ((type_t) 1. + eps) * ((type_t) 1. + eps);
         cyro = ((type_t) 1. - eps) * ((type_t) 1. - eps);
         //
         rem2 = x * x / cxro + y * y / cyro;
         /*zci=cpx(0.,-0.5*(1.-eps*eps)/sqe);
           znum=cpx(cx1*x,(2.*sqe*sqrt(rc*rc+rem2)-y/cx1));
           zden=cpx(x,(2.*rc*sqe-y));
           zis=pcpx(zci,lncpx(dcpx(znum,zden)));
           zres=pcpxflt(zis,b0);*/
 
         // --> optimized code
         zci.re  = (type_t)  0.;
         zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
         znum.re = cx1 * x;
         znum.im = (type_t) 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
         zden.re = x;
         zden.im = (type_t) 2.*rc * sqe - y;
         norm = zden.re * zden.re + zden.im * zden.im;     // zis = znum/zden
         zis.re = (znum.re * zden.re + znum.im * zden.im) / norm;
         zis.im = (znum.im * zden.re - znum.re * zden.im) / norm;
         norm = zis.re;
         zis.re = log(sqrt(norm * norm + zis.im * zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
         zis.im = atan2(zis.im, norm);
         //  norm = zis.re;
         zres.re = zci.re * zis.re - zci.im * zis.im;   // Re( zci*ln(zis) )
         zres.im = zci.im * zis.re + zis.im * zci.re;   // Im( zci*ln(zis) )
         //
         //zres.re = zis.re*b0;
         //zres.im = zis.im*b0;
         //
         return zres;
 }
 //
 //// changes the coordinates of point P into a new basis (rotation of angle theta)
 ////      y'      y    x'
 ////       *      |   /
 ////         *    |  /  theta
 ////           *  | /
 ////             *|--------->x
 /*
 inline
 struct point rotateCoordinateSystem(struct point P, double theta)
 {
 	struct  point   Q;
 
 	Q.x = P.x*cos(theta) + P.y*sin(theta);
 	Q.y = P.y*cos(theta) - P.x*sin(theta);
 
 	return(Q);
 }
 */
 //
 //
 struct point
 grad_halo(const struct point *pImage, const struct Potential *lens)
 {
 	struct point true_coord, true_coord_rot, result;
 	type_t X, Y, R, angular_deviation, u;
 	complex zis;
 	//
 	result.x = result.y = (type_t) 0.;
 	//
 	/*positionning at the potential center*/
 	// Change the origin of the coordinate system to the center of the clump
 	true_coord.x   = pImage->x - lens->position.x;  
 	true_coord.y   = pImage->y - lens->position.y;
 	//printf("x, y = %f, %f\n", lens->position.x, lens->position.y);
 	//
 	true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle);
 	switch (lens->type)
 	{
 		case 5 : /*Elliptical Isothermal Sphere*/
 			/*rotation of the coordiante axes to match the potential axes*/
 			//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle);
 			//
 			R=sqrt(true_coord_rot.x*true_coord_rot.x*((type_t) 1. - lens->ellipticity_potential) + true_coord_rot.y*true_coord_rot.y*((type_t) 1. + lens->ellipticity_potential));	//ellippot = ellipmass/3
 			result.x = ((type_t) 1. - lens->ellipticity_potential)*lens->b0*true_coord_rot.x/(R);
 			result.y = ((type_t) 1. + lens->ellipticity_potential)*lens->b0*true_coord_rot.y/(R);
 			break;
 		case 8 : /* PIEMD */
 			/*rotation of the coordiante axes to match the potential axes*/
 			/*Doing something....*/
 			complex zis;
 			zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore);
 			//
 			result.x = lens->b0*zis.re;
 			result.y = lens->b0*zis.im;
 			break;
 		case 81 : //PIEMD Kassiola & Kovner,1993 I0.5c-I0.5cut
 			//
 			type_t t05;
 			if ( lens->ellipticity_potential > (type_t) 2E-4 )
 			{
 				t05 = lens->b0*lens->rcut/(lens->rcut - lens->rcore);
 				complex zis     = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore);
 				complex zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcut);
 				result.x = t05 * (zis.re - zis_cut.re);
 				result.y = t05 * (zis.im - zis_cut.im);
 				//printf("	g   = %f %f\n", result.x, result.y);
 			}
 			else if (( u = true_coord_rot.x*true_coord_rot.x + true_coord_rot.y*true_coord_rot.y) > (type_t) 0. )
 			{
 				// Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05
 				X = lens->rcore;
 				Y = lens->rcut;
 				t05  = sqrt(u + X * X) - X - sqrt(u + Y * Y) + Y;  // Faster and equiv to Elliasdottir (see Golse PhD)
 				t05 *= lens->b0 * Y / (Y - X) / u; // 1/u because t05/sqrt(u) and normalised Q/sqrt(u)
 				result.x = t05*true_coord_rot.x;
 				result.y = t05*true_coord_rot.y;
 			}
 			else
 			{
 				result.x = (type_t) 0.;
 				result.y = (type_t) 0.;
 			}
 			//
 			break;
 		default:
 			std::cout << "ERROR: Grad 1 profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl;
 			break;
 	};
 	result = rotateCoordinateSystem(result, -lens->ellipticity_angle); 
 	return result;
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient(const int nhalos, const struct point* pImage, const struct Potential* lens)
 {
 	struct point grad, clumpgrad;
 	//
 	grad.x = (type_t) 0.;
 	grad.y = (type_t) 0.;
 	//
 	for(int i = 0; i < nhalos; i++)
 	{
 		clumpgrad = grad_halo(pImage, &lens[i]);  //compute gradient for each clump separately
 		//std::cout << clumpgrad.x << " " << clumpgrad.y << std::endl;
 		//nan check
 		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
 		{ 
 			// add the gradients
 			grad.x += clumpgrad.x;
 			grad.y += clumpgrad.y;
 		}  
 	}
 	//	
 	return(grad);
 }
 //
 // SOA versions, vectorizable
 //
 struct point module_potentialDerivatives_totalGradient_5_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA begins\n");
 	//
 	struct point grad, result;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//
 		struct point true_coord, true_coord_rotation;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//
 		true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		type_t ell_pot = lens->ellipticity_potential[i];
                 //
 		type_t R = sqrt(true_coord_rotation.x*true_coord_rotation.x*((type_t) 1. - ell_pot) + true_coord_rotation.y*true_coord_rotation.y*((type_t) 1. + ell_pot));
 		//
 		result.x = ((type_t) 1. - ell_pot)*lens->b0[i]*true_coord_rotation.x/R;
 		result.y = ((type_t) 1. + ell_pot)*lens->b0[i]*true_coord_rotation.y/R;
 		//
 		result = rotateCoordinateSystem(result, -lens->ellipticity_angle[i]);
 		//
 		grad.x += result.x;
 		grad.y += result.y;
 		
 	}
 	return grad;
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_5_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
         //
         struct point grad, result;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
 			struct point true_coord, true_coord_rotation;
 			//
 			true_coord.x = pImage->x - lens->position_x[i];
 			true_coord.y = pImage->y - lens->position_y[i];
 			//
 			//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 			type_t b0 = lens->b0[i];
 			//
 			type_t cose = lens->anglecos[i];
 			type_t sine = lens->anglesin[i];
 			//
 			type_t x = true_coord.x*cose + true_coord.y*sine;
 			type_t y = true_coord.y*cose - true_coord.x*sine;
 			//
 			type_t ell_pot = lens->ellipticity_potential[i];
 			//
 			type_t val = x*x*((type_t) 1. - ell_pot) + y*y*((type_t) 1. + ell_pot);
 			type_t R   = 1./sqrtf(val);
 			R = R*(1.5 - 0.5*val*R*R); 
 			result.x = ((type_t) 1. - ell_pot)*b0*x*R;
 			result.y = ((type_t) 1. + ell_pot)*b0*y*R;
 			//
 			grad.x += result.x*cose - result.y*sine;
 			grad.y += result.y*cose + result.x*sine;
 
 			//grad.x = x/R;
 			//grad.y = y/R;
         }
         return grad;
 }
 #if 0
 struct point module_potentialDerivatives_totalGradient_5_SOA_print(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int index)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
         //
         struct point grad, result;
     	std::ofstream myfile;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
 
 #ifdef _double
         std::string name = "Double_";
 #else
         std::string name = "Float_";
 #endif
 
         for(int i = shalos; i < shalos + nhalos; i++)
         {
 		//
 			struct point true_coord, true_coord_rotation;
 			//
 			true_coord.x = pImage->x - lens->position_x[i];
 			true_coord.y = pImage->y - lens->position_y[i];
 			////
 			/*
         	myfile.open (name + "pImage->x_1.txt", std::ios_base::app);
         	myfile << index << " " << pImage->x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
         	myfile.open (name + "pImage->y_1.txt", std::ios_base::app);
         	myfile << index << " " << pImage->y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
 
         	myfile.open (name + "true_coord.x_2.txt", std::ios_base::app);
         	myfile << index << " " << true_coord.x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
         	myfile.open (name + "true_coord.y_2.txt", std::ios_base::app);
         	myfile << index << " " << true_coord.y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 			*///
 			type_t cose = lens->anglecos[i];
 			type_t sine = lens->anglesin[i];
 			////
 
         	myfile.open (name + "lens->anglecos[i]_3.txt", std::ios_base::app);
         	myfile << index << " " << lens->anglecos[i] << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
         	myfile.open (name + "lens->anglesin[i]_3.txt", std::ios_base::app);
         	myfile << index << " " << lens->anglesin[i] << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 			////
 			type_t x = true_coord.x*cose + true_coord.y*sine;
 			type_t y = true_coord.y*cose - true_coord.x*sine;
 			////
 			/*
         	myfile.open (name + "x_4.txt", std::ios_base::app);
         	myfile << index << " " << x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	/
         	myfile.open (name + "y_4.txt", std::ios_base::app);
         	myfile << index << " " << y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	*////
 			type_t ell_pot = lens->ellipticity_potential[i];
 			//
 			type_t R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot));
 			//
 			result.x = (1 - ell_pot)*lens->b0[i]*x/R;
 			result.y = (1 + ell_pot)*lens->b0[i]*y/R;
 			////
 			/*
         	myfile.open (name + "ell_pot_5.txt", std::ios_base::app);
         	myfile << index << " " << ell_pot << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	//
         	myfile.open (name + "R_5.txt", std::ios_base::app);
         	myfile << index << " " << R << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	myfile.open (name + "x_6.txt", std::ios_base::app);
         	myfile << index << " " << x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 
         	myfile.open (name + "y_6.txt", std::ios_base::app);
         	myfile << index << " " << y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
         	*///
 			grad.x += result.x*cose - result.y*sine;
 			grad.y += result.y*cose + result.x*sine;
 			////
 			/*
         	myfile.open (name + "grad.x_7.txt", std::ios_base::app);
         	myfile << index << " " << grad.x << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 
         	myfile.open (name + "grad.y_7.txt", std::ios_base::app);
         	myfile << index << " " << grad.y << std::setprecision(7)  << " " << std::endl;
         	myfile.close();
 			*////
         }
         return grad;
 }
 #endif
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_5_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins");
         //
         struct point grad, result;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
 #pragma novector
 	for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //
                 struct point true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 //
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //:
                 type_t R = sqrt(x*x*((type_t) 1. - lens->ellipticity[i]/(type_t) 3.) + y*y*((type_t) 1. + lens->ellipticity[i]/(type_t) 3.));
 
                 result.x = ((type_t) 1. - lens->ellipticity[i]/(type_t) 3.)*lens->b0[i]*x/R;
                 result.y = ((type_t) 1. + lens->ellipticity[i]/(type_t) 3.)*lens->b0[i]*y/R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
         }
         return grad;
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_8_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
 	asm volatile("# module_potentialDerivatives_totalGradient_SOA begins");
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	// 
 	struct point grad, clumpgrad;
 	grad.x = (type_t) 0.;
 	grad.y = (type_t) 0.;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//IACA_START;
 		//
 		struct point true_coord, true_coord_rot; //, result;
 		//type_t       R, angular_deviation;
 		complex      zis;
 		//
 		//result.x = result.y = 0.;
 		//
 		//@@printf("image_x = %f image_y = %f\n",  pImage->x, pImage->y);	
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//printf("x = %f y = %f\n",  true_coord.x, true_coord.y);	
 		/*positionning at the potential center*/
 		// Change the origin of the coordinate system to the center of the clump
 		true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		//
 		type_t x   = true_coord_rot.x;
 		type_t y   = true_coord_rot.y;
 		//@@printf("x = %f y = %f\n",  x, y);	
 		type_t eps = lens->ellipticity_potential[i];
 		type_t rc  = lens->rcore[i];
 		//
 		//std::cout << "piemd_lderivatives" << std::endl;
 		//
 		type_t sqe  = sqrt(eps);
 		//
 		type_t cx1  = ((type_t) 1. - eps)/((type_t) 1. + eps);
 		type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps);
 		type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps);
 		//
 		type_t rem2 = x*x/cxro + y*y/cyro;
 		//
 		complex zci, znum, zden, zres;
 		type_t norm;
 		//	
 		zci.re  = (type_t) 0.;
 		zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
 		//@@printf("zci = %f %f\n", zci.re, zci.im);	
 		//
 		znum.re = cx1*x;
 		znum.im = (type_t) 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
 		//
 		zden.re = x;
 		zden.im = (type_t) 2.*rc*sqe - y;
 		norm    = (zden.re*zden.re + zden.im*zden.im);     // zis = znum/zden
 		//@@printf("norm = %f\n", norm);
 		//
 		zis.re  = (znum.re*zden.re + znum.im*zden.im)/norm;
 		zis.im  = (znum.im*zden.re - znum.re*zden.im)/norm;
 		//@@printf("zis = %f %f\n", zis.re, zis.im);	
 		norm    = zis.re;
 		zis.re  = log(sqrt(norm*norm + zis.im*zis.im));  // ln(zis) = ln(|zis|)+i.Arg(zis)
 		zis.im  = atan2(zis.im, norm);
 		//@@printf("y,x = %f %f\n", zis.im, norm);	
 		//  norm = zis.re;
 		zres.re = zci.re*zis.re - zci.im*zis.im;   // Re( zci*ln(zis) )
 		zres.im = zci.im*zis.re + zis.im*zci.re;   // Im( zci*ln(zis) )
 		//
 		//@@printf("zres: %f %f\n", zres.re, zres.im); 
 		//
 		zis.re  = zres.re;
 		zis.im  = zres.im;
 		//
 		//zres.re = zis.re*b0;
 		//zres.im = zis.im*b0;
 		// rotation
 		clumpgrad.x = zis.re;
 		clumpgrad.y = zis.im;
 		clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]);
 		//
 		clumpgrad.x = lens->b0[i]*clumpgrad.x;
 		clumpgrad.y = lens->b0[i]*clumpgrad.y;
 		//
 		//clumpgrad.x = lens->b0[i]*zis.re;
 		//clumpgrad.y = lens->b0[i]*zis.im;
 		//nan check
 		//if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y)
 		//{
 			// add the gradients
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		//@@printf("grad: %f %f\n", grad.x, grad.y); 
 		//@@std::cout << "grad.x = " << grad.x << " grad.y = " << grad.y << std::endl;
 		//}
 	}
 	//IACA_END;
 	//
 	return(grad);
 }
 //
 //
 struct point module_potentialDerivatives_totalGradient_8_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl;
 	//
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = 0;
         grad.y = 0;
 	//printf("%d %d\n", shalos, nhalos);
         //
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 complex      zis;
                 type_t b0   = lens->b0[i];
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 //
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
 		//
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //
                 type_t eps = lens->ellipticity_potential[i];
                 type_t rc  = lens->rcore[i];
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = ((type_t) 1. - eps)/((type_t) 1. + eps);
                 type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps);
                 type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres;
                 type_t norm;
                 //
                 zci.re  = (type_t) 0.;
                 zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
                 //
 		KERNEL(rc, zres)
 		//
                 grad.x += b0*(zres.re*cose - zres.im*sine);
                 grad.y += b0*(zres.im*cose + zres.re*sine);
                 //
         }
         //IACA_END;
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_8_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens
 , int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl;
         //
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
         //printf("%d %d\n", shalos, nhalos);
         //
 #pragma novector
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 complex      zis;
                 type_t b0   = lens->b0[i];
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 //
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 //
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //
                 type_t eps = lens->ellipticity_potential[i];
                 type_t rc  = lens->rcore[i];
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = ((type_t) 1. - eps)/((type_t) 1. + eps);
                 type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps);
                 type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres;
                 type_t norm;
                 //
                 zci.re  = (type_t) 0.;
                 zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
                 //
                 KERNEL(rc, zres)
                 //
                 grad.x += b0*(zres.re*cose - zres.im*sine);
                 grad.y += b0*(zres.im*cose + zres.re*sine);
                 //
         }
         //IACA_END;
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_81_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
 	//std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl;
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         // 
         struct point grad, clumpgrad;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 //type_t       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 /*positionning at the potential center*/
                 // Change the origin of the coordinate system to the center of the clump
                 true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 //
                 type_t x    = true_coord_rot.x;
                 type_t y    = true_coord_rot.y;
                 type_t eps  = lens->ellipticity_potential[i];
                 type_t rc   = lens->rcore[i];
                 type_t rcut = lens->rcut[i];
 		type_t b0   = lens->b0[i];
 		type_t t05  = b0*rcut/(rcut - rc);
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = ((type_t) 1. - eps)/((type_t) 1. + eps);
                 type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps);
                 type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres_rc, zres_rcut;
                 type_t norm;
                 //      
                 zci.re  = (type_t) 0.;
                 zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
 		// step 1
 		{
 			KERNEL(rc, zres_rc)
 		}
 		// step 2
 		{
 			KERNEL(rcut, zres_rcut)
                 }
 		zis.re  = t05*(zres_rc.re - zres_rcut.re);
 		zis.im  = t05*(zres_rc.im - zres_rcut.im); 
                 // rotation
                 clumpgrad.x = zis.re;
                 clumpgrad.y = zis.im;
                 clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]);
                 //
                 grad.x += clumpgrad.x;
                 grad.y += clumpgrad.y;
                 //}
         }
         //IACA_END;
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_81_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl;
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         //
         struct point grad, clumpgrad;
         grad.x = (type_t) 0.;
         grad.y = (type_t) 0.;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 //type_t       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 /*positionning at the potential center*/
                 // Change the origin of the coordinate system to the center of the clump
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
 		//
                 type_t x    = true_coord.x*cose + true_coord.y*sine;
                 type_t y    = true_coord.y*cose - true_coord.x*sine;
 		//
                 type_t eps  = lens->ellipticity_potential[i];
                 type_t rc   = lens->rcore[i];
                 type_t rcut = lens->rcut[i];
                 type_t b0   = lens->b0[i];
                 type_t t05  = b0*rcut/(rcut - rc);
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = ((type_t) 1. - eps)/((type_t) 1. + eps);
                 type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps);
                 type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres_rc, zres_rcut;
                 type_t norm;
                 //
                 zci.re  = (type_t)  0.;
                 zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
 		//
                 // step 1
                 {
 			KERNEL(rc, zres_rc)
                 }
                 // step 2
                 {
 			KERNEL(rcut, zres_rcut)
                 }
                 zis.re  = t05*(zres_rc.re - zres_rcut.re);
                 zis.im  = t05*(zres_rc.im - zres_rcut.im);
                 // rotation
 		grad.x += (zis.re*cose - zis.im*sine);
                 grad.y += (zis.im*cose + zis.re*sine);
                 //
         }
         //
         return(grad);
 }
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_81_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins");
         //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl;
         // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
         struct point grad, clumpgrad;
         grad.x = 0.;
         grad.y = 0.;
 #pragma novector
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //IACA_START;
                 //
                 struct point true_coord, true_coord_rot; //, result;
                 //type_t       R, angular_deviation;
                 complex      zis;
                 //
                 //result.x = result.y = 0.;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 /*positionning at the potential center*/
                 // Change the origin of the coordinate system to the center of the clump
                 type_t cose = lens->anglecos[i];
                 type_t sine = lens->anglesin[i];
                 type_t x = true_coord.x*cose + true_coord.y*sine;
                 type_t y = true_coord.y*cose - true_coord.x*sine;
                 //
                 type_t eps  = lens->ellipticity_potential[i];
                 type_t rc   = lens->rcore[i];
                 type_t rcut = lens->rcut[i];
                 type_t b0   = lens->b0[i];
                 type_t t05  = b0*rcut/(rcut - rc);
                 //
                 type_t sqe  = sqrt(eps);
                 //
                 type_t cx1  = ((type_t) 1. - eps)/((type_t) 1. + eps);
                 type_t cxro = ((type_t) 1. + eps)*((type_t) 1. + eps);
                 type_t cyro = ((type_t) 1. - eps)*((type_t) 1. - eps);
                 //
                 type_t rem2 = x*x/cxro + y*y/cyro;
                 //
                 complex zci, znum, zden, zres_rc, zres_rcut;
                 type_t norm;
                 //
                 zci.re  = (type_t) 0.;
                 zci.im  = (type_t) -0.5*((type_t) 1. - eps*eps)/sqe;
                 //
                 // step 1
                 {
                         KERNEL(rc, zres_rc)
                 }
                 // step 2
                 {
                         KERNEL(rcut, zres_rcut)
                 }
                 zis.re  = t05*(zres_rc.re - zres_rcut.re);
                 zis.im  = t05*(zres_rc.im - zres_rcut.im);
                 // rotation
                 grad.x += (zis.re*cose - zis.im*sine);
                 grad.y += (zis.im*cose + zis.re*sine);
                 //
         }
         //
         return(grad);
 }
 //
 //
 //
 typedef struct point (*halo_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); 
 halo_func_t halo_func[100] = 
 { 
 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2,  0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0,  module_potentialDerivatives_totalGradient_81_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos)
 {
         struct point grad, clumpgrad;
         //
         grad.x = clumpgrad.x = (type_t) 0.;
         grad.y = clumpgrad.y = (type_t) 0.;
 	//
 	int shalos = 0;
 
 	while (shalos < nhalos)
 	{
 		int lens_type = lens->type[shalos];
 		int count     = 1;
 		while (lens->type[shalos + count] == lens_type and shalos + count < nhalos){
 			//int i = count;
 			//std::cerr << lens->position_x[i] << lens->position_y[i] << lens->anglecos[i]<< lens->anglesin[i]<< lens->ellipticity_potential[i] <<  lens->rcore[i] << lens->rcut[i] << lens->b0[i] << std::endl;
 			//std::cerr << "lens->type[shalos + count] = " << lens->type[shalos + count] << " " << lens_type << " " << lens_type << " " << " count = " << count << std::endl;
 			count++;
 		}
 
 		//	
 		clumpgrad = (*halo_func[lens_type])(pImage, lens, shalos, count);
 		//
 		//std::cerr << lens->position_x[i] << lens->position_y[i] << lens->anglecos[i]<< lens->anglesin[i]<< lens->ellipticity_potential[i] <<  lens->rcore[i] << lens->rcut[i] << lens->b0[i] << std::endl;
 		//std::cerr << "type = " << lens_type << " " << count << " " << nhalos << " " << " grad.x = " << clumpgrad.x << " grad.y = " << clumpgrad.y << std::endl;
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		shalos += count;
 	}	
 
         return(grad);
 }
 //
 typedef struct point (*halo_func_t_novec) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 //
 halo_func_t_novec halo_func_novec[100] =
 {
 	0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2_novec, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2_novec,  0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0,  module_potentialDerivatives_totalGradient_81_SOA_v2_novec, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 struct point module_potentialDerivatives_totalGradient_SOA_novec(const struct point *pImage, const struct Potential_SOA *lens, int nhalos)
 {
 	struct point grad, clumpgrad;
 	//
 	grad.x = clumpgrad.x = 0;
 	grad.y = clumpgrad.y = 0;
 	//
 	int shalos = 0;
 	//
 	//module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos);
 	//return;
 	/*
 	   int* p_type = &(lens->type)[0];
 	   int* lens_type = (int*) malloc(nhalos*sizeof(int));
 	   memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int));
 	   */
 	//quicksort(lens_type, nhalos);
 	//
 	while (shalos < nhalos)
 	{
 		int lens_type = lens->type[shalos];
 		int count     = 1;
 		while (lens->type[shalos + count] == lens_type) count++;
 		//std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl;
 		//
 		clumpgrad = (*halo_func_novec[lens_type])(pImage, lens, shalos, count);
 		//
 		grad.x += clumpgrad.x;
 		grad.y += clumpgrad.y;
 		shalos += count;
 	}
 
 	return(grad);
 }
 
 
 
diff --git a/src/gradient2.cpp b/src/gradient2.cpp
index b728ed2..17d9dcb 100644
--- a/src/gradient2.cpp
+++ b/src/gradient2.cpp
@@ -1,258 +1,257 @@
 /**
 Lenstool-HPC: HPC based massmodeling software and Lens-map generation
 Copyright (C) 2017  Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch), Gilles Fourestey (gilles.fourestey@epfl.ch)
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 @brief: Function for second order computation on CPU to compare with GPUs
 
 */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
-#include <immintrin.h>
 #include <map>
 /*
 #ifdef __AVX__
 #include "simd_math_avx.h"
 #endif
 #ifdef __AVX512F__
 #include "simd_math_avx512f.h"
 #endif
 */
 #include "structure_hpc.hpp"
 #include "gradient2.hpp"
 
 //lenstool fct
 /*
 * derivates of I0.5 KK
 * Parameters :
 * - (x,y) is the computation position of the potential
 * - eps is the ellepticity (a-b)/(a+b)
 * - rc is the core radius
 * - b0 asymptotic Einstein radius E0. (6pi*vdisp^2/c^2)
 *
 * Return a the 4 second derivatives of the PIEMD potential
 */
 inline
 void mdci05_hpc(type_t x, type_t y, type_t eps, type_t rc, type_t b0, struct matrix *res)
 {
 	type_t  ci, sqe, cx1, cxro, cyro, wrem;
 	type_t  didyre, didyim, didxre;		// didxim;
 	type_t  cx1inv, den1, num2, den2;
 	//
 	sqe    = sqrt(eps);
 	cx1    = (1. - eps)/(1. + eps);
 	cx1inv = 1./cx1;
 	//
 	cxro   = (1. + eps)*(1. + eps);     /* rem^2=x^2/(1+e^2) + y^2/(1-e^2) Eq 2.3.6*/
 	cyro   = (1. - eps)*(1. - eps);
 	ci     = 0.5*(1. - eps * eps) / sqe;
 	wrem   = sqrt(rc*rc + x*x/cxro + y*y/cyro); /*wrem^2=w^2+rem^2 with w core radius*/
 	den1   = 2.*sqe*wrem - y*cx1inv;
 	den1   = cx1*cx1*x*x + den1*den1;
 	num2   = 2.*rc*sqe - y;
 	den2   = x*x + num2*num2;
 	//
 	didxre = ci*( cx1*(2.*sqe*x*x/cxro/wrem - 2.*sqe*wrem + y*cx1inv)/den1 + num2/den2 );
 	didyre = ci*( (2.*sqe*x*y*cx1/cyro/wrem - x)/den1 + x/den2 );
 	didyim = ci * ( (2.* sqe * wrem * cx1inv - y * cx1inv * cx1inv - 4.*eps*y/cyro +
 				2.* sqe * y * y / cyro / wrem * cx1inv)/den1 - num2/den2 );
 	//
 	res->a = b0 * didxre;
 	res->b = res->d = b0 * didyre; //(didyre+didxim)/2.;
 	res->c = b0 * didyim;
 	//  return(res);
 }
 //
 void printmat(matrix A){
 	std::cerr << A.a << " " << A.b << " " << A.c << " " << A.d <<  std::endl;
 }
 
 //
 // SOA versions, vectorizable
 //
 //
 struct matrix module_potentialDerivatives_totalGradient2_81_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
 	asm volatile("# module_potentialDerivatives_totalGradient2_81_SOA begins");
 	//std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl;
 	// 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0
 	//
 	type_t t05;
 	type_t RR;
 	//
 	struct matrix grad2, clump, clumpcore, clumpcut;
 	struct point true_coord, true_coord_rotation;
 	//
 	grad2.a = clump.a = 0;
 	grad2.b = clump.b = 0;
 	grad2.c = clump.c = 0;
 	grad2.d = clump.d = 0;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//if (lens->ellipticity_potential[i] < 0.0001) printf("halo = %d, ellipticity_potential = %f\n", i, lens->ellipticity_potential[i]);
 		//if(lens->ellipticity_potential[i] > 0.0001)
 		if (true)
 		{
 			//True coord
 			true_coord.x = pImage->x - lens->position_x[i];
 			true_coord.y = pImage->y - lens->position_y[i];
 			//
 			//std::cerr << "start" << std::endl;
 			//Rotation
 			type_t cose = lens->anglecos[i];
 			type_t sine = lens->anglesin[i];
 			//
 			type_t x = true_coord.x*cose + true_coord.y*sine;
 			type_t y = true_coord.y*cose - true_coord.x*sine;
 			// 81 comput
 			t05 = lens->rcut[i] / (lens->rcut[i] - lens->rcore[i]);
 			mdci05_hpc(x, y, lens->ellipticity_potential[i], lens->rcore[i], lens->b0[i], &clumpcore);
 			mdci05_hpc(x, y, lens->ellipticity_potential[i], lens->rcut[i], lens->b0[i], &clumpcut);
 
 			//printf("X %f Y %f Ga:  %f %f\n",true_coord.x  ,true_coord.y , clumpcore.a, clumpcut.a);
 			//printf("X %f Y %f Gb:  %f %f\n",true_coord.x  ,true_coord.y,  clumpcore.b, clumpcut.b);
 			//printf("X %f Y %f Gc:  %f %f\n", true_coord.x  ,true_coord.y,clumpcore.c, clumpcut.c);
 			//printf("X %f Y %f Gd:  %f %f\n",true_coord.x  ,true_coord.y,  clumpcore.d, clumpcut.d);
 			//
 			clumpcore.a = t05 * (clumpcore.a - clumpcut.a);
 			clumpcore.b = t05 * (clumpcore.b - clumpcut.b);
 			clumpcore.c = t05 * (clumpcore.c - clumpcut.c);
 			clumpcore.d = t05 * (clumpcore.d - clumpcut.d);
 			//printmat(clumpcore);
 			//printf("X %f Y %f Ga:  %f %f\n",pImage->x  ,pImage->y , clumpcore.a, t05);
 			//printf("X %f Y %f Gb:  %f %f\n",pImage->x  ,pImage->y,  clumpcore.b, clumpcut.b);
 			//printf("X %f Y %f Gc:  %f %f\n",pImage->x  ,pImage->y, clumpcore.c, clumpcut.c);
 			//printf("X %f Y %f Gd:  %f %f\n",pImage->x  ,pImage->y,  clumpcore.d, clumpcut.d);
 			//rotation matrix  1
 			clumpcut.a = clumpcore.a * cose + clumpcore.b * -sine;
 			clumpcut.b = clumpcore.a * sine + clumpcore.b *  cose;
 			clumpcut.c = clumpcore.d * sine + clumpcore.c *  cose;
 			clumpcut.d = clumpcore.d * cose + clumpcore.c * -sine;
 			//printf("X %f Y %f theta: %f %f %f\n",pImage->x ,pImage->y,lens->ellipticity_angle[i], cose, sine);
 			//printmat(clumpcut);
 			//rotation matrix  2
 			clump.a = cose*clumpcut.a - sine*clumpcut.d;
 			clump.b = cose*clumpcut.b - sine*clumpcut.c;
 			clump.c = sine*clumpcut.b + cose*clumpcut.c;
 			clump.d = sine*clumpcut.a + cose*clumpcut.d;
 			//
 			//printf("X %f Y %f Ga:  %f \n",pImage->x  ,pImage->y , clump.a );
 			//printf("X %f Y %f Gb:  %f \n",pImage->x  ,pImage->y,  clump.b );
 			//printf("X %f Y %f Gc:  %f \n",pImage->x  ,pImage->y, clump.c);
 			//printf("X %f Y %f Gd:  %f \n",pImage->x  ,pImage->y,  clump.d );
 			//printmat(clump);
 			//
 			grad2.a += clump.a;
 			grad2.b += clump.b;
 			grad2.c += clump.c;
 			grad2.d += clump.d;
 		}
 		else if((RR = true_coord.x * true_coord.x + true_coord.y * true_coord.y) > 0.)
 		{
 			// Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05
 			type_t X,Y,z,p,t05;
 			X = lens->rcore[i];
 			Y = lens->rcut[i];
 			t05 = lens->b0[i] * Y / (Y - X); // 1/u because t05/sqrt(u) and normalised Q/sqrt(u)
 			z  = sqrt(RR + X * X) - X - sqrt(RR + Y * Y) + Y;  // R*dphi/dR
 			X = RR / X;
 			Y = RR / Y;
 			p  = (1. - 1. / sqrt(1. + X / lens->rcore[i])) / X - (1. - 1. / sqrt(1. + Y / lens->rcut[i])) / Y;  // d2phi/dR2
 			X = true_coord.x * true_coord.x / RR;
 			Y = true_coord.y * true_coord.y / RR;
 			clump.a = t05 * (p * X + z * Y / RR);
 			clump.c = t05 * (p * Y + z * X / RR);
 			X = true_coord.x * true_coord.y / RR;
 			clump.b = clump.d = t05 * (p * X - z * X / RR);
 
 			grad2.a += clump.a;
 			grad2.b += clump.b;
 			grad2.c += clump.c;
 			grad2.d += clump.d;
 		}
 		else
 		{
 			clump.a = clump.c = lens->b0[i] / lens->rcore[i]/ 2.;
 			clump.b = clump.d = 0.;
 			grad2.a += clump.a;
 			grad2.b += clump.b;
 			grad2.c += clump.c;
 			grad2.d += clump.d;
 		}
 
 	}
 	//
 	return(grad2);
 }
 //
 //This natrix handles the calling of the gradient functions for the different type without losing time to switch or if condition
 //
 typedef struct matrix (*halo_g2_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 halo_g2_func_t halo_g2_func[100] =
 { 
 	0, 0, 0, 0, 0, 0, 0, 0, 0,  0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	0,  module_potentialDerivatives_totalGradient2_81_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 //
 //
 //
 struct matrix module_potentialDerivatives_totalGradient2_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos)
 {
 	struct matrix grad2, clumpgrad;
 	//
 	grad2.a = clumpgrad.a = 0;
 	grad2.b = clumpgrad.b = 0;
 	grad2.c = clumpgrad.c = 0;
 	grad2.d = clumpgrad.d = 0;
 	//
 	int shalos = 0;
 
 	while (shalos < nhalos)
 	{
 		int lens_type = lens->type[shalos];
 		int count     = 1;
 		while (lens->type[shalos + count] == lens_type and shalos + count < nhalos) count++;
 		//std::cerr << "type = " << lens_type << " " << count << " " << shalos << " " << nhalos << " " <<  " func = " << halo_g2_func[lens_type] << std::endl;
 		//	
 		clumpgrad = (*halo_g2_func[lens_type])(pImage, lens, shalos, count);
 		//
 		grad2.a += clumpgrad.a;
 		grad2.b += clumpgrad.b;
 		grad2.c += clumpgrad.c;
 		grad2.d += clumpgrad.d;
 		shalos += count;
 	}	
 
 	return(grad2);
 }
 //