diff --git a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp
index e7e633a..a922194 100644
--- a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp
+++ b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp
@@ -1,420 +1,428 @@
 #include <stdlib.h>
 #include <iomanip>
 #include "grid_gradient_mixed_CPU.hpp"
 #include "structure_mixed.hpp"
 
 //#include <iacaMarks.h>
 
 extern "C"{
 double mysecond()
 {
         struct timeval  tp;
         struct timezone tzp;
         //
         int i = gettimeofday(&tp,&tzp);
         //
         return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
 }
 }
 
 
 //
 //
 //
 //#define SIS_THRESHOLD 0.0092
 //
 //
 //
 //struct point_double module_potentialDerivatives_totalGradient_5_SOA_DP_v2(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo = false)
 //inline
 struct point_double module_potentialDerivatives_totalGradient_5_SOA_DP_v2(const struct point_double *pImage, const struct Potential_Mixed<double>* lens, int shalos, int nhalos, int echo = false)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
         //
 	double one       = 1.;
 	double threehalf = 1.5;
 	double half      = 0.5;
 	//
         struct point_double grad, result;
 	//
         grad.x = (double) 0.;
         grad.y = (double) 0.;
 	//int i = 0*shalos;
 	//
 	//IACA_START;
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //
                 double b0 = lens->b0[i];
                 struct point_double true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 double cose = lens->anglecos[i];
                 double sine = lens->anglesin[i];
                 //
                 double x = true_coord.x*cose + true_coord.y*sine;
                 double y = true_coord.y*cose - true_coord.x*sine;
                 //
                 double ell_pot = lens->ellipticity_potential[i];
                 //
 		double val = x*x*(one - ell_pot) + y*y*(one + ell_pot);
 		//
 		double R   = 1.f/sqrtf(val);
 		R = R*(threehalf - half*val*R*R);
 		//
 		result.x = (one - ell_pot)*b0*x*R;
 		result.y = (one + ell_pot)*b0*y*R;
                 //double R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot));
                 //
                 //result.x = (1 - ell_pot)*lens->b0[i]*x/R;
                 //result.y = (1 + ell_pot)*lens->b0[i]*y/R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
 		//if (echo) printf("coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y);
         }
 	//IACA_END;
         return grad;
 }
 //
 //
 //
 struct point_double module_potentialDerivatives_totalGradient_5_SOA_DP_v2(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo = false)
 {
         asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
         //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
         //
         double one       = 1.;
         double threehalf = 1.5;
         double half      = 0.5;
         //
         struct point_double grad, result;
         //
         grad.x = (double) 0.;
         grad.y = (double) 0.;
         //
         for(int i = shalos; i < shalos + nhalos; i++)
         {
                 //
                 double b0 = lens->b0[i];
                 struct point_double true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 double cose = lens->anglecos[i];
                 double sine = lens->anglesin[i];
                 //
                 double x = true_coord.x*cose + true_coord.y*sine;
                 double y = true_coord.y*cose - true_coord.x*sine;
                 //
                 double ell_pot = lens->ellipticity_potential[i];
                 //
                 double val = x*x*(one - ell_pot) + y*y*(one + ell_pot);
                 //
                 double R   = 1.f/sqrtf(val);
                 R = R*(threehalf - half*val*R*R);
                 //
                 result.x = (one - ell_pot)*b0*x*R;
                 result.y = (one + ell_pot)*b0*y*R;
                 //double R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot));
                 //
                 //result.x = (1 - ell_pot)*lens->b0[i]*x/R;
                 //result.y = (1 + ell_pot)*lens->b0[i]*y/R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
                 //if (echo) printf("coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y);
         }
         return grad;
 }
 //
 //
 //
 //void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart)
 void gradient_grid_general_double_CPU(double* __restrict__ grid_grad_x, double* __restrict__ grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_Mixed<double> *lens, int istart, int jstart)
 {
 	//printf("gradient_grid_general_double_CPU Potential_Mixed<double> %d %d\n", istart, jstart); fflush(stdout);
         const int grid_dim = nbgridcells;
         //
         //const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
         //const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
         const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
         const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
         //
 #pragma omp parallel
 #pragma omp for 
         for (int jj = jstart; jj < nbgridcells; ++jj)
                 for (int ii = istart; ii < nbgridcells; ++ii)
                 {
                         int index = jj*nbgridcells + ii;
                         //
 			point_double image_point;
                         image_point.x = frame->xmin + ii*dx;
                         image_point.y = frame->ymin + jj*dy;
 
                         //Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, 0, Nlens);
 			//printf("%d %d: %d ", ii, jj, index);
                         point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&image_point, lens, 0, Nlens, (ii == 0) && (jj == 0));
 			//if (index == 0) std::cout << "** " << index << " " << ii << " " << jj << " " << std::setprecision(15) << image_point.x << " " << image_point.y << ": " << Grad_DP.x << " " << Grad_DP.y << std::endl;
                         //
                         grid_grad_x[index] = (double) Grad_DP.x;
                         grid_grad_y[index] = (double) Grad_DP.y;
                         //
                 }
 }
 //
 //
 //
 void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart)
 {
         const int grid_dim = nbgridcells;
         //
         //const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1);
         //const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1);
         const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
         const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
         //
 #pragma omp parallel
 #pragma omp for
         for (int jj = jstart; jj < nbgridcells; ++jj)
                 for (int ii = istart; ii < nbgridcells; ++ii)
                 {
                         int index = jj*nbgridcells + ii;
                         //
                         point_double image_point;
                         image_point.x = frame->xmin + ii*dx;
                         image_point.y = frame->ymin + jj*dy;
 
                         //Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, 0, Nlens);
                         //printf("%d %d: %d ", ii, jj, index);
                         point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&image_point, lens, 0, Nlens, 0);
                         //std::cout << "** " << index << " " << ii << " " << jj << " " << std::setprecision(15) << image_point.x << " " << image_point.y<< ": " << Grad_DP.x << " " << Grad_DP.y << std::endl;
                         //
                         grid_grad_x[index] = (double) Grad_DP.x;
                         grid_grad_y[index] = (double) Grad_DP.y;
                         //
                 }
 }
 //
 //
 //
 void
 gradient_grid_euler(double* grid_grad_x, double* grid_grad_y, const double* grid_grad_temp_x, const double* grid_grad_temp_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count)
 //gradient_grid_euler(double* grid_grad_temp_x, double* grid_grad_temp_y, double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count)
 {
 	int loc_count =  0;
         double time   = -mysecond();
 	//
         {
                 const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
                 const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 #pragma omp parallel
 #pragma omp for reduction (+: loc_count) 
 //#pragma omp single 
                 for (int jj = jstart + 1; jj < nbgridcells_y; ++jj)
                 {
                         grid_grad_x[jj*nbgridcells_y] = grid_grad_temp_x[jj*nbgridcells_y];
                         grid_grad_y[jj*nbgridcells_y] = grid_grad_temp_y[jj*nbgridcells_y];
 //#pragma omp task 
                         for (int ii = istart + 1; ii < nbgridcells_x; ++ii)
                         {
                                 {
                                         int index           = (jj - 0)*nbgridcells_x + (ii - 0);
                                         int index_north     = (jj - 1)*nbgridcells_x + (ii - 0);
                                         int index_west      = (jj - 0)*nbgridcells_x + (ii - 1);
                                         //
                                         type_t grad_north_x = grid_grad_temp_x[index] - grid_grad_temp_x[index_north];
                                         type_t grad_north_y = grid_grad_temp_y[index] - grid_grad_temp_y[index_north];
                                         //
                                         type_t grad_west_x  = grid_grad_temp_x[index] - grid_grad_temp_x[index_west];
                                         type_t grad_west_y  = grid_grad_temp_y[index] - grid_grad_temp_y[index_west];
                                         //
                                         int c1 = fabs(dx - grad_north_x) < SIS_THRESHOLD;
                                         int c2 = fabs(dx -  grad_west_x) < SIS_THRESHOLD;
                                         int c3 = fabs(dy - grad_north_y) < SIS_THRESHOLD;
                                         int c4 = fabs(dy -  grad_west_y) < SIS_THRESHOLD;
                                         //
-                                        if (c1 || c2 || c3 || c4)
+                                        //if (c1 || c2 || c3 || c4)
+                                        if (c2 || c3)
                                         {
                                                 struct point_double pImage;
                                                 //
                                                 pImage.x = (double) frame->xmin + ii*dx;
                                                 pImage.y = (double) frame->ymin + jj*dy;
                                                 //
                                                 //point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&pImage, lens, 0, Nlens, ((ii == 1) && (jj == 175)));
                                                 point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&pImage, lens, 0, Nlens, true);
                                                 //
                                                 grid_grad_x[index] = Grad_DP.x;
                                                 grid_grad_y[index] = Grad_DP.y;
                                                 loc_count++;
 						//if (ii == 1) printf("%d %d: %f %f -> %f %f\n", ii, jj, pImage.x, pImage.y, Grad_DP.x, Grad_DP.y);
                                         }
                                         else
                                         {
                                                 grid_grad_x[index] = grid_grad_temp_x[index];
                                                 grid_grad_y[index] = grid_grad_temp_y[index];
                                         }
                                 }
                         }
                 }
         }
 	*count = loc_count;
 //#pragma omp taskwait
 }
 //
 //
 //
-void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count)
+void gradient_grid_SIS_patch(double* __restrict__ grid_grad_sis_x, double* __restrict__ grid_grad_sis_y, double* __restrict__ grid_grad_x, double* __restrict__ grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count)
 //void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count)
 {
         //
         // SIS Potentials in double precision
         //                
 	printf(" ** CPU SIS patch\n"); fflush(stdout);
         *count = 0;
         for (int pot = 0; pot < Nlens; ++pot)
         {
                 int patch_size = 20;
                 if (lens->vdisp[pot] > 900.)
                 {
                         patch_size = 200;
                 }
                 const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
                 const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
                 //
                 const int px = (int) ((lens->position_x[pot] - frame->xmin)/dx);
                 const int py = (int) ((lens->position_y[pot] - frame->ymin)/dy);
                 (*count)++;
                 //
                 int istart = px - patch_size/2;
                 int jstart = py - patch_size/2;
                 {
                         const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
                         const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 #pragma omp parallel
 #pragma omp for collapse(2) 
                 for (int jj = 0; jj < patch_size; ++jj)
                         for (int ii = 0; ii < patch_size; ++ii)
                         {
                                 //
                                 int index = (jj + jstart)*nbgridcells_x + (ii + istart);
                                 //
                                 struct point_double image_point_DP;
                                 image_point_DP.x = (double) frame->xmin + (ii + istart)*dx;
                                 image_point_DP.y = (double) frame->ymin + (jj + jstart)*dy;
                                 //
                                 point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&image_point_DP, lens, 0, Nlens, 0);
                                 //
                                 // needs checking?
                                 //
 				//printf("%.15f %.15f\n", Grad_DP.x, Grad_DP.y);
-                                grid_grad_x[index] = Grad_DP.x;
-                                grid_grad_y[index] = Grad_DP.y;
+                                grid_grad_sis_x[index] = Grad_DP.x;
+                                grid_grad_sis_y[index] = Grad_DP.y;
                         }
                 }
         }
 
 }
+
+void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count)
+{
+		gradient_grid_SIS_patch(grid_grad_x, grid_grad_y, grid_grad_x, grid_grad_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, count);
+}
+
+
 //
 //
 //
 void gradient_grid_general_mixed_CPU(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
 	const int grid_dim = nbgridcells_x;
 	//
         struct Potential_Mixed<double> lens_mixed;
 	alloc(lens_mixed,       Nlens);
 	copy (lens_mixed, lens, Nlens);
 	//
 	//point  image_point;
 	//
 	const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
 	const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 	//
 	double* grid_grad_temp_x = (double*) malloc(nbgridcells_x*nbgridcells_y*sizeof(double));
 	double* grid_grad_temp_y = (double*) malloc(nbgridcells_x*nbgridcells_y*sizeof(double));
 	double time;
 	//
 	//printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d\n", dx, dy, nbgridcells_x, nbgridcells_y);
 	//
 	// Float computation
 	//
 #if 1
 	int count = 0;
 	time = -mysecond();
 #pragma omp parallel
 #pragma omp for 
 	for (int jj = jstart; jj < nbgridcells_y; ++jj) 
 		for (int ii = istart; ii < nbgridcells_x; ++ii)
 		{
 			int index = jj*nbgridcells_x + ii;
 			//
 			struct point image_point;
 			image_point.x = frame->xmin + (ii + istart)*dx;
 			image_point.y = frame->ymin + (jj + jstart)*dy;
 			//
 			point Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens);
 			//
 			grid_grad_temp_x[index] = (double) Grad.x;
 			grid_grad_temp_y[index] = (double) Grad.y;
 			//
 		}
 	memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*sizeof(double));
 	memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_y*sizeof(double));
 #endif
 	time += mysecond();
 	printf("\n** Float computation: %f s.\n", time);
 	//
 	//
 	//
 #ifdef __SIS
 #warning "SIS CPU"
         //
         // SIS Potentials in double precision
         //                
         count = 0;
         time = -mysecond();
 	{
 		gradient_grid_SIS_patch(grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
 	}
         time += mysecond();
         printf("** %d SIS computations: %f s.\n", count, time);
 
 #endif
 	//
 	// Euler approximation
 	//
 //#ifdef __EULER
 #if __EULER
 #warning "EULER CPU"
 	//memset(grid_grad_x, nbgridcells_x*nbgridcells_y*sizeof(double));
 	//memset(grid_grad_y, nbgridcells_x*nbgridcells_y*sizeof(double));
 	count = 0;
 	time = -mysecond();
 	{
 		//gradient_grid_euler(grid_grad_temp_x, grid_grad_temp_y, grid_grad_x, grid_grad_y, frame, &lens_mixed, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
 		gradient_grid_euler(grid_grad_x, grid_grad_y, grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
 	}
 	time += mysecond();
 	printf("** %d Euler computations: %f s.\n", count, time);
 #else
         memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*nbgridcells_y*sizeof(double));
         memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_x*nbgridcells_y*sizeof(double));
 #endif
 }
 
 
 void gradient_grid_mixed_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
 	double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
 	double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 	//
 	gradient_grid_general_mixed_CPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, nbgridcells_x, nbgridcells_y, istart, jstart);
 	//
 }
diff --git a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp
index e9d8450..a1d8ccf 100644
--- a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp
+++ b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp
@@ -1,61 +1,63 @@
 /*
  * grid_gradient_CPU.hpp
  *
  *  Created on: Jan 12, 2017
  *      Author: cerschae
  */
 
 #ifndef GRID_GRADIENT_CPU_HPP_
 #define GRID_GRADIENT_CPU_HPP_
 
 
 #include "structure_hpc.hpp"
 #include "structure_mixed.hpp"
 
 #include <iostream>
 #include <string.h>
 //#include <cuda_runtime.h>
 #include <math.h>
 #include <gradient_avx.hpp>
 #include <sys/time.h>
 #include <fstream>
 //
 #define SIS_THRESHOLD 0.0092
 //
 //
 //#include <immintrin.h>
 //
 //
 struct point_double
 {
         double x;
         double y;
 };
 //
 //
 //
 void gradient_grid_CPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells, int istart = 0, int jstart = 0);
 //
 //
 //
 void gradient_grid_CPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart = 0, int jstart = 0);
 //
 static void gradient_grid_general_CPU_old(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells,  const struct Potential_SOA *lens);
 //
 void gradient_grid_mixed_CPU(double *grid_grad_x, double *grid_grad_y, 
 			     const struct grid_param *frame, 
 			     const struct Potential_SOA *lens, 
 			     int nhalos, 
 			     int nbgridcells_x, int nbgridcells_y, 
 			     int istart, int jstart);
 //
 void gradient_grid_general_mixed_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart);
 //
 void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart);
 //
 void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_Mixed<double> *lens, int istart, int jstart);
 //
-void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count);
+void gradient_grid_SIS_patch(double* __restrict__ grid_grad_x, double* __restrict__grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count);
+//
+void gradient_grid_SIS_patch(double* __restrict__ grid_grad_sis_x, double* __restrict__ grid_grad_sis_y, double* __restrict__ grid_grad_x, double* __restrict__grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count);
 //
 //
 #endif /* GRID_GRADIENT_CPU_HPP_ */
diff --git a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu
index acd63f2..f50bf20 100644
--- a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu
+++ b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu
@@ -1,1393 +1,1417 @@
 #include <stdlib.h>
 #include <iomanip>
 #include "grid_gradient_mixed_CPU.hpp"
 #include "grid_gradient_mixed_GPU.cuh"
 //#include "structure_mixed.hpp"
 #include "gradient_GPU.cuh"
 //#include "cudafunctions.h"
 // must be squared
 #define BLOCK_SIZE_X 16
 #define BLOCK_SIZE_Y 16
 //
 //
 #define CUDA_SAFE_CALL(call) \
 do {                                \
     cudaError_t err = call;          \
     if (cudaSuccess != err) {         \
         fprintf (stderr, "'%s' Cuda error in file '%s' in line %i : %s.", \
                  __FILE__, __LINE__, cudaGetErrorString(err) );      \
         exit(EXIT_FAILURE);                                          \
     }                                                                \
 } while (0)
 //
 //
 //
 __device__ __managed__ unsigned int gpu_count ;
 
 extern "C"{
 double seconds()
 {
 	struct timeval  tp;
 	struct timezone tzp;
 	//
 	int i = gettimeofday(&tp,&tzp);
 	//
 	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
 }
 }
 //
 //
 //
 #define SIS_THRESHOLD 0.0092
 //
 //
 //
 __device__ 
 point 
 module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos);
 
 __device__
 point_double
 module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_Mixed<double> *lens, int shalos, int nhalos, int echo = 0);
 //
 //
 //
 __device__
 point
 module_potentialDerivatives_totalGradient_5_SOA_SP_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos)
 {
     //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
     //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
     //
     struct point grad, result;
     grad.x = 0;
     grad.y = 0;
     for(int i = shalos; i < shalos + nhalos; i++)
     {
             //
                 struct point true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		//
                 float cose = lens->anglecos[i];
                 float sine = lens->anglesin[i];
                 //
                 float x = true_coord.x*cose + true_coord.y*sine;
                 float y = true_coord.y*cose - true_coord.x*sine;
                 //
                 float ell_pot = lens->ellipticity_potential[i];
                 //
                 float b0_inv_R = lens->b0[i]/sqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot)));
                 //
                 result.x = (1.f - ell_pot)*x*b0_inv_R;
                 result.y = (1.f + ell_pot)*y*b0_inv_R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
     }
     return grad;
 }
 
 __device__
 point_double
 module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo)
 {
     //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
     //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
     //
     struct point_double grad, result;
     grad.x = 0;
     grad.y = 0;
     //
     for(int i = shalos; i < shalos + nhalos; i++)
     {
                 struct point_double true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 double cose = lens->anglecos[i];
                 double sine = lens->anglesin[i];
                 //
                 double x = true_coord.x*cose + true_coord.y*sine;
                 double y = true_coord.y*cose - true_coord.x*sine;
                 //
                 double ell_pot = lens->ellipticity_potential[i];
                 //
                 double b0_inv_R;
                 b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot));
                 //
                 result.x = (1. - ell_pot)*x*b0_inv_R;
                 result.y = (1. + ell_pot)*y*b0_inv_R;
                 //if (echo) printf("x, y = %.15f %15f, lens = %.15f %15f, coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", pImage->x, pImage->y, lens->position_x[i], lens->position_y[i], true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y);
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
                 //
     }
     //if (echo) printf("grad = %.15f %.15f\n", grad.x, grad.y);
     return grad;
 }
 
 
 
 //
 //
 // Single Precision
 //
 __device__
 point
 module_potentialDerivatives_totalGradient_5_SOA_SP_GPU(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo = 0)
 {
 	// asm volatile("# module_potentialDerivatives_totalGradient_5_SOA_SP_GPU begins");
 	// printf("# module_potentialDerivatives_totalGradient_5_SOA_SP_GPU begins\n");
 	//
 	struct point grad, result;
 	grad.x = 0;
 	grad.y = 0;
 	//
 	for(int i = shalos; i < shalos + nhalos; i++)
 	{
 		//
 		struct point_double true_coord, true_coord_rotation;
 		//
 		true_coord.x = pImage->x - lens->position_x[i];
 		true_coord.y = pImage->y - lens->position_y[i];
 		//
 		float cose = lens->anglecos[i];
 		float sine = lens->anglesin[i];
 		//
 		float x = true_coord.x*cose + true_coord.y*sine;
 		float y = true_coord.y*cose - true_coord.x*sine;
 		//
 		float ell_pot = lens->ellipticity_potential[i];
 		//
 		float b0_inv_R = lens->b0[i]/sqrtf(x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot));
 		//
 		result.x = (1.f - ell_pot)*x*b0_inv_R;
 		result.y = (1.f + ell_pot)*y*b0_inv_R;
 		//
 		grad.x += result.x*cose - result.y*sine;
 		grad.y += result.y*cose + result.x*sine;
 	}
 	return grad;
 }
 //
 __global__
 void
 gradient_grid_general_SP_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
 	//
 	struct point grad;
 	struct point pImage;
 	//
 	int col = blockIdx.x*blockDim.x + threadIdx.x;
         if (col > nbgridcells_x) return;
 	int row = blockIdx.y*blockDim.y + threadIdx.y;
         if (row > nbgridcells_y) return;
 	//
 	int index = row*nbgridcells_x + col;
 	//
 	const float dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
 	const float dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 	//
 	pImage.x = frame->xmin + (col + istart)*dx;
 	pImage.y = frame->ymin + (row + jstart)*dy;
 	//
 #if 0
 	grad = module_potentialDerivatives_totalGradient_5_SOA_SP_GPU(&pImage, lens, 0, nhalos);
 #else
 	struct point result;
         grad.x = 0;
         grad.y = 0;
         //
         for(int i = 0; i < nhalos; i++)
         {
                 //
                 struct point true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage.x - lens->position_x[i];
                 true_coord.y = pImage.y - lens->position_y[i];
                 //
                 float cose = lens->anglecos[i];
                 float sine = lens->anglesin[i];
                 //
                 float x = true_coord.x*cose + true_coord.y*sine;
                 float y = true_coord.y*cose - true_coord.x*sine;
                 //
                 float ell_pot = lens->ellipticity_potential[i];
                 //
                 //float b0_inv_R = lens->b0[i]/sqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot)));
                 //float b0_inv_R = lens->b0[i]/__fsqrt_rn((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot)));
                 float b0_inv_R = lens->b0[i]*rsqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot)));
                 //
                 result.x = (1.f - ell_pot)*x*b0_inv_R;
                 result.y = (1.f + ell_pot)*y*b0_inv_R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
         }
 #endif
 	//
 	grid_grad_x[index] = (double) grad.x;
 	grid_grad_y[index] = (double) grad.y;
 	//
 }
 //
 //
 //
 __global__
 void
 gradient_grid_euler_GPU(double* grid_grad_euler_x, double* grid_grad_euler_y,
 		const double* grid_grad_temp_x, const double* grid_grad_temp_y, 
 		const struct grid_param *frame, 
 		const struct Potential_SOA *lens, 
 		int Nlens, 
 		int nbgridcells_x, int nbgridcells_y, 
 		int istart, int jstart)
 {
 	int loc_count = 0;
 	//*count = 100;
 	//return;
 	//double time  = -second();
 	//
 #if 1
 	int ii = blockIdx.x*blockDim.x + threadIdx.x;
 	int jj = blockIdx.y*blockDim.y + threadIdx.y;
+	if ((ii == istart) || (jj == jstart))
+	{
+		int index       = (jj - 0)*nbgridcells_x + (ii - 0);
+		grid_grad_euler_x[index] = grid_grad_temp_x[index];
+		grid_grad_euler_y[index] = grid_grad_temp_y[index];
+		return;
+	}
 	//
 	if (((ii < nbgridcells_x) && (jj < nbgridcells_y) && (ii > istart) && (jj > jstart)))
 	{
 		const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
 		const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 		{
 			{
 				{
 					int index       = (jj - 0)*nbgridcells_x + (ii - 0);
 					int index_north = (jj - 1)*nbgridcells_x + (ii - 0);
 					int index_west  = (jj - 0)*nbgridcells_x + (ii - 1);
 					//
 					type_t grid_x = grid_grad_temp_x[index];
 					type_t grid_y = grid_grad_temp_y[index];
 					//
 					type_t grad_north_x = grid_x - grid_grad_temp_x[index_north];
 					type_t grad_north_y = grid_y - grid_grad_temp_y[index_north];
 					//
 					type_t grad_west_x  = grid_x - grid_grad_temp_x[index_west];
 					type_t grad_west_y  = grid_y - grid_grad_temp_y[index_west];
 					//
 					int c1 = fabs(dx - grad_north_x) < SIS_THRESHOLD;
 					int c2 = fabs(dx -  grad_west_x) < SIS_THRESHOLD;
 					int c3 = fabs(dy - grad_north_y) < SIS_THRESHOLD;
 					int c4 = fabs(dy -  grad_west_y) < SIS_THRESHOLD;
 					//
-					if (c1 || c2 || c3 || c4)
+					//if (c1 || c2 || c3 || c4)
+					if (c2 || c3)
 					{
-						//atomicAdd(&gpu_count, 1);
+						atomicAdd(&gpu_count, 1);
 						//printf("count = \n", count);
 						struct point_double pImage;
 						//
 						pImage.x = (double) frame->xmin + ii*dx;
 						pImage.y = (double) frame->ymin + jj*dy;
 						struct point_double grad, result;
 						//
 						point_double Grad_DP;
 						Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(&pImage, lens, 0, Nlens, ((ii == 1) && (jj == 175)));
 						grid_grad_euler_x[index] = Grad_DP.x;
 						grid_grad_euler_y[index] = Grad_DP.y;
 						//
 					}
 					else
 					{
-						//grid_grad_euler_x[index] = grid_grad_temp_x[index];
-						//grid_grad_euler_y[index] = grid_grad_temp_y[index];
+						grid_grad_euler_x[index] = grid_grad_temp_x[index];
+						grid_grad_euler_y[index] = grid_grad_temp_y[index];
 					}
 				}
 			}
 		}
 	}
 	//#pragma omp taskwait
 #endif
 }
 //
 //
 //
 void gradient_grid_general_mixed_GPU(double* __restrict__ grid_grad_x, double* __restrict__ grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
 	grid_param *frame_gpu;
 	Potential_SOA *lens_gpu,*lens_kernel;
 	//double *grid_grad_x_gpu, *grid_grad_y_gpu ;
 	lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
 	//lens_gpu->type = (int *) malloc(sizeof(int));
 	//
 	int nhalos = Nlens;
 	//
 	// Allocate variables on the GPU
 	//
 	double atime = -seconds();
 	//
 	double* __restrict__ grid_grad_sis_x;  cudaMallocHost((void**) &grid_grad_sis_x, nbgridcells_x*nbgridcells_y*sizeof(double));
 	double* __restrict__ grid_grad_sis_y;  cudaMallocHost((void**) &grid_grad_sis_y, nbgridcells_x*nbgridcells_y*sizeof(double));
 	//
+	double* __restrict__ grid_grad_euler_cpu_x;  cudaMallocHost((void**) &grid_grad_euler_cpu_x, nbgridcells_x*nbgridcells_y*sizeof(double));
+	double* __restrict__ grid_grad_euler_cpu_y;  cudaMallocHost((void**) &grid_grad_euler_cpu_y, nbgridcells_x*nbgridcells_y*sizeof(double));
+	//
 	memset(grid_grad_sis_x, 0., nbgridcells_x*nbgridcells_y*sizeof(double));
 	memset(grid_grad_sis_y, 0., nbgridcells_x*nbgridcells_y*sizeof(double));
 	//
         double* grid_grad_gpu_x; cudasafer(cudaMalloc((void**) &grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_x");
         double* grid_grad_gpu_y; cudasafer(cudaMalloc((void**) &grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_y");
 	//
         double* grid_grad_euler_x; cudasafer(cudaMalloc((void**) &grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_x");
         double* grid_grad_euler_y; cudasafer(cudaMalloc((void**) &grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_y");
         //
 #if 1
 	int *type_gpu;
 	type_t *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
 	//
 	cudasafer(cudaMalloc( (void**)&(lens_kernel) , sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
 	cudasafer(cudaMalloc( (void**)&(type_gpu)    , nhalos*sizeof(int))   ,"Gradientgpu.cu : Alloc type_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(lens_x_gpu)  , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc x_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(lens_y_gpu)  , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc y_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(b0_gpu)      , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc b0_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(angle_gpu)   , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc angle_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(epot_gpu)    , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc epot_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(rcore_gpu)   , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcore_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(rcut_gpu)    , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcut_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(frame_gpu)   , sizeof(grid_param))   ,"Gradientgpu.cu : Alloc frame_gpu: " );
 	//
 	// Copy values to the GPU
 	//
         double ttime = -seconds();
         cudasafer(cudaMemcpy(type_gpu       , lens->type                 , nhalos*sizeof(int)   , cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy type_gpu: " );
         cudasafer(cudaMemcpy(lens_x_gpu     , lens->position_x           , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy x_gpu: " );
         cudasafer(cudaMemcpy(lens_y_gpu     , lens->position_y           , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy y_gpu: " );
         cudasafer(cudaMemcpy(b0_gpu         , lens->b0                   , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy b0_gpu: " );
         cudasafer(cudaMemcpy(angle_gpu      , lens->ellipticity_angle    , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy angle_gpu: " );
         cudasafer(cudaMemcpy(epot_gpu       , lens->ellipticity_potential, nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy epot_gpu: " );
         cudasafer(cudaMemcpy(rcore_gpu      , lens->rcore                , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcore_gpu: " );
         cudasafer(cudaMemcpy(rcut_gpu       , lens->rcut                 , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcut_gpu: " );
         cudasafer(cudaMemcpy(anglecos_gpu   , lens->anglecos             , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglecos: " );
         cudasafer(cudaMemcpy(anglesin_gpu   , lens->anglesin             , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglesin: " );
         cudasafer(cudaMemcpy(frame_gpu      , frame                      ,    sizeof(grid_param), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy fame_gpu: " );
         //
         lens_gpu->type                  = type_gpu;
         lens_gpu->position_x            = lens_x_gpu;
         lens_gpu->position_y            = lens_y_gpu;
         lens_gpu->b0                    = b0_gpu;
         lens_gpu->ellipticity_angle     = angle_gpu;
         lens_gpu->ellipticity_potential = epot_gpu;
         lens_gpu->rcore                 = rcore_gpu;
         lens_gpu->rcut                  = rcut_gpu;
         lens_gpu->anglecos              = anglecos_gpu;
         lens_gpu->anglesin              = anglesin_gpu;
 	//
 	cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
 	//cudaDeviceSynchronize();
 #else
 	gpu_allocate_SOA<type_t>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y);
 #endif
 	//
         atime += seconds();
         long int mem_len = 10*nhalos*sizeof(type_t) + sizeof(grid_param) + sizeof(Potential_SOA);
 	//
         printf("\n  ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, atime);
 	//
         const int grid_dim = nbgridcells_x;
         //
         //point  image_point;
         //
         const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
 	//
 	//
         double time, euler_time, sis_time;
         //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y);
 	//
         int count = 0;
 	cudaStream_t stream1;
 	cudaStreamCreate(&stream1);
 	//
 	cudaStream_t stream2;
 	cudaStreamCreate(&stream2);
         //
         // Float computation
         //
 #if 0
 #pragma omp parallel
 #pragma omp for
         for (int jj = jstart; jj < nbgridcells_y; ++jj)
                 for (int ii = istart; ii < nbgridcells_x; ++ii)
                 {
 			//
                         int index = jj*nbgridcells_x + ii;
                         //
                         struct point image_point;
                         image_point.x = frame->xmin + (ii + istart)*dx;
                         image_point.y = frame->ymin + (jj + jstart)*dy;
                         //
                         point Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens);
                         //
                         grid_grad_sis_x[index] = (double) Grad.x;
                         grid_grad_sis_y[index] = (double) Grad.y;
                         //
                 }
         //memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*sizeof(double));
         //memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_y*sizeof(double));
 	//
 #endif
 	PUSH_RANGE("Float GPU", 1);
         time = -seconds();
 #if 1
         {
                 int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
                 int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
                 //
                 dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
                 dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
                 //
                 //gradient_grid_general_SP_GPU<<<grid, threads>>>(grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart);
                 gradient_grid_general_SP_GPU<<<grid, threads, 0, stream1>>>(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
 		//POP_RANGE;
 		//
 		//
                 CudaCheckError();
 
 		//PUSH_RANGE("Float GPU", 2);
 		//@@cudaMemcpy(grid_grad_sis_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
 		//@@cudaMemcpy(grid_grad_sis_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
 		cudaDeviceSynchronize();
 		time += seconds();
-		cudaMemcpyAsync(grid_grad_sis_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1);
-		cudaMemcpyAsync(grid_grad_sis_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1);
+		cudaMemcpyAsync(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1);
+		cudaMemcpyAsync(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1);
         }
 #endif
 #if !defined(__EULER) && !defined(__SIS)
 	cudaDeviceSynchronize();
 #endif
         //time += seconds();
 	POP_RANGE;
         //
         //
         //
         double setime = time - seconds();
 #ifdef __EULER
 #warning "EULER defined"
         //memset(grid_grad_x, nbgridcells_x*nbgridcells_y*sizeof(double));
         //memset(grid_grad_y, nbgridcells_x*nbgridcells_y*sizeof(double));
         //
 	//
         count = 0;
         //int* euler_count;  CudaSafeCall(cudaMallocManaged((void**) &euler_count, sizeof(int)));
         euler_time = -seconds();
         {
                 int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
                 int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
                 //
                 dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y);
                 dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
 		//
 		//cudaMemcpy(grid_grad_euler_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyHostToHost);
 		//cudaMemcpy(grid_grad_euler_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyHostToHost);
                 //
 		PUSH_RANGE("Euler GPU", 3);
                 //@@gradient_grid_euler_GPU<<<grid, threads, 0, stream1>>>(grid_grad_euler_x, grid_grad_euler_y, grid_grad_sis_x, grid_grad_sis_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
                 gradient_grid_euler_GPU<<<grid, threads, 0, stream2>>>(grid_grad_euler_x, grid_grad_euler_y, grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
                 //gradient_grid_euler_GPU<<<grid, threads>>>(grid_grad_x, grid_grad_y, grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
                 //daxpy<<<grid, threads>>>(nbgridcells_x*nbgridcells_y, 1., grid_grad_temp_x, grid_grad_temp_y);
 		//cudaDeviceSynchronize();
                 CudaCheckError();
 		//POP_RANGE;
 		//
 		//PUSH_RANGE("Euler COPY", 4);
 		//cudaMemcpyAsync(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2);
 		//cudaMemcpyAsync(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2);
 		//cudaMemcpyAsync(&a[offset], &d_a[offset], 
 		//		streamBytes, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToHost, stream[i]);
 		POP_RANGE;
         }
         euler_time += seconds();
         //
 #else
-	cudaMemcpy(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
-        cudaMemcpy(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
+	//cudaMemcpy(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
+        //cudaMemcpy(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaMemcpy(grid_grad_euler_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToDevice);
+        cudaMemcpy(grid_grad_euler_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToDevice);
 #endif
 	//
 #ifdef __SIS
         //
         // SIS Potentials in double precision
         //
 	cudaStreamSynchronize(stream1);
 	PUSH_RANGE("SIS CPU", 5);
         count = 0;
         sis_time = -seconds();
         {
                 //gradient_grid_SIS_patch(grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
-                gradient_grid_SIS_patch(grid_grad_sis_x, grid_grad_sis_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
+                gradient_grid_SIS_patch(grid_grad_sis_x, grid_grad_sis_y, grid_grad_x, grid_grad_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
         }
 	POP_RANGE;
 	cudaDeviceSynchronize();
 	sis_time += seconds();
 #endif
 //
 //
 /*
 #ifdef __EULER
 	cudaMemcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
         cudaMemcpy(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
 #endif
 */
 //
         double rtime = -seconds();
 #ifdef __SIS
 	//cudaMemcpyAsync(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2);
 	cudaStreamSynchronize(stream2);	
-	cudaMemcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
+	cudaMemcpy(grid_grad_euler_cpu_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
 	PUSH_RANGE("Reduction CPU x", 7);
-	cudaMemcpyAsync(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2);
+	cudaMemcpyAsync(grid_grad_euler_cpu_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2);
 	//
 #pragma omp parallel for
 	for (int ii = 0; ii < nbgridcells_x*nbgridcells_y; ++ii)
 		//if ((grid_grad_sis_x[ii] != 0.) || (grid_grad_sis_y[ii] != 0.))
 		if (grid_grad_sis_x[ii] != 0.)
 		{
 			grid_grad_x[ii] = grid_grad_sis_x[ii];
 		}
+		else
+		{
+			grid_grad_x[ii] = grid_grad_euler_cpu_x[ii];	
+		}
 	POP_RANGE;	
 	PUSH_RANGE("Reduction CPU y", 8);
 	cudaStreamSynchronize(stream2);	
 #pragma omp parallel for
         for (int ii = 0; ii < nbgridcells_x*nbgridcells_y; ++ii)
                 if (grid_grad_sis_y[ii] != 0.)
                 {
                         grid_grad_y[ii] = grid_grad_sis_y[ii];
                 }
+		else
+		{
+			grid_grad_y[ii] = grid_grad_euler_cpu_y[ii];
+		}
 
 
 
 	POP_RANGE;
 	rtime += seconds();
+#else
+	cudaMemcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
+        cudaMemcpy(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost);
 #endif
 	setime += seconds();
 	//time += rtime;
         printf("  ** GPU Float computation: %f s.\n", time);
 	printf("  ** %d Euler computations: %f s.\n", gpu_count, euler_time);
 	printf("  ** %d SIS computations  : %f s.\n", count, sis_time);
 	printf("  ** reduction time       : %f s.\n", count, rtime);
         printf("  ** SP + Euler + SIS time: %f s.\n", time + sis_time + euler_time + rtime);
 	//POP_RANGE;
 	//
 	//
 	//
 	cudaStreamDestroy(stream1);
 	cudaStreamDestroy(stream2);
         //
         // Euler approximation
         //
 	//cudaDeviceSynchronize();
         //printf("** %d Euler computations: %f s.\n", gpu_count, time);
 	//cudasafer(cudaFree(grid_grad_temp_x ), "free grid_grad_temp_x");	
 	//cudasafer(cudaFree(grid_grad_temp_y ), "free grid_grad_temp_y");	
 	cudasafer(cudaFree(grid_grad_euler_x), "free grid_grad_euler_x");	
 	cudasafer(cudaFree(grid_grad_euler_y), "free grid_grad_euler_y");	
 	cudasafer(cudaFree(grid_grad_gpu_x)  , "free grid_grad_gpu_x");	
 	cudasafer(cudaFree(grid_grad_gpu_y)  , "free grid_grad_gpu_y");	
 	//
 	// lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
         // lens_gpu->type = (int *) malloc(sizeof(int));
 	//
         // Allocate variables on the GPU
         // cudasafer(cudaFree(lens_gpu), "free lens_kernel");
 	//
 	free(lens_gpu);
 	//
         cudasafer(cudaFree(lens_kernel) , "free kernel_gpu");
         cudasafer(cudaFree(type_gpu)    , "free type_gpu");  
         cudasafer(cudaFree(lens_x_gpu)  , "free lens_x_gpu");
         cudasafer(cudaFree(lens_y_gpu)  , "free lens_y_gpu");
         cudasafer(cudaFree(b0_gpu)      , "free b0_gpu");
         cudasafer(cudaFree(angle_gpu)   , "free angle_gpu");
         cudasafer(cudaFree(epot_gpu)    , "free epot_gpu");
         cudasafer(cudaFree(rcore_gpu)   , "free rcore_gpu");
         cudasafer(cudaFree(rcut_gpu)    , "free rcut_gpu");
         cudasafer(cudaFree(anglecos_gpu), "free anglecos_gpu");
         cudasafer(cudaFree(anglesin_gpu), "free anglesin_gpu");
         cudasafer(cudaFree(frame_gpu)   , "free frame_gpu");
 }
 //
 //
 //
 void gradient_grid_mixed_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
 	std::cout << "Calling general mixed GPU" << std::endl;
         gradient_grid_general_mixed_GPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, nbgridcells_x, nbgridcells_y, istart, jstart);
         //
 }
 //
 // templated double precision kernels
 /*
 __device__
 point_double
 module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_Mixed<double> *lens, int shalos,
 int nhalos, int echo)
 {
     //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
     //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
     //
     struct point_double grad, result;
     grad.x = 0;
     grad.y = 0;
     //
     for(int i = shalos; i < shalos + nhalos; i++)
     {
                 struct point_double true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 double cose = lens->anglecos[i];
                 double sine = lens->anglesin[i];
                 //
                 double x = true_coord.x*cose + true_coord.y*sine;
                 double y = true_coord.y*cose - true_coord.x*sine;
                 //
                 double ell_pot = lens->ellipticity_potential[i];
                 //
                 double b0_inv_R;
                 b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot));
                 //
                 result.x = (1. - ell_pot)*x*b0_inv_R;
                 result.y = (1. + ell_pot)*y*b0_inv_R;
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
                 //
     }
     //if (echo) printf("grad = %.15f %.15f\n", grad.x, grad.y);
     return grad;
 }
 //
 __global__
 void
 gradient_grid_general_DP_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         //
         //
         //
         int col = blockIdx.x*blockDim.x + threadIdx.x;
         if (col > nbgridcells_x) return;
         int row = blockIdx.y*blockDim.y + threadIdx.y;
         if (row > nbgridcells_y) return;
         //
         int index = row*nbgridcells_x + col;
         //
         const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
         struct point_double grad;
         struct point_double pImage;
         //
         pImage.x = (double) frame->xmin + (col + istart)*dx;
         pImage.y = (double) frame->ymin + (row + jstart)*dy;
         //if (index == 0) printf("x, y = %.15f %15f\n", pImage.x, pImage.y);
         double grad_x, grad_y;
         //
         grad = module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(&pImage, lens, 0, nhalos, index == 0);
         grid_grad_x[index] = (double) grad.x;
         grid_grad_y[index] = (double) grad.y;
 }
 */
 //
 //
 __device__
 point_double
 module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_Mixed<double> *lens, int shalos, int nhalos, int echo)
 {
     //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins");
     //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n");
     //
     struct point_double grad, result;
     grad.x = 0;
     grad.y = 0;
     //
     for(int i = shalos; i < shalos + nhalos; i++)
     {
                 struct point_double true_coord, true_coord_rotation;
                 //
                 true_coord.x = pImage->x - lens->position_x[i];
                 true_coord.y = pImage->y - lens->position_y[i];
                 //
                 //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
                 double cose = lens->anglecos[i];
                 double sine = lens->anglesin[i];
                 //
                 double x = true_coord.x*cose + true_coord.y*sine;
                 double y = true_coord.y*cose - true_coord.x*sine;
                 //
                 double ell_pot = lens->ellipticity_potential[i];
                 //
                 double b0_inv_R;
                 b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot));
                 //
                 result.x = (1. - ell_pot)*x*b0_inv_R;
                 result.y = (1. + ell_pot)*y*b0_inv_R;
                 //if (echo) printf("x, y = %.15f %15f, lens = %.15f %15f, coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", pImage->x, pImage->y, lens->position_x[i], lens->position_y[i], true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y); 
                 //
                 grad.x += result.x*cose - result.y*sine;
                 grad.y += result.y*cose + result.x*sine;
 		//
     }
     //if (echo) printf("grad = %.15f %.15f\n", grad.x, grad.y);
     return grad;
 }
 //
 //
 //
 __global__
 void
 gradient_grid_general_DP_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         //
         //
 	//
         int col = blockIdx.x*blockDim.x + threadIdx.x;
 	if (col > nbgridcells_x) return; 
         int row = blockIdx.y*blockDim.y + threadIdx.y;
 	if (row > nbgridcells_y) return; 
 	//
         int index = row*nbgridcells_x + col;
 	//
         const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
         struct point_double grad;
         struct point_double pImage;
 	//
         pImage.x = (double) frame->xmin + (col + istart)*dx;
         pImage.y = (double) frame->ymin + (row + jstart)*dy;
 	//if (index == 0) printf("x, y = %.15f %15f\n", pImage.x, pImage.y);
         //
 #if 0
         grad = module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(&pImage, lens, 0, nhalos, index == 0);
 	grid_grad_x[index] = (double) grad.x;
 	grid_grad_y[index] = (double) grad.y;
 #else
 	struct point_double result;
 	double grad_x = 0., grad_y = 0.;
 	for(int i = 0; i < nhalos; i++)
 	{
 		struct point_double true_coord, true_coord_rotation;
 		//
 		true_coord.x = pImage.x - lens->position_x[i];
 		true_coord.y = pImage.y - lens->position_y[i];
 		//
 		//true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]);
 		double cose = lens->anglecos[i];
 		double sine = lens->anglesin[i];
 		//
 		double x = true_coord.x*cose + true_coord.y*sine;
 		double y = true_coord.y*cose - true_coord.x*sine;
 		//
 		double ell_pot = lens->ellipticity_potential[i];
 		//
 		double b0_inv_R;
 		b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot));
 		//
 		result.x = (1. - ell_pot)*x*b0_inv_R;
 		result.y = (1. + ell_pot)*y*b0_inv_R;
 		//
 		grad_x += result.x*cose - result.y*sine;
 		grad_y += result.y*cose + result.x*sine;
 		//
 	}
 	grid_grad_x[index] = (double) grad_x;
         grid_grad_y[index] = (double) grad_y;
 
 #endif
 	//
 }
 //
 //
 //
 __global__
 void
 gradient_grid_SP_GPU(float *grid_grad_x, float *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, float dx, float dy, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         //
         struct point grad;
         struct point pImage;
         //
         int col = blockIdx.x*blockDim.x + threadIdx.x;
         if (col > nbgridcells_x) return;
         int row = blockIdx.y*blockDim.y + threadIdx.y;
         if (row > nbgridcells_y) return;
         //
         int index = row*nbgridcells_x + col;
         //
         //const float dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         //const float dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
         pImage.x = frame->xmin + (col + istart)*dx;
         pImage.y = frame->ymin + (row + jstart)*dy;
         //
         struct point result;
         float grad_x = 0., grad_y = 0.;
 	//
         for(int i = 0; i < nhalos; i++)
         {
 		//
                 struct point true_coord;
                 //
                 true_coord.x = pImage.x - lens->position_x[i];
                 true_coord.y = pImage.y - lens->position_y[i];
                 //
                 float cose = lens->anglecos[i];
                 float sine = lens->anglesin[i];
                 //
                 float x = true_coord.x*cose + true_coord.y*sine;
                 float y = true_coord.y*cose - true_coord.x*sine;
                 //
                 float ell_pot = lens->ellipticity_potential[i];
                 //
                 float b0_inv_R;
                 b0_inv_R = lens->b0[i]*rsqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot)));
                 //
                 result.x = (1.f - ell_pot)*x*b0_inv_R;
                 result.y = (1.f + ell_pot)*y*b0_inv_R;
                 //
                 grad_x += result.x*cose - result.y*sine;
                 grad_y += result.y*cose + result.x*sine;
 		//if (index == 0) printf("%d: %f %f %f %f %f %f\n", i, cose, sine, ell_pot, b0_inv_R, grad_x, grad_y);
                 //
         }
 	//
         grid_grad_x[index] = grad_x;
         grid_grad_y[index] = grad_y;
         //
 }
 //
 //
 //
 void 
 gradient_grid_general_SP_GPU(float* __restrict__ grid_grad_x, float* __restrict__ grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         grid_param *frame_gpu;
         //
         int nhalos = Nlens;
         //
         // Allocate variables on the GPU
         //
         printf("\n");
         //printf("  size of gridparam = %d %d %d\n", sizeof(grid_param), sizeof(frame->xmin), sizeof(type_t));
         double atime = -seconds();
         //
         Potential_SOA *lens_gpu, *lens_kernel;// = (Potential_Mixed<double> *) malloc(sizeof(Potential_Mixed<double>));
         lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
 	int   *type_gpu;
 	float *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
 #if 1
         float* grid_grad_gpu_x;  cudasafer(cudaMalloc((void**) &grid_grad_gpu_x , nbgridcells_x*nbgridcells_y*sizeof(float)), "grid_grad_temp_x");
         float* grid_grad_gpu_y;  cudasafer(cudaMalloc((void**) &grid_grad_gpu_y , nbgridcells_x*nbgridcells_y*sizeof(float)), "grid_grad_temp_y");
 	//
 	lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
         //lens_gpu->type = (int *) malloc(sizeof(int));
         // Allocate variables on the GPU
         cudasafer(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
         cudasafer(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " );
         cudasafer(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc x_gpu: " );
         cudasafer(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc y_gpu: " );
         cudasafer(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc b0_gpu: " );
         cudasafer(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc angle_gpu: " );
         cudasafer(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc epot_gpu: " );
         cudasafer(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc rcore_gpu: " );
         cudasafer(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc rcut_gpu: " );
         cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
         cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
         cudasafer(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
         cudasafer(cudaMalloc( (void**)&(grid_grad_gpu_x), (nbgridcells_x) * (nbgridcells_y) *sizeof(type_t)),"Gradientgpu.cu : Alloc source_x_gpu: " );
         cudasafer(cudaMalloc( (void**)&(grid_grad_gpu_y), (nbgridcells_x) * (nbgridcells_y) *sizeof(type_t)),"Gradientgpu.cu : Alloc source_y_gpu: " );
         atime += seconds();
         std::cout << "\n  ** allocation time = " << atime << " s." << std::endl;
         //
         // Copy values to the GPU
         //
         double ttime = -seconds();
         cudasafer(cudaMemcpy(type_gpu, lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " );
         cudasafer(cudaMemcpy(lens_x_gpu, lens->position_x , nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
         cudasafer(cudaMemcpy(lens_y_gpu, lens->position_y , nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
         cudasafer(cudaMemcpy(b0_gpu, lens->b0 , nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
         cudasafer(cudaMemcpy(angle_gpu, lens->ellipticity_angle , nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
         cudasafer(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
         cudasafer(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
         cudasafer(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
         cudasafer(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " );
         cudasafer(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " );
         cudasafer(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
         //printf(" transfer bandwidth = %f, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime);
         //
         lens_gpu->type                  = type_gpu;
         lens_gpu->position_x            = lens_x_gpu;
         lens_gpu->position_y            = lens_y_gpu;
         lens_gpu->b0                    = b0_gpu;
         lens_gpu->ellipticity_angle     = angle_gpu;
         lens_gpu->ellipticity_potential = epot_gpu;
         lens_gpu->rcore                 = rcore_gpu;
         lens_gpu->rcut                  = rcut_gpu;
         lens_gpu->anglecos              = anglecos_gpu;
         lens_gpu->anglesin              = anglesin_gpu;
         //
         cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
         //cudaDeviceSynchronyze();
         ttime += seconds();
 	
 #else
         gpu_allocate_SOA<type_t>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y);
 #endif
         //
         ttime += seconds();
         long int mem_len = 10*nhalos*sizeof(float) + sizeof(grid_param) + sizeof(Potential_SOA);
         //
         printf("\n  ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime);
         //
         const int grid_dim = nbgridcells_x;
         //
         //point  image_point;
         //
         const float dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         const float dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
         //
         //
         double time; 
         //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y);
         //
         int count = 0;
         cudaStream_t stream1;
         cudaStreamCreate(&stream1);
         //
         // Float computation
         //
         PUSH_RANGE("Float GPU", 1);
         time = -seconds();
 #if 1
         {
                 int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
                 int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
                 //
                 dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
                 dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
                 //
                 gradient_grid_SP_GPU<<<grid, threads>>>(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart);
 		//gradient_grid_general_SP_GPU<<<grid, threads>>>(grid_grad_gpu_dp_x, grid_grad_gpu_dp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
 		cudaDeviceSynchronize();
 		cudasafer(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_SOA_GPU");
                 CudaCheckError();
 	}
 #else
 	module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, nhalos, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart);
 #endif
 	time += seconds();
 	printf("  ** GPU SP computation: %f s.\n", time);
 	ttime = -seconds();	
 	{
                 cudaMemcpyAsync(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(float), cudaMemcpyDeviceToHost, stream1);
                 cudaMemcpyAsync(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(float), cudaMemcpyDeviceToHost, stream1);
         }
         cudaDeviceSynchronize();
 	//for (int ii = 0; ii < nbgridcells_x*nbgridcells_y; ++ii)
 	//	printf("%d: %f\n", ii, grid_grad_x[ii]); 
         ttime += seconds();
 	mem_len = 2*nbgridcells_x*nbgridcells_y*sizeof(float);
         printf("  ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime);
         POP_RANGE;
 }
 //
 //
 //
 void gradient_grid_general_double_GPU(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
 	grid_param *frame_gpu;
 	//
 	int nhalos = Nlens;
 	//
 	// Allocate variables on the GPU
 	//
 	printf("\n");
 	printf("  ** allocating memory on the GPU...\n");fflush(stdout);
 	//printf("  size of gridparam = %d %d %d\n", sizeof(grid_param), sizeof(frame->xmin), sizeof(type_t));
 	double atime = -seconds();
 	//
 	Potential_Mixed<double> *lens_gpu, *lens_kernel;// = (Potential_Mixed<double> *) malloc(sizeof(Potential_Mixed<double>));
 	lens_gpu = (Potential_Mixed<double> *) malloc(sizeof(Potential_Mixed<double>));
 #if 1
 	//double* grid_grad_gpu_x;  cudasafer(cudaMallocManaged((void**) &grid_grad_gpu_x , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_x");
 	//double* grid_grad_gpu_y;  cudasafer(cudaMallocManaged((void**) &grid_grad_gpu_y , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_y");
 	double* grid_grad_gpu_x;  cudasafer(cudaMallocHost((void**) &grid_grad_gpu_x , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_x");
 	double* grid_grad_gpu_y;  cudasafer(cudaMallocHost((void**) &grid_grad_gpu_y , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_y");
 	//
 	int    *type_gpu;
 	double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
 	//
 	cudasafer(cudaMalloc( (void**)&(lens_kernel) , sizeof(Potential_Mixed<double>)),"Gradientgpu.cu : Alloc Potential_SOA: " );
 	cudasafer(cudaMalloc( (void**)&(type_gpu)    , nhalos*sizeof(int))  ,"Gradientgpu.cu : Alloc type_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(lens_x_gpu)  , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(lens_y_gpu)  , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(b0_gpu)      , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(angle_gpu)   , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(epot_gpu)    , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(rcore_gpu)   , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(rcut_gpu)    , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
 	cudasafer(cudaMalloc( (void**)&(frame_gpu)   , sizeof(grid_param))   ,"Gradientgpu.cu : Alloc frame_gpu: " );
 	//
 	// Copy values to the GPU
 	//
 	double ttime = -seconds();
 	cudasafer(cudaMemcpy(type_gpu       , lens->type                 , nhalos*sizeof(int)   , cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy type_gpu: " );
 	cudasafer(cudaMemcpy(lens_x_gpu     , lens->position_x           , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy x_gpu: " );
 	cudasafer(cudaMemcpy(lens_y_gpu     , lens->position_y           , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy y_gpu: " );
 	cudasafer(cudaMemcpy(b0_gpu         , lens->b0                   , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy b0_gpu: " );
 	cudasafer(cudaMemcpy(angle_gpu      , lens->ellipticity_angle    , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy angle_gpu: " );
 	cudasafer(cudaMemcpy(epot_gpu       , lens->ellipticity_potential, nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy epot_gpu: " );
 	cudasafer(cudaMemcpy(rcore_gpu      , lens->rcore                , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcore_gpu: " );
 	cudasafer(cudaMemcpy(rcut_gpu       , lens->rcut                 , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcut_gpu: " );
 	cudasafer(cudaMemcpy(anglecos_gpu   , lens->anglecos             , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglecos: " );
 	cudasafer(cudaMemcpy(anglesin_gpu   , lens->anglesin             , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglesin: " );
 	cudasafer(cudaMemcpy(frame_gpu      , frame                      ,    sizeof(grid_param), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy fame_gpu: " );
 	//
 	lens_gpu->type                  = type_gpu;
 	lens_gpu->position_x            = lens_x_gpu;
 	lens_gpu->position_y            = lens_y_gpu;
 	lens_gpu->b0                    = b0_gpu;
 	lens_gpu->ellipticity_angle     = angle_gpu;
 	lens_gpu->ellipticity_potential = epot_gpu;
 	lens_gpu->rcore                 = rcore_gpu;
 	lens_gpu->rcut                  = rcut_gpu;
 	lens_gpu->anglecos              = anglecos_gpu;
 	lens_gpu->anglesin              = anglesin_gpu;
 	//
 	cudaMemcpy((void*) lens_kernel, (void*) lens_gpu, sizeof(Potential_Mixed<double>), cudaMemcpyHostToDevice);
 	//
 #else
 	gpu_allocate_SOA<double>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y);
 #endif
 	//
 	cudaDeviceSynchronize();
 	//
 	ttime += seconds();
 	long int mem_len = 10*nhalos*sizeof(double) + sizeof(grid_param) + sizeof(Potential_Mixed<double>);
 	//
 	//printf("  ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime);
 	//
 	const int grid_dim = nbgridcells_x;
 	//
 	//point  image_point;
 	//
 	const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
 	const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
 	//
 	//
 	//
 	double time;
 	//printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y);
 	int count = 0;
 	//
 	// Double computation
 	//
 	int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
 	int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
 	//
 	dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
 	dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
 	//
 	PUSH_RANGE("Double", 1);
 	time = -seconds();
 	{
 		//
 		gradient_grid_general_DP_GPU<<<grid, threads>>>(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
 		//
 		CudaCheckError();
 	}
 	cudaDeviceSynchronize();
 	time += seconds();
 	printf("  ** GPU DP computation: %f s.\n", time);
 	ttime = -seconds();
 	cudasafer(cudaMemcpy(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost), "gradient_grid_general_double_GPU: cudaMemcpy gpu_x");
         cudasafer(cudaMemcpy(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost), "gradient_grid_general_double_GPU: cudaMemcpy gpu_y");
 	ttime += seconds();
 	mem_len = 2*nbgridcells_x*nbgridcells_y*sizeof(double);
 	printf("  ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime);
 	//
 	//
 	POP_RANGE;
 	//
 	cudasafer(cudaFree( lens_kernel)   , "gradient_grid_general_double_GPU: free lens_kernel");
         cudasafer(cudaFree( type_gpu)      , "gradient_grid_general_double_GPU: free type_gpu"); 
         cudasafer(cudaFree( lens_x_gpu)    , "gradient_grid_general_double_GPU: free lens_x_gpu");
         cudasafer(cudaFree( lens_y_gpu)    , "gradient_grid_general_double_GPU: free lens_y_gpu");
         cudasafer(cudaFree( b0_gpu)        , "gradient_grid_general_double_GPU: free b0_gpu");    
         cudasafer(cudaFree( angle_gpu)     , "gradient_grid_general_double_GPU: free angle_gpu");
         cudasafer(cudaFree( epot_gpu)      , "gradient_grid_general_double_GPU: free epot_gpu");
         cudasafer(cudaFree( rcore_gpu)     , "gradient_grid_general_double_GPU: free rcore_gpu");
         cudasafer(cudaFree( rcut_gpu)      , "gradient_grid_general_double_GPU: free rcut_gpu"); 
         cudasafer(cudaFree( anglecos_gpu)  , "gradient_grid_general_double_GPU: free anglecos_gpu");
         cudasafer(cudaFree( anglesin_gpu)  , "gradient_grid_general_double_GPU: free anglesin_gpu");
         cudasafer(cudaFree( frame_gpu)     , "gradient_grid_general_double_GPU: free frame_gpu");
 	//
 	cudasafer(cudaFreeHost(grid_grad_gpu_x), "gradient_grid_general_double_GPU: free grid_grad_gpu_x");
 	cudasafer(cudaFreeHost(grid_grad_gpu_y), "gradient_grid_general_double_GPU: free grid_grad_gpu_y");
 	//
 	free(lens_gpu);
 	//
 }
 
 
 void gradient_grid_general_mixed_GPU(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         grid_param *frame_gpu;
         Potential_SOA *lens_gpu,*lens_kernel;
         //double *grid_grad_x_gpu, *grid_grad_y_gpu ;
         lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
         //lens_gpu->type = (int *) malloc(sizeof(int));
         //
         int nhalos = Nlens;
         //
         // Allocate variables on the GPU
         //
         double atime = -seconds();
         //
         double* grid_grad_temp_x;  cudasafer(cudaMallocManaged((void**) &grid_grad_temp_x , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_x");
         double* grid_grad_temp_y;  cudasafer(cudaMallocManaged((void**) &grid_grad_temp_y , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_y");
         //
         double* grid_grad_euler_x; cudasafer(cudaMallocManaged((void**) &grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_euler_x");
         double* grid_grad_euler_y; cudasafer(cudaMallocManaged((void**) &grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_euler_y");
         //
 #if 1
         int *type_gpu;
         type_t *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu;
         //
         cudasafer(cudaMalloc( (void**)&(lens_kernel) , sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " );
         cudasafer(cudaMalloc( (void**)&(type_gpu)    , nhalos*sizeof(int))  ,"Gradientgpu.cu : Alloc type_gpu: " );
         cudasafer(cudaMalloc( (void**)&(lens_x_gpu)  , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc x_gpu: " );
         cudasafer(cudaMalloc( (void**)&(lens_y_gpu)  , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc y_gpu: " );
         cudasafer(cudaMalloc( (void**)&(b0_gpu)      , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc b0_gpu: " );
         cudasafer(cudaMalloc( (void**)&(angle_gpu)   , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc angle_gpu: " );
         cudasafer(cudaMalloc( (void**)&(epot_gpu)    , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc epot_gpu: " );
         cudasafer(cudaMalloc( (void**)&(rcore_gpu)   , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcore_gpu: " );
         cudasafer(cudaMalloc( (void**)&(rcut_gpu)    , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcut_gpu: " );
         cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglecos_gpu: " );
         cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglesin_gpu: " );
         cudasafer(cudaMalloc( (void**)&(frame_gpu)   , sizeof(grid_param))   ,"Gradientgpu.cu : Alloc frame_gpu: " );
         //
         // Copy values to the GPU
         //
         double ttime = -seconds();
         cudasafer(cudaMemcpy(type_gpu       , lens->type                 , nhalos*sizeof(int)   , cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy type_gpu: " );
         cudasafer(cudaMemcpy(lens_x_gpu     , lens->position_x           , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy x_gpu: " );
         cudasafer(cudaMemcpy(lens_y_gpu     , lens->position_y           , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy y_gpu: " );
         cudasafer(cudaMemcpy(b0_gpu         , lens->b0                   , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy b0_gpu: " );
         cudasafer(cudaMemcpy(angle_gpu      , lens->ellipticity_angle    , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy angle_gpu: " );
         cudasafer(cudaMemcpy(epot_gpu       , lens->ellipticity_potential, nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy epot_gpu: " );
         cudasafer(cudaMemcpy(rcore_gpu      , lens->rcore                , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcore_gpu: " );
         cudasafer(cudaMemcpy(rcut_gpu       , lens->rcut                 , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcut_gpu: " );
         cudasafer(cudaMemcpy(anglecos_gpu   , lens->anglecos             , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglecos: " );
         cudasafer(cudaMemcpy(anglesin_gpu   , lens->anglesin             , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglesin: " );
         cudasafer(cudaMemcpy(frame_gpu      , frame                      ,    sizeof(grid_param), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy fame_gpu: " );
         //
         lens_gpu->type                  = type_gpu;
         lens_gpu->position_x            = lens_x_gpu;
         lens_gpu->position_y            = lens_y_gpu;
         lens_gpu->b0                    = b0_gpu;
         lens_gpu->ellipticity_angle     = angle_gpu;
         lens_gpu->ellipticity_potential = epot_gpu;
         lens_gpu->rcore                 = rcore_gpu;
         lens_gpu->rcut                  = rcut_gpu;
         lens_gpu->anglecos              = anglecos_gpu;
         lens_gpu->anglesin              = anglesin_gpu;
         //
         cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice);
         //cudaDeviceSynchronize();
 #else
         gpu_allocate_SOA<type_t>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y);
 #endif
         //
         atime += seconds();
         long int mem_len = 10*nhalos*sizeof(type_t) + sizeof(grid_param) + sizeof(Potential_SOA);
         //
         printf("\n  ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, atime);
         //
         const int grid_dim = nbgridcells_x;
         //
         //point  image_point;
         //
         const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
         //
         //
         double time;
         //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y);
         int count = 0;
         //
         // Float computation
         //
         PUSH_RANGE("Float", 1);
         time = -seconds();
 	//
 #if 0
 #pragma omp parallel
 #pragma omp for
         for (int jj = jstart; jj < nbgridcells_y; ++jj)
                 for (int ii = istart; ii < nbgridcells_x; ++ii)
                 {
                         //
                         int index = jj*nbgridcells_x + ii;
                         //
                         struct point image_point;
                         image_point.x = frame->xmin + (ii + istart)*dx;
                         image_point.y = frame->ymin + (jj + jstart)*dy;
                         //
                         point Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens);
                         //
                         grid_grad_temp_x[index] = (double) Grad.x;
                         grid_grad_temp_y[index] = (double) Grad.y;
                         //
                 }
         //memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*sizeof(double));
         //memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_y*sizeof(double));
         //
 #else
         {
                 int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
                 int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
                 //
                 dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
                 dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
                 //
                 gradient_grid_general_SP_GPU<<<grid, threads>>>(grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
                 //
                 CudaCheckError();
                 cudaDeviceSynchronize();
         }
 #endif
         POP_RANGE;
         time += seconds();
         printf("  ** GPU Float computation: %f s.\n", time);
         //
         //
         //
 #if 0
         PUSH_RANGE("Euler GPU", 2);
         count = 0;
         time = -seconds();
         {
                 int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
                 int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
                 //
                 dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
                 dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
                 //
                 gradient_grid_GPU<<<grid, threads>>>(grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
                 //daxpy<<<grid, threads>>>(nbgridcells_x*nbgridcells_y, 1., grid_grad_temp_x, grid_grad_temp_y);
                 CudaCheckError();
                 //cudaDeviceSynchronize();
         }
         time += seconds();
         printf("** %d Euler computations: %f s.\n", gpu_count, time);
         POP_RANGE;
 #endif
         //
 #if 1
         //
         // SIS Potentials in double precision
         //
         PUSH_RANGE("SIS CPU", 2);
         count = 0;
         time = -seconds();
         {
                 //gradient_grid_SIS_patch(grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count);
         }
         time += seconds();
         printf("** %d SIS computations: %f s.\n", count, time);
         POP_RANGE;
         //
         //
         //
 #endif
         //
         // Euler approximation
         //
 #if 1
         //memset(grid_grad_x, nbgridcells_x*nbgridcells_y*sizeof(double));
         //memset(grid_grad_y, nbgridcells_x*nbgridcells_y*sizeof(double));
         //
         count = 0;
         int* euler_count;  CudaSafeCall(cudaMallocManaged((void**) &euler_count, sizeof(int)));
         time = -seconds();
         {
                 int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
                 int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
                 //
                 dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
                 dim3 grid   ( GRID_SIZE_X,  GRID_SIZE_Y);
                 //
                 gradient_grid_euler_GPU<<<grid, threads>>>(grid_grad_euler_x, grid_grad_euler_y, grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart);
                 //daxpy<<<grid, threads>>>(nbgridcells_x*nbgridcells_y, 1., grid_grad_temp_x, grid_grad_temp_y);
                 CudaCheckError();
                 cudaDeviceSynchronize();
         }
         time += seconds();
         printf("** %d Euler computations: %f s.\n", gpu_count, time);
         //
         memcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double));
         memcpy(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double));
         //
 #else
         memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*nbgridcells_y*sizeof(double));
         memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_x*nbgridcells_y*sizeof(double));
 #endif
         cudaFree(grid_grad_temp_x );
         cudaFree(grid_grad_temp_y );
         cudaFree(grid_grad_euler_x);
         cudaFree(grid_grad_euler_y);
         //
         // lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA));
         // lens_gpu->type = (int *) malloc(sizeof(int));
         //
         // Allocate variables on the GPU
         // cudasafer(cudaFree(lens_gpu), "free lens_kernel");
         //
         free(lens_gpu);
         cudasafer(cudaFree(lens_kernel) , "free kernel_gpu");
         cudasafer(cudaFree(type_gpu)    , "free type_gpu");
         cudasafer(cudaFree(lens_x_gpu)  , "free lens_x_gpu");
         cudasafer(cudaFree(lens_y_gpu)  , "free lens_y_gpu");
         cudasafer(cudaFree(b0_gpu)      , "free b0_gpu");
         cudasafer(cudaFree(angle_gpu)   , "free angle_gpu");
         cudasafer(cudaFree(epot_gpu)    , "free epot_gpu");
         cudasafer(cudaFree(rcore_gpu)   , "free rcore_gpu");
         cudasafer(cudaFree(rcut_gpu)    , "free rcut_gpu");
         cudasafer(cudaFree(anglecos_gpu), "free anglecos_gpu");
         cudasafer(cudaFree(anglesin_gpu), "free anglesin_gpu");
         cudasafer(cudaFree(frame_gpu)   , "free frame_gpu");
 }
 //
 //
 //
 void gradient_grid_mixed_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
 {
         double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1);
         double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1);
         //
         std::cout << "Calling general mixed GPU" << std::endl;
         gradient_grid_general_mixed_GPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, nbgridcells_x, nbgridcells_y, istart, jstart);
         //
 }
diff --git a/src/structure_hpc.hpp b/src/structure_hpc.hpp
index c4500c2..e7b85e8 100644
--- a/src/structure_hpc.hpp
+++ b/src/structure_hpc.hpp
@@ -1,546 +1,577 @@
 /**
 * @file   structure.h
 * @Author Thomas Jalabert, EPFL (me@example.com) , Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch)
 * @date   July 2015
 * @version 0,1
 * @brief  Header file to define the used structures (e.g. defined structs)
 * 
 * @param configuration file (parameters.par)
 * @return Depends on choice in the configuration file, e.g. least chi2 model
 */
 
 
 // Header guard
 #ifndef STRUCTURE_HPC_H
 #define STRUCTURE_HPC_H
 
 
 #include <iostream>
 #include "type.h"
 
 
 
 /*****************************************************************/
 /*                               */
 /* Constants: Will be migrated to constants.h when there are too many of them*/
 /*                               */
 /*****************************************************************/
 
 
 // GPU definitions
 #define threadsPerBlock 512 // threads for each set of images
 #define MAXIMPERSOURCE 20 // maximum number of multiple images for one source
 #define MAXIM 200 // maximum  total number of images treated
 
 // Dimensions definitions
 #define NPZMAX	9	/* maximum number of critical lines in g_cline struct*/
 //#define CLINESIZE	500	/* maximum number of critical and caustic points for Cline mode. Adjust depending on RAM*/
 #define NPOTFILESIZE 2000 //maximum number of potential in potfiles
 #define DMIN	1e-4	// distance minimale de convergence dans le plan image (in arcsec)	
 #define NITMAX 	100
 #define NTYPES 	2
 
 
 // gNFW definitions
 #define gNFW_ARRAY_SIZE 1800 // Set the dimension of the gnfw table gNFW_ARRAY_SIZE, must be 1800 for the current table file
 
 // Filename definition
 #define FILENAME_SIZE 50  // size of a filename in .par file
 
 //constants definition
 
 #define pi_c2  7.209970e-06	/* pi en arcsecond/ c^2 =  648000/vol/vol */
 #define cH2piG  0.11585881	/* cH0/2piG en g/cm^2 (H0=50) */
 #define cH4piG  0.057929405	/* cH0/4piG en g/cm^2 (H0=50) */
 #define cH0_4piG  2.7730112e-4	/* cH0/4piG en 10^12 M_Sol/kpc^2 (H0=50) */
 #define d0	29.068701	/* vol/h0*1000/rta -- c/H0 en h-1.kpc/arcsecond  (h0=50)*/
 
 #define MCRIT12	.2343165	/* c^3/4piGh0/RTA/RTA in 1e12 M_sol/arcsec^2 (h0=50) */
 
 /*****************************************************************/
 /*                               */
 /* 			Types definition			*/
 /*                               */
 /*****************************************************************/
 
 /** @brief Point: Structure of 2 coordinates
  * 
  * @param x: X coordinate
  * @param y: Y coordinate
  */
 #ifdef __WITH_LENSTOOL
 #include "structure.h"
 #else
 struct point    
 {
 	type_t x;
 	type_t y;
 };
 
 /** @brief Complex: Structure of 2 doubles
  * @param re: Real Part
  * @param im: Imaginary Part
  */
 struct complex    
 {
 	type_t re;
 	type_t im;
 };
 /** @brief Segment: Structure of two points
  */
 struct segment    
 {
 	point a;	
 	point b;
 };
 
 /** @brief triplet: Structure of three points defining a triangle
  */
 struct triplet
 {
 	struct point a;
 	struct point b;
 	struct point c;
 };
 
 /** @brief bitriplet: Defines two linked triangles (one in source plane and one in image plane)
  * @param i: Triangle on image plane
  * @param s: Triangle on source plane 
  */
 struct bitriplet
 {
 	struct triplet i;	
 	struct triplet s;
 };
 
 /** @brief contains the table needed to compute the potential derivatives of general NFW profile
  */
 typedef struct
 {
 	double alpha_now, x_now, kappa, dpl;
 } gNFWdata;
 
 /** @brief Matrix: 2by2 doubles
  */
 struct matrix
 {
 	type_t  a;
 	type_t  b;
 	type_t  c;
 	type_t  d;
 };
 
 /** @brief ellipse: for shape computation
  * @param a: semimajor axis
  * @param b: semiminor axis
  * @param theta: shape ellipticity
  */
 struct ellipse
 {
 	type_t  a;
 	type_t  b;
 	type_t  theta;
 };
 
 #endif
 
 /** @brief Storage type for sources, lenses and arclets
  * @param center: position of the center of galaxy
  * @param shape: shape of galaxy
  * @param mag: magnitude
  * @param redshift: redshift
  * @param dls: Distance lens-source
  * @param dos: Distance observer-source
  * @param dr: dls/dos
  */
 
 struct galaxy
 {
 	//char    name[IDSIZE];
 	struct point    center;		
 	struct ellipse  shape;		
 	type_t  mag;
 	type_t  redshift;
 	type_t  dls;
 	type_t  dos;
 	type_t  dr;
 };
 
 
 /** @brief Contains the information for optimising a parameter in the inverse mode
  * @param block: blockorfree variable (whether a parameter is blocked or free for the mcmc algorithm)
  * @param min: lower optimisation value
  * @param max: upper optimisation value
  * @param sigma: optimisation step (MIGHT NOT BE TAKEN INTO ACCOUNT)
  */
 struct optimize_block
 {
 	int block; 			
 	type_t min;
 	type_t max;
 	type_t sigma;
 };
 /** @brief two optimize_block to simulate a point
  */
 struct optimize_point    // blockorfree for the point structure
 {
 	struct optimize_block x;
 	struct optimize_block y;
 };
 
 /** @brief Contains the information for optimising the potential in the inverse mode
  * @param position: position of the center of the halo
  * @param weight: weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW)
  * @param b0: Impact parameter
  * @param ellipticity_angle: orientation of the clump
  * @param ellipticity: Mass ellipticity
  * @param ellipticity_potential: Potential ellipticity
  * @param rcore: PIEMD specific value
  * @param rcut: PIEMD specific value
  * @param rscale: scale radius for NFW, Einasto
  * @param exponent: exponent for Einasto
  * @param vdisp: Dispersion velocity
  * @param alpha: exponent for general NFW
  * @param einasto_kappacritic: critical kappa for Einasto profile
  * @param z: redshift
  */
 struct potentialoptimization  // block or free variable for the MCMC for the potential
 {
 	struct optimize_point position;					
 	struct optimize_block weight;
 	struct optimize_block  b0; 						
 	struct optimize_block ellipticity_angle;
 	struct optimize_block ellipticity;				
 	struct optimize_block ellipticity_potential; 	
 	struct optimize_block rcore;					
 	struct optimize_block rcut;						
 	struct optimize_block rscale;
 	struct optimize_block exponent;
 	struct optimize_block vdisp;					
 	struct optimize_block alpha;
 	struct optimize_block einasto_kappacritic;
 	struct optimize_block z;						
 	
 
 };
 
 /** @brief Contains the information of the lens potential
  * @param type: 1=PIEMD , 2=NFW, 3=SIES, 4=point mass, 5=SIS, 8=PIEMD
  * @param type_name: IEMD, NFW, SIES, point
  * @param name: name of the clump (e.g. name of the galaxy) : not compulsory
  * @param position: position of the center of the halo
  * @param weight: weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW)
  * @param b0: Impact parameter
  * @param ellipticity_angle:
  * @param ellipticity: Mass ellipticity
  * @param ellipticity_potential: Potential ellipticity
  * @param rcore: PIEMD specific value
  * @param rcut: PIEMD specific value
  * @param rscale: scale radius for NFW, Einasto
  * @param exponent: exponent for Einasto
  * @param vdisp: Dispersion velocity
  * @param alpha: exponent for general NFW
  * @param einasto_kappacritic: critical kappa for Einasto profile
  * @param z: redshift
  */
 
+
+#if 1
+struct Potential_SOA
+{
+        int*    type; // 1=PIEMD ; 2=NFW; 3=SIES, 4=point mass
+        char    type_name[10]; // PIEMD, NFW, SIES, point
+        type_t*     name; // name of the clump (e.g. name of the galaxy) : not compulsory
+        //struct point position; // position of the center of the halo
+        type_t* __restrict__ position_x; // position of the center of the halo
+        type_t* __restrict__ position_y; // position of the center of the halo
+        type_t* __restrict__ weight; // weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW)
+        type_t* __restrict__ b0; // Impact parameter
+        type_t* __restrict__ vdisp;  //Dispersion velocity
+        type_t* __restrict__ ellipticity_angle; // orientation of the clump
+        type_t* __restrict__ ellipticity; // ellipticity of the mass distribition
+        type_t* __restrict__ ellipticity_potential; //ellipticity of the potential
+        type_t* __restrict__ rcore;  // core radius
+        type_t* __restrict__ rcut; // cut radius
+        type_t* __restrict__ rscale; // scale radius for NFW, Einasto
+        type_t* __restrict__ exponent; // exponent for Einasto
+        type_t* __restrict__ alpha; // exponent for general NFW
+        type_t* __restrict__ einasto_kappacritic; // critical kappa for Einasto profile
+        type_t* __restrict__ z; // redshift of the clump
+        type_t* __restrict__ mag; //magnitude
+        type_t* __restrict__ lum; //luminosity
+        type_t* __restrict__ theta; //theta
+        type_t* __restrict__ anglecos; //theta precomputation of cosinus and sinus values
+        type_t* __restrict__ anglesin; //theta
+        type_t* __restrict__ sigma; // sigma
+};
+#else
 struct Potential_SOA
 {
 	int*    type; // 1=PIEMD ; 2=NFW; 3=SIES, 4=point mass
 	char    type_name[10]; // PIEMD, NFW, SIES, point
 	type_t*     name; // name of the clump (e.g. name of the galaxy) : not compulsory
 	//struct point position; // position of the center of the halo
-	type_t* position_x; // position of the center of the halo
+	const type_t* __restrict__ position_x; // position of the center of the halo
 	type_t* position_y; // position of the center of the halo
 	type_t* weight; // weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW)
 	type_t* b0; // Impact parameter
 	type_t*	vdisp;	//Dispersion velocity
 	type_t* ellipticity_angle; // orientation of the clump
 	type_t* ellipticity; // ellipticity of the mass distribition
 	type_t* ellipticity_potential; //ellipticity of the potential
 	type_t* rcore;  // core radius
 	type_t* rcut; // cut radius
 	type_t* rscale; // scale radius for NFW, Einasto
 	type_t*	exponent; // exponent for Einasto
 	type_t* alpha; // exponent for general NFW
 	type_t* einasto_kappacritic; // critical kappa for Einasto profile
 	type_t* z; // redshift of the clump
 	type_t* mag; //magnitude
 	type_t* lum; //luminosity
 	type_t* theta; //theta
 	type_t* anglecos; //theta precomputation of cosinus and sinus values
 	type_t* anglesin; //theta
 	type_t* sigma; // sigma
 };
-
+#endif
 
 struct Potential
 {
        int    type; // 1=PIEMD ; 2=NFW; 3=SIES, 4=point mass
        char    type_name[10]; // PIEMD, NFW, SIES, point
        char    name[20]; // name of the clump (e.g. name of the galaxy) : not compulsory
        struct  point position; // position of the center of the halo
        type_t  weight; // weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW)
        type_t  b0; // Impact parameter
        type_t  vdisp;  //Dispersion velocity
        type_t  ellipticity_angle; // orientation of the clump
        type_t  ellipticity; // ellipticity of the mass distribition
        type_t  ellipticity_potential; //ellipticity of the potential
        type_t  rcore;  // core radius
        type_t  rcut; // cut radius
        type_t  rscale; // scale radius for NFW, Einasto
        type_t  exponent; // exponent for Einasto
        type_t  alpha; // exponent for general NFW
        type_t  einasto_kappacritic; // critical kappa for Einasto profile
        type_t  z; // redshift of the clump
        type_t  mag; //magnitude
        type_t  lum; //luminosity
        type_t  theta; //theta
        type_t  sigma; // sigma
 };
 
 
 /*****************************************************************/
 /*                               */
 /*      Control structure definition        */
 /*                               */
 /*****************************************************************/
 
 /** @brief Control structure for runmode parameters
  * 
  * Default values are set in module_readParameters_readRunmode
  *
  * @param nbgridcells: Number of grid cells
  * @param source: flag for simple source to image conversion
  * @param sourfile: file name for source information
  * @param image: flag for simple image to source conversion
  * @param imafile: file name for image information
  * @param mass: flag for mass fitsfile
  * @param massgridcells: Nb of cell for fitsfile
  * @param z_mass: redshift for which to be computed
  * @param z_mass_s: redshift of source for which effect of mass will be computed
  * @param potential: flag for potential fitsfile
  * @param potgridcells: Nb of cell for fitsfile
  * @param z_pot: redshift for which to be computed
  * @param dpl: flag for displacement fitsfile
  * @param dplgridcells: Nb of cell for fitsfile
  * @param z_dpl: redshift for which to be computed
  * @param inverse: flag for inversion mode (MCMC etc,)
  * @param arclet: flag for arclet mode 
  * @param debug: flag for debug mode
  * @param nimagestotal: total number of lensed images in file
  * @param nsets: number of sources attributed to the lensed images
  * @param nhalo: Number of halos
  * @param grid: 0 for automatic grid (not implemented), 1 for grid infor applying on source plane, 2 for grid information applying on image plane
  * @param nbgridcells: Number of grid cells
  * @param zgrid: redshift of grid
  * @param cline: flag for critical and caustic line calculation
  */
  
 struct runmode_param
 {
 	int 	nbgridcells;
 	//Source Mode
 	int     source;
 
 	int 	nsets;
 	//Image Mode
 	int     image;
 
 	int		nimagestot;
 	//Mult Mode
 	int     multi;
 	//Mass Mode
 	int		mass;
 	int		mass_gridcells;
 	type_t	z_mass;
 	type_t	z_mass_s;
 	//Potential Mode
 	int		potential;
 	int		pot_gridcells;
 	type_t	z_pot;
 	int 	nhalos;
 	//Potfile Mode
 
 	int		potfile;
 	int		npotfile;
 	//displacement Mode
 	int		dpl;
 	int		dpl_gridcells;
 	type_t	z_dpl;
 	//Inverse Mode
 	int     inverse; 
 	//Arclet Mode
 	int     arclet; 
 	//Debug Mode      
 	int 	debug;	
 	//Grid Mode
 	int 	grid;
 	int 	gridcells;
 	type_t 	zgrid;
 	//Critic and caustic mode
 	int		cline;
 	//Amplification Mode
 	int 	amplif;
 	int 	amplif_gridcells;
 	type_t 	z_amplif;
 	//Time/Benchmark mode
 	int		time;
 	  //SOA variables
 	 int Nlens[NTYPES];
 	 std::string    imagefile;
 	 std::string		potfilename;
 	 std::string    sourfile;
 };
 
 /** @brief Not used yet
  * 
  */
 struct image_param
 {
 
 };
 
 /** @brief Not used yet
  * 
  */
 struct source_param
 {
 
 };
 
 /** @brief Contains Grid information
  */
 
 struct grid_param
 {
 	type_t  xmin;
 	type_t  xmax;
 	type_t  ymin;
 	type_t  ymax;
 	type_t  lmin;
 	type_t  lmax;
 	type_t  rmax;
 };
 
 /** @brief Control structure for cosmological parameters
  * 
  * @param model: Cosmological model
  * @param omegaM: 
  * @param omegaX: 
  * @param curvature: curvature parameter
  * @param wX: 
  * @param wa: 
  * @param H0: Hubble constant
  * @param h: H0/50
  */
 
 struct cosmo_param  
 {
     int     model;            	
     type_t  omegaM;
     type_t  omegaX;
     type_t  curvature;
     type_t  wX;
     type_t  wa;
     type_t  H0;
     type_t  h;
 };
 
 /** @brief Control structure for potfile parameters
  *
  *  @param     	potid:  1: pot P, 2: pot Q
 	@param     	ftype:
 	@param  	potfile[FILENAME_SIZE];
 	@param     	type;
 	@param  	zlens;
 	@param  	core;CCC
 	@param  	corekpc;
 	@param  	mag0;
 	@param     	select;
 	@param     	ircut;
 	@param  	cut, cut1, cut2;
 	@param  	cutkpc1, cutkpc2;
 	@param     	isigma;
 	@param  	sigma, sigma1, sigma2;
 	@param     	islope;
 	@param  	slope, slope1, slope2;
 	@param     	ivdslope;
 	@param  	vdslope, vdslope1, vdslope2;
 	@param     	ivdscat;
 	@param  	vdscat, vdscat1, vdscat2;
 	@param     	ircutscat;
 	@param  	rcutscat, rcutscat1, rcutscat2;
 	@param     	ia;   // scaling relation of msm200
 	@param  	a, a1, a2;
 	@param     	ib;   // scaling relation of msm200
 	@param  	b, b1, b2;
 
  */
 
 struct  potfile_param
 {
     int     potid;   // 1: pot P, 2: pot Q
 	int     ftype;
 	char    potfile[FILENAME_SIZE];
 	int     type;
 	type_t  zlens;
 	type_t  core;
 	type_t  corekpc;
 	type_t  mag0;
 	int     select;
 	int     ircut;
 	type_t  cut, cut1, cut2;
 	type_t  cutkpc1, cutkpc2;
 	int     isigma;
 	type_t  sigma, sigma1, sigma2;
 	int     islope;
 	type_t  slope, slope1, slope2;
 	int     ivdslope;
 	type_t  vdslope, vdslope1, vdslope2;
 	int     ivdscat;
 	type_t  vdscat, vdscat1, vdscat2;
 	int     ircutscat;
 	type_t  rcutscat, rcutscat1, rcutscat2;
 	int     ia;   // scaling relation of msm200
 	type_t  a, a1, a2;
 	int     ib;   // scaling relation of msm200
 	type_t  b, b1, b2;
 	int npotfile;
 };
 
 /** @brief Control structure for caustic and critical lines
  * 
  * @param nplan: number of sourceplanes for which the caustic lines will be computed
  * @param cz: redshift values array for respective sourceplanes
  * @param dos: distcosmo1 to redshift z
  * @param dls: distcosmo2 between lens[0] and z
  * @param dlsds:    ratio of dl0s/dos
  * @param limitLow: minimum size of the squares in MARCHINGSQUARES
  * @param dmax: Size of search area
  * @param xmin: 
  * @param xmax: 
  * @param ymin:    
  * @param ymax: 
  * @param limithigh: maximum size of the squares in MARCHINGSQUARES algorithm
  * @param nbgridcells: nbgridcells * nbgridcells = number of pixels for critical line computation
 */
 
 struct cline_param
 {
 	int     nplan;
 	type_t  cz[NPZMAX];
 	type_t  dos[NPZMAX];	// distcosmo1 to redshift z
 	type_t  dls[NPZMAX];	// distcosmo2 between lens[0] and z
 	type_t  dlsds[NPZMAX];	// ratio of dl0s/dos
 	type_t  limitLow;   // minimum size of the squares in MARCHINGSQUARES or initial step size in SNAKE
 	type_t  dmax;
 	type_t  xmin;
 	type_t  xmax;
 	type_t  ymin;
 	type_t  ymax;
 	type_t  limitHigh; // maximum size of the squares in MARCHINGSQUARES algorithm
 	int nbgridcells; // nbgridcells * nbgridcells = number of pixels for critical line computation
 };
 
 #endif // if STRUCTURE_H