diff --git a/src/grid_gradient_GPU.cu b/src/grid_gradient_GPU.cu
index b7076c8..b25f4d3 100644
--- a/src/grid_gradient_GPU.cu
+++ b/src/grid_gradient_GPU.cu
@@ -1,357 +1,358 @@
//
//
//
#include
#include "grid_gradient_GPU.cuh"
#include "gradient_GPU.cuh"
//
#define BLOCK_SIZE_X 32
#define BLOCK_SIZE_Y 16
//
//#define ROT
//
#define _SHARED_MEM
//
#ifdef _SHARED_MEM
#define SHARED __shared__
#warning "shared memory"
extern __shared__ type_t shared[];
#else
#define SHARED
#endif
//
#define Nx 1
#define Ny 0
//
#define cudasafe
//
extern "C"
{
	type_t myseconds();
}
//
-__global__ void module_potentialDerivatives_totalGradient_SOA_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int nhalos);
+__global__
+void module_potentialDerivatives_totalGradient_SOA_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int nhalos);
//
// void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, type_t dx, type_t dy, int nbgridcells_x, int nbgridcells_y, int istart, int jstart);
// void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
// void gradient_grid_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, type_t dx, type_t dy, int nbgridcells_x, int nbgridcells_y, int istart, int jstart);
//
//void
//module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos);
//
//
//
void gradient_grid_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells)
{
	type_t dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
	type_t dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
	//
	gradient_grid_GPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, dx, dy, nbgridcells, nbgridcells, 0, 0);
}
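//
// Illustrative sketch (not part of the original file): `cudasafe` above expands
// to nothing, so `cudasafe(call, "msg")` becomes the comma expression
// `(call, "msg")` and every CUDA error code below is silently discarded.
// A checking variant could look like the following; the name `cudasafe_check`
// is hypothetical and assumes <cstdio>/<cstdlib> are available.
#if 0
static void cudasafe_check(cudaError_t err, const char *msg)
{
	// Report the failing call's label and the CUDA error text, then abort.
	if (err != cudaSuccess)
	{
		fprintf(stderr, "%s%s\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}
#endif
//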
Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells_x) * (nbgridcells_y) *sizeof(type_t)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells_x) * (nbgridcells_y) *sizeof(type_t)),"Gradientgpu.cu : Alloc source_y_gpu: " ); // Copy values to the GPU // cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(type_t),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(type_t), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(type_t), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(type_t), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(type_t),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(type_t),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(type_t), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(type_t),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(type_t), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); // lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; // cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); // type_t time = -myseconds(); //module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens_kernel, nbgridcells_x, nhalos); module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens_kernel, nhalos, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart); // //cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU"); cudaDeviceSynchronize(); time += myseconds(); //std::cout << " kernel time = " << time << " s." 
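//
// Illustrative sketch (not from the original file): the function above uses the
// SoA "mirror struct" idiom: fill per-member device arrays, point a host copy
// of the struct at them, then memcpy the struct so the kernel receives device
// pointers. The struct `Vec2_SOA` and all names here are hypothetical.
#if 0
struct Vec2_SOA { type_t *x; type_t *y; };
static Vec2_SOA *make_device_soa(const type_t *host_x, const type_t *host_y, int n)
{
	Vec2_SOA host_mirror, *dev_struct;
	// Device arrays for each member, filled from the host.
	cudaMalloc((void **) &host_mirror.x, n*sizeof(type_t));
	cudaMalloc((void **) &host_mirror.y, n*sizeof(type_t));
	cudaMemcpy(host_mirror.x, host_x, n*sizeof(type_t), cudaMemcpyHostToDevice);
	cudaMemcpy(host_mirror.y, host_y, n*sizeof(type_t), cudaMemcpyHostToDevice);
	// The struct itself lives on the device and carries device pointers.
	cudaMalloc((void **) &dev_struct, sizeof(Vec2_SOA));
	cudaMemcpy(dev_struct, &host_mirror, sizeof(Vec2_SOA), cudaMemcpyHostToDevice);
	return dev_struct;
}
#endif
//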
<< std::endl; // cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells_x)*(nbgridcells_y)*sizeof(type_t), cudaMemcpyDeviceToHost )," --- Gradientgpu.cu : Copy source_x_gpu: " ); cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells_x)*(nbgridcells_y)*sizeof(type_t), cudaMemcpyDeviceToHost)," --- Gradientgpu.cu : Copy source_y_gpu: " ); // //printf("-----> %f %f \n",grid_grad_x[Nx], grid_grad_y[Ny]); // Free GPU memory cudaFree(lens_gpu); cudaFree(type_gpu); cudaFree(lens_x_gpu); cudaFree(lens_y_gpu); cudaFree(b0_gpu); cudaFree(angle_gpu); cudaFree(epot_gpu); cudaFree(rcore_gpu); cudaFree(rcut_gpu); cudaFree(anglecos_gpu); cudaFree(anglesin_gpu); cudaFree(grid_grad_x_gpu); cudaFree(grid_grad_y_gpu); } #if 0 __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_loc(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { // //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // type_t grad_x, grad_y; type_t clumpgrad_x, clumpgrad_y; type_t image_point_x, image_point_y; // SHARED type_t cosi [200]; SHARED type_t sinu [200]; SHARED type_t rci [200]; SHARED type_t b0 [200]; SHARED type_t epsi [200]; SHARED type_t position_x[200]; SHARED type_t position_y[200]; SHARED type_t rsqe [200]; //SHARED type_t sgrad_x [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; //SHARED type_t sgrad_y [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; //SHARED type_t sonepeps [200]; //SHARED type_t sonemeps [200]; // grad_x = 0; grad_y = 0; // int row = blockIdx.y*blockDim.y + threadIdx.y; int col = blockIdx.x*blockDim.x + threadIdx.x; // //int loc_row = threadIdx.x; //int loc_col = threadIdx.y*blockDim.x + threadIdx.x; // //int grid_size = nbgridcells/blockDim.y; // //if (threadIdx.x == 0) printf("%d %d %d: row = %d, col = %d, grid_size = %d\n", blockIdx.y, gridDim.y, threadIdx.y, row, col, grid_size); // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); //if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy); // image_point_x = frame->xmin + col*dx; image_point_y = frame->ymin + row*dy; // //int iloc = threadIdx.x*blockDim.y + threadIdx.y; int iglob = row*nbgridcells + col; int numThreads = blockDim.x*blockDim.y; // for (int i = 0; i < (nhalos + numThreads - 1)/numThreads; ++i) { int iloc = threadIdx.y*blockDim.x + threadIdx.x + i*numThreads; if (iloc < nhalos) { cosi[iloc] = __ldg(&lens->anglecos [shalos + iloc]); sinu[iloc] = __ldg(&lens->anglesin [shalos + iloc]); position_x[iloc] = __ldg(&lens->position_x [shalos + iloc]); position_y[iloc] = __ldg(&lens->position_y [shalos + iloc]); rci[iloc] = __ldg(&lens->rcore [shalos + iloc]); b0[iloc] = __ldg(&lens->b0 [shalos + iloc]); epsi[iloc] = __ldg(&lens->ellipticity_potential[shalos + iloc]); rsqe[iloc] = sqrt(epsi[iloc]); } } __syncthreads(); // if ((row < nbgridcells) && (col < nbgridcells)) { // for(int i = 0; i < nhalos; i++) { //int index = iloc; #if 1 type_t rc = rci[i]; type_t eps = epsi[i]; type_t onemeps = 1 - eps; type_t onepeps = 1 + eps; // type_t sqe = rsqe[i]; type_t cx1 = onemeps/onepeps; // // //type_t zci_im = 1; type_t zci_im = -0.5*(1. 
void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_gpu, int nhalos, type_t dx, type_t dy, int nbgridcells_x, int nbgridcells_y, int istart, int jstart)
{
	struct point grad, clumpgrad;
	//
	grad.x = clumpgrad.x = 0.;
	grad.y = clumpgrad.y = 0.;
	//
	int shalos = 0;
	//
	int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
	int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
	//
	printf("grid_size_x = %d, grid_size_y = %d, nbgridcells_x = %d, nbgridcells_y = %d, istart = %d, jstart = %d (split)\n", GRID_SIZE_X, GRID_SIZE_Y, nbgridcells_x, nbgridcells_y, istart, jstart);
	//
	dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
	dim3 grid   (GRID_SIZE_X , GRID_SIZE_Y);
	//
	int count = nhalos;
	//printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (double) (8*nhalos + BLOCK_SIZE_X*nbgridcells/BLOCK_SIZE_Y)*sizeof(double));
	printf("nhalos = %d, size of shared memory = %lf (split)\n", nhalos, (type_t) (8*nhalos + BLOCK_SIZE_X*BLOCK_SIZE_Y)*sizeof(type_t));
	//
	cudaMemset(grid_grad_x, 0, nbgridcells_x*nbgridcells_y*sizeof(type_t));
	cudaMemset(grid_grad_y, 0, nbgridcells_x*nbgridcells_y*sizeof(type_t));
	//
	//module_potentialDerivatives_totalGradient_SOA_GPU<<<grid, threads>>> (grid_grad_x, grid_grad_y, lens, frame, nhalos, nbgridcells_x);
	module_potentialDerivatives_totalGradient_SOA_GPU<<<grid, threads>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nhalos, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart);
	cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU_8_SOA_GPU");
	//
	cudaDeviceSynchronize();
	printf("GPU kernel done...\n");
}
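//
// Illustrative note (not from the original file): both launchers size the grid
// with the usual ceiling division, so partial tiles at the frame edge still get
// a block; e.g. nbgridcells_x = 1000 with BLOCK_SIZE_X = 32 gives
// (1000 + 31)/32 = 32 blocks, and the kernel's bounds check drops the excess
// threads. A tiny helper (hypothetical name `ceil_div`) makes that explicit.
#if 0
static inline int ceil_div(int n, int block)
{
	// Smallest number of blocks covering n items.
	return (n + block - 1)/block;
}
// dim3 grid(ceil_div(nbgridcells_x, BLOCK_SIZE_X), ceil_div(nbgridcells_y, BLOCK_SIZE_Y));
#endif
//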
//
//
//
void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos)
{
	struct point grad, clumpgrad;
	//
	grad.x = clumpgrad.x = 0.;
	grad.y = clumpgrad.y = 0.;
	//
	int shalos = 0;
	//
	int GRID_SIZE_X = (nbgridcells + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks
	int GRID_SIZE_Y = (nbgridcells + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y;
	//printf("grid_size_x = %d, grid_size_y = %d\n", GRID_SIZE_X, GRID_SIZE_Y);
	//
	type_t* timer = (type_t *) malloc((int) nbgridcells*nbgridcells*sizeof(type_t));
	type_t* dtimer;
	//cudasafe(cudaMalloc( (void**)&(dtimer), (int) nbgridcells*nbgridcells*sizeof(type_t)), "Gradientgpu.cu : totalGradient_SOA_CPU_GPU: ");
	//
	//{
	//dim3 dimBlock(BLOCK_SIZE_X/1, BLOCK_SIZE_Y);
	//dim3 dimGrid (GRID_SIZE_X   , GRID_SIZE_Y);
	dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1);
	dim3 grid   (GRID_SIZE_X , GRID_SIZE_Y);
	//
	int count = nhalos;
	//printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (type_t) (8*nhalos + BLOCK_SIZE_X*nbgridcells/BLOCK_SIZE_Y)*sizeof(type_t));
	printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (type_t) (8*nhalos + BLOCK_SIZE_X*BLOCK_SIZE_Y)*sizeof(type_t));
	//
	cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(type_t));
	cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(type_t));
	//
	module_potentialDerivatives_totalGradient_SOA_GPU<<<grid, threads>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, nhalos);
	cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU_8_SOA_GPU");
	//}
	cudaDeviceSynchronize();
	printf("GPU kernel done...\n");
	free(timer); // host scratch buffer, never handed to the GPU in this build
}
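//
// Illustrative usage sketch (not from the original file): a caller is expected
// to fill a grid_param frame and a host-side Potential_SOA elsewhere in the
// project, then let gradient_grid_GPU handle all device traffic. The driver
// function and buffer names below are hypothetical.
#if 0
void example_driver(const struct Potential_SOA *lens, const struct grid_param *frame, int nhalos)
{
	const int nbgridcells = 1000;
	// One deflection-gradient value per grid cell and per component.
	type_t *gx = (type_t *) malloc(nbgridcells*nbgridcells*sizeof(type_t));
	type_t *gy = (type_t *) malloc(nbgridcells*nbgridcells*sizeof(type_t));
	gradient_grid_GPU(gx, gy, frame, lens, nhalos, nbgridcells);
	// ... consume gx/gy ...
	free(gx);
	free(gy);
}
#endif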