diff --git a/Benchmarks/GridGradientBenchmark/main.cpp b/Benchmarks/GridGradientBenchmark/main.cpp
index 4825916..82dbce7 100644
--- a/Benchmarks/GridGradientBenchmark/main.cpp
+++ b/Benchmarks/GridGradientBenchmark/main.cpp
@@ -1,644 +1,644 @@
 /**
 * @file   main.cpp
 * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch)
 * @date   October 2016
 * @brief  Benchmark for gradhalo function
 */
 
 #include <iostream>
 #include <iomanip>
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
 #include <fstream>
 #include <sys/stat.h>
 #include <unistd.h>
 //
 //#include <mm_malloc.h>
 #include <omp.h>
 //
 //#include <cuda_runtime.h>
 #include <structure_hpc.hpp>
 #include "timer.h"
 #include "gradient.hpp"
 #include "chi_CPU.hpp"
 #include "module_cosmodistances.hpp"
 #include "module_readParameters.hpp"
 #include "allocation.hpp"
 #ifdef __WITH_MPI
 #include <mpi.h>
 #endif
 #ifdef __WITH_GPU
 #warning "GPU support enabled"
 #include "cudafunctions.cuh"
 #include "grid_gradient_GPU.cuh"
 #include "assign_gpu_to_rank.hpp"
 #include "module_readParameters_GPU.cuh"
 #include "allocation_GPU.cuh"
 #endif
 //
 #include "grid_gradient_CPU.hpp"
 #include "setup.hpp"
 //
 #ifdef __WITH_LENSTOOL
 #warning "linking with lenstool..."
 #include<fonction.h>
 #include<constant.h>
 #include<dimension.h>
 #include<structure.h>
 //
 #include <ittnotify.h>
 //
 struct g_mode   M;
 struct g_pot    P[NPOTFILE];
 struct g_pixel  imFrame, wFrame, ps, PSF;
 struct g_cube   cubeFrame;
 struct g_dyn    Dy;      //   //TV
 //
 struct g_source S;
 struct g_image  I;
 struct g_grille G;
 struct g_msgrid H;  // multi-scale grid
 struct g_frame  F;
 struct g_large  L;
 struct g_cosmo  C;
 struct g_cline  CL;
 struct g_observ O;
 struct pot      lens[NLMAX];
 struct pot      lmin[NLMAX], lmax[NLMAX], prec[NLMAX];
 struct g_cosmo  clmin, clmax;       /*cosmological limits*/
 struct galaxie  smin[NFMAX], smax[NFMAX];       // limits on source parameters
 struct ipot     ip;
 struct MCarlo   mc;
 struct vfield   vf;
 struct vfield   vfmin,vfmax; // limits on velocity field parameters
 struct cline    cl[NIMAX];
 lensdata *lens_table;
 //
 int    block[NLMAX][NPAMAX];      /*switch for the lens optimisation*/
 int    cblock[NPAMAX];                /*switch for the cosmological optimisation*/
 int    sblock[NFMAX][NPAMAX];                /*switch for the source parameters*/
 int    vfblock[NPAMAX];                /*switch for the velocity field parameters*/
 double excu   [NLMAX][NPAMAX];
 double excd   [NLMAX][NPAMAX];
 /* supplments tableaux de valeurs pour fonctions g pour Einasto
  *  * Ce sont trois variables globales qu'on pourra utiliser dans toutes les fonctions du projet
  *  */
 
 #define CMAX 20
 #define LMAX 80
 
 
 
 float Tab1[LMAX][CMAX];
 float Tab2[LMAX][CMAX];
 float Tab3[LMAX][CMAX];
 
 
 int      nrline, ntline, flagr, flagt;
 long int  narclet;
 
 struct point    gimage[NGGMAX][NGGMAX], gsource_global[NGGMAX][NGGMAX];
 struct biline   radial[NMAX] , tangent[NMAX];
 struct galaxie  arclet[NAMAX], source[NFMAX], image[NFMAX][NIMAX];
 struct galaxie  cimage[NFMAX];
 struct pointgal gianti[NPMAX][NIMAX];
 
 struct point    SC;
 double elix;
 double alpha_e;
 
 double *v_xx;
 double *v_yy;
 double **map_p;
 double **tmp_p;
 double **map_axx;
 double **map_ayy;
 #endif
 
 #define NTIMES 1
 double timings[NTIMES];
 
 void 
 average_stdev(double* average, double* stdev)
 {
 	*average = 0.;
 	*stdev   = 0.;	
 
 	if (NTIMES == 1)
 	{
 		*average = timings[0];
 		*stdev   = 0.;
 	}
 	else
 	{
 		for (int iter = 0; iter < NTIMES; ++iter)
 			*average += timings[iter];
 		*average /= NTIMES;
 		for (int iter = 0; iter < NTIMES; ++iter)
 			*stdev += (timings[iter] - *average)*(timings[iter] - *average);
 		//
 		*stdev = sqrt(*stdev/(NTIMES - 1));
 	}
 }
 
 //void
 //gradient_grid_GPU_sorted(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells);
 //
 //
 int module_readCheckInput_readInput(int argc, char *argv[])
 {
 	/// check if there is a correct number of arguments, and store the name of the input file in infile
 
 	char* infile;
 	struct stat  file_stat;
 
 	// If we do not have 3 arguments, stop
 	if ( argc != 3 )
 	{
 		fprintf(stderr, "\nUnexpected number of arguments\n");
 		fprintf(stderr, "\nUSAGE:\n");
 		fprintf(stderr, "lenstool  input_file  output_directorypath  [-n]\n\n");
 		exit(-1);
 	}
 	else if ( argc == 3 )
 		infile=argv[1];
 	std::ifstream ifile(infile,std::ifstream::in); // Open the file
 
 
 	int ts = (int) time (NULL);
 	char buffer[10];
 	std::stringstream ss;
 	ss << ts;
 	std::string trimstamp = ss.str();
 	//
 	std::string outdir = argv[2];
 	outdir += "-";
 	outdir += trimstamp;
 	std::cout << "Output dir:	" << outdir << std::endl;
 
 	// check whether the output directory already exists
 	if (stat(outdir.c_str(), &file_stat) < 0){
 		mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH );
 	}
 	else 
 	{
 		printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]);
 		exit(-1);
 	}
 
 	// check whether the input file exists. If it could not be opened (ifile = 0), it does not exist
 	if (ifile) ifile.close();
 	else
 	{
 		printf("The file %s does not exist, please specify a valid file name\n",infile);
 		exit(-1);
 	}
 
 	return 0;
 }
 //
 //
 //
 int main(int argc, char *argv[])
 {
 	//
 	// Setting Up the problem
 	//
 
 	// This module function reads the terminal input when calling LENSTOOL and checks that it is correct
 	// Otherwise it exits LENSTOOL
 	// 
 	char cwd[1024];
 	if (getcwd(cwd, sizeof(cwd)) != NULL)
 		fprintf(stdout, "Current working dir: %s\n", cwd);
 	//
 	int world_rank = 0, world_size=1;
 #ifdef __WITH_MPI
 	MPI_Init(&argc, &argv);
 	//MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 	//MPI_Comm_size(MPI_COMM_WORLD, &world_size);
         char processor_name[MPI_MAX_PROCESSOR_NAME];
         int name_len;
         MPI_Get_processor_name(processor_name, &name_len);
         //MPI_Barrier(MPI_COMM_WORLD);
 #endif
 	int verbose = (world_rank == 0);
         int numthreads = 1;
 #ifdef _OPENMP
 #warning "using openmp"
 #pragma omp parallel
         numthreads = omp_get_num_threads();
 #endif
         //
         if (verbose) printf("\n --- Lenstool-HPC --- \n\n"); fflush(stdout);
         //
 	//
 #ifdef __WITH_MPI
         MPI_Barrier(MPI_COMM_WORLD);
 #endif
         printf("Hello world from processor %s, rank %d out of %d processors and %d threads per rank\n", processor_name, world_rank, world_size, numthreads); fflush(stdout);
 #ifdef __WITH_GPU
 	//assign_gpu_to_local_rank();
 #endif
 	if (verbose) module_readCheckInput_readInput(argc, argv);
 	//
 	// This module function reads the cosmology parameters from the parameter file
 	// Input: struct cosmologicalparameters cosmology, parameter file
 	// Output: Initialized cosmology struct
 
 	cosmo_param cosmology;  // Cosmology struct to store the cosmology data from the file
 	std::string inputFile = argv[1];   // Input file
 	module_readParameters_readCosmology(inputFile, cosmology);
 	//
 	// This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file.
 	// The runmode_param stores the information of what exactly the user wants to do with lenstool.
 	struct runmode_param runmode;
 	module_readParameters_readRunmode(inputFile, &runmode);
 	runmode.debug = 1;
 	if (runmode.debug) 
 	{
 		module_readParameters_debug_cosmology(runmode.debug, cosmology);
 		module_readParameters_debug_runmode(runmode.debug, runmode);
 	}
 	//
 	//=== Declaring variables
 	//
 	// AOS
 	int lsize = runmode.nhalos;//+ *runmode.npotfile - 1;
 	struct Potential* lenses_AOS = (struct Potential*) malloc(lsize*sizeof(struct Potential));
 	//void module_readParameters_Potential(std::string infile, Potential lens[], int nhalos)
 
 	// SOA
 	struct Potential_SOA* lenses_SOA;
 	struct cline_param cline;
 	struct potfile_param potfile;
 	//
 #ifdef __WITH_GPU
         struct grid_param* frame; cudaMallocManaged(&frame, sizeof(struct grid_param));
 #else
         struct grid_param* frame = (struct grid_param*) malloc(sizeof(struct grid_param));
 #endif
 	//struct Potential potfilepotentials[runmode.npotfile];
 	//
 	//
 	//
 	// This module function reads in the potential form and its parameters (e.g. NFW)
 	// Input: input file
 	// Output: Potentials and its parameters
 	//
 #ifdef __WITH_GPU
         if (verbose) printf("Allocating Unified Memory... ");fflush(stdout);
         //cudaMallocManaged(&lenses_SOA, sizeof(struct Potential_SOA));
         PotentialSOAAllocation_GPU(&lenses_SOA, runmode.nhalos);
         if (verbose) printf("lenses_SOA = %p\n", lenses_SOA);
-        //module_readParameters_PotentialSOA_noalloc_GPU(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology);		//invalid ordinal problem
-        module_readParameters_PotentialSOA_noalloc(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology);
+        module_readParameters_PotentialSOA_noalloc(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology);		//parses the input file into lenses_SOA; sufficient on its own if the SM architecture is lower than 6.x
+        //module_readParameters_PotentialSOA_noalloc_GPU(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology);		//prefetch only (no parsing): enable it in addition to the line above; gives an "invalid ordinal" error on GPUs with SM architecture lower than 6.x
         printf("ok.\n"); fflush(stdout);
 #else
         if (verbose) printf("Allocating Memory... ");fflush(stdout);
         //Potential_SOA = (struct Potential_SOA) malloc(sizeof(struct Potential_SOA));
         PotentialSOAAllocation(&lenses_SOA, runmode.nhalos);
         module_readParameters_PotentialSOA_noalloc(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology);
         printf("ok.\n"); fflush(stdout);
 #endif
 	//if (runmode.debug) module_readParameters_debug_potential_SOA( lenses_SOA, runmode.n_tot_halos);
 
 
 	//if (runmode.debug) module_readParameters_debug_potential_SOA( lenses_SOA, runmodlse.nhalos);
 	//module_readParameters_PotentialSOA_direct(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology);
 	//printf("before read parameters\n");fflush(stdout);
 	//module_readParameters_Potential(inputFile, lenses_AOS, runmode.nhalos);
 	//module_readParameters_Potential(inputFile, lenses_AOS, lsize);
 	//printf("after read parameters\n");fflush(stdout);
 	//Converts to SOA
 	//module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.nhalos);
 	//module_readParameters_debug_potential(runmode.debug, lenses_AOS, runmode.nhalos);
 	//module_readParameters_debug_potential(runmode.debug, lenses    , runmode.nhalos);
 	// This module function reads in the potfiles parameters
 	// Input: input file
 	// Output: Potentials from potfiles and its parameters
 	// 	
 	if (runmode.potfile == 1 )
 	{
 		module_readParameters_readpotfiles_param(inputFile, &potfile, cosmology);
 		if (runmode.debug) module_readParameters_debug_potfileparam(runmode.debug, &potfile);
 		module_readParameters_readpotfiles_SOA(&runmode, &cosmology,&potfile,lenses_SOA);
 		if (runmode.debug) module_readParameters_debug_potential_SOA( lenses_SOA, runmode.n_tot_halos);
 
 	}
 
 	module_readParameters_lens_dslds_calculation(&runmode,&cosmology,lenses_SOA);
 	//
 	// This module function reads in the grid form and its parameters
 	// Input: input file
 	// Output: grid and its parameters
 	//
 	module_readParameters_Grid(inputFile, frame);
 	//
 	//
 	//
 	double t_1,t_2,t_3,t_4;
 	//
 	// Lenstool-CPU Grid-Gradient
 	//
 	//Setting Test:
 	type_t dx, dy;
 	int grid_dim = runmode.nbgridcells;
 	//
 	dx = (frame->xmax - frame->xmin)/(runmode.nbgridcells-1);
 	dy = (frame->ymax - frame->ymin)/(runmode.nbgridcells-1);
 	//
 #ifdef __WITH_LENSTOOL
 	double *grid_grad_x_LT, *grid_grad_y_LT;
 	if (world_rank == 0)
 	{
 		if (verbose) std::cout << "CPU Test Lenstool    ...\t\t ";
 		//if (verbose) printf("	Setting up lenstool using %d lenses...", runmode.n_tot_halos); fflush(stdout);
 		convert_to_LT(lenses_SOA, runmode.n_tot_halos);
 		//if (verbose) printf("ok\n");
 		struct point Grad;
 		grid_grad_x_LT = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 		grid_grad_y_LT = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 		//
 		double dlsds = 1.;
 		double zs    = 1.;
 		double average = 0., stdev = 0.;
 		for (int iter = 0; iter < NTIMES; ++iter)
 		{
 			memset(grid_grad_x_LT, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 			memset(grid_grad_y_LT, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 			//
 			double t_lt = -myseconds();
 #pragma omp parallel for
 			for (int jj = 0; jj < runmode.nbgridcells; ++jj)
 				for (int ii = 0; ii < runmode.nbgridcells; ++ii)
 				{
 					//  (index < grid_dim*grid_dim)
 
 					int index = jj*runmode.nbgridcells + ii;
 
 					struct point image_point;
 					image_point.x = frame->xmin + ii*dx;
 					image_point.y = frame->ymin + jj*dy;
 					G.nlens = runmode.n_tot_halos;
 #if 0
 					Grad = e_grad_pot(&image_point, lens);
 					grid_grad_x_LT[index] = Grad.x;
 					grid_grad_y_LT[index] = Grad.y;
 
 #else
 					for (int lens = 0; lens < runmode.n_tot_halos; ++lens)
 					{
 						struct point Grad = e_grad_pot(&image_point, lens);
 						//
 						grid_grad_x_LT[index] += Grad.x;
 						grid_grad_y_LT[index] += Grad.y;
 					}
 #endif
 					//
 				}
 			t_lt += myseconds();
 			timings[iter] = t_lt;
 		}
 		average_stdev(&average, &stdev);
 		if (verbose) std::cout << "   Average time = " << average << " stdev = " << stdev << std::endl;
 	}
 #endif
 	//
 	// MPI definitions
 	//
 	int grid_size     = runmode.nbgridcells;
 	int loc_grid_size = runmode.nbgridcells/world_size;
 	//
 	double y_len      = fabs(frame->ymax - frame->ymin);
 	int    y_len_loc  = runmode.nbgridcells/world_size;
 	int    y_pos_loc  = (int) world_rank*y_len_loc;
 	int    y_bound    = y_len_loc;
 	//
 	// CPU SOA benchmark
 	//
 	type_t* grid_grad_x_cpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 	type_t* grid_grad_y_cpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 	//
 	memset(grid_grad_x_cpu, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 	memset(grid_grad_y_cpu, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 	//
 	if (verbose) std::cout << "CPU Test Lenstool_hpc...\t\t ";
 	//
 	{
 		double average = 0., stdev = 0.;
 		//__SSC_MARK(0x111); // start SDE tracing, note it uses 2 underscores
 		//__itt_resume(); // start VTune, again use 2 underscores
 		for (int iter = 0; iter < NTIMES; ++iter)
 		{
 			double t_1 = -myseconds();
 			gradient_grid_CPU(grid_grad_x_cpu, grid_grad_y_cpu, frame, lenses_SOA, runmode.n_tot_halos, runmode.nbgridcells);
 			t_1 += myseconds();
 			std::cout << "CBLA...\t\t " << NTIMES << std::endl;
 			timings[iter] = t_1;
 		}
 		//__itt_pause(); // stop VTune
 		//__SSC_MARK(0x222); // stop SDE tracing
 		average_stdev(&average, &stdev);
 		if (verbose) std::cout << "   Average time = " << average << " stdev = " << stdev << std::endl;
 	}
         //
         // CPU AOS benchmark
         //
         //
 #ifdef __WITH_LENSTOOL_AOS
 		convert_SOA_to_AOS(lenses_AOS, lenses_SOA, runmode.nhalos); 
 
         type_t* grid_grad_x_cpu_AOS = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
         type_t* grid_grad_y_cpu_AOS = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
         //
         memset(grid_grad_x_cpu_AOS, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
         memset(grid_grad_y_cpu_AOS, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 
         if (verbose) std::cout << "CPU Test Lenstool_hpc AOS...\t\t ";
         //
         {
                 double average = 0., stdev = 0.;
                 for (int iter = 0; iter < NTIMES; ++iter)
                 {
                         double t_1 = -myseconds();
                         gradient_grid_CPU_AOS(grid_grad_x_cpu_AOS, grid_grad_y_cpu_AOS, frame, &lenses_AOS[0], runmode.n_tot_halos, runmode.nbgridcells);
                         t_1 += myseconds();
                         timings[iter] = t_1;
                 }
                 average_stdev(&average, &stdev);
                 if (verbose) std::cout << "   Average time = " << average << " stdev = " << stdev << std::endl;
         }
 #endif
 	//if (verbose) std::cout << " Time = " << std::setprecision(15) << t_1 << std::endl;
 	//type_t *grid_gradient_x, *grid_gradient_y;
 	//
 	// GPU section 
 	//
 #ifdef __WITH_GPU
 #warning "using GPUs..."
 	//
 	// GPU benchmark
 	//
 	if (verbose) std::cout << "GPU Test... "; fflush(stdout);
 	//
 	//type_t* grid_grad_x_gpu = (type_t *) malloc((int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 	//type_t* grid_grad_y_gpu = (type_t *) malloc((int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 	type_t* grid_grad_x_gpu; cudaMallocManaged(&grid_grad_x_gpu, (int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 	type_t* grid_grad_y_gpu; cudaMallocManaged(&grid_grad_y_gpu, (int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t));
 	//
 	cudaMemset(grid_grad_x_gpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 	cudaMemset(grid_grad_y_gpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 	//
 	//grid_grad_x_gpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 	//grid_grad_y_gpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t));
 	//
 	//printf("%d %d %d %d\n", runmode.n_tot_halos, grid_size, y_pos_loc, y_bound);
 	t_2 = -myseconds();
 	for (int iter = 0; iter < NTIMES; ++iter)
 	{
 		gradient_grid_GPU_UM(grid_grad_x_gpu, grid_grad_y_gpu, frame, lenses_SOA, runmode.n_tot_halos, dx, dy, grid_size, y_bound, 0, y_pos_loc);
 		cudaDeviceSynchronize();
 	}
 	t_2 += myseconds();
 	t_2 /= NTIMES;
 	//
 	cudasafe(cudaGetLastError(), "gradient_grid_GPU_UM");
 	//
 	if (verbose) std::cout << " Time  " << std::setprecision(15) << t_2 << std::endl;
 	//
 #endif
 	std::ofstream myfile;
 #ifdef __WITH_LENSTOOL 
 	//
 	type_t norm_x_LT = 0.;
 	type_t norm_y_LT = 0.;
 	type_t sum_x_LT  = 0.;
 	type_t sum_y_LT  = 0.;
 	//
 	type_t norm_x_LT_AOS = 0.;
 	type_t norm_y_LT_AOS = 0.;
 	//
 	if (world_rank == 0)
 	{
 		//
 		for (int ii = 0; ii < grid_dim*grid_dim; ++ii)
 		{
 			type_t gx = grid_grad_x_LT[ii];
 			type_t gy = grid_grad_y_LT[ii];
 			//
 			sum_x_LT      += gx*gx;
 			sum_y_LT      += gy*gy;
 
 			//
 			type_t c_x     = grid_grad_x_cpu[ii];
 			type_t c_y     = grid_grad_y_cpu[ii];
 			//
 			norm_x_LT     += (gx - grid_grad_x_cpu[ii])*(gx - grid_grad_x_cpu[ii]);
 			norm_y_LT     += (gy - grid_grad_y_cpu[ii])*(gy - grid_grad_y_cpu[ii]);
 			//
 			norm_x_LT_AOS += (gx - grid_grad_x_cpu_AOS[ii])*(gx - grid_grad_x_cpu_AOS[ii]);
 			norm_y_LT_AOS += (gy - grid_grad_y_cpu_AOS[ii])*(gy - grid_grad_y_cpu_AOS[ii]);
 			//printf("%.15f = %.15f\n", gx, grid_grad_x_cpu[ii]);
 		}
 		//
 	}
 #ifdef __WITH_GPU
 	type_t norm_x_gpu_LT = 0.;
 	type_t norm_y_gpu_LT = 0.;
 	//
 	for (int ii = 0; ii < grid_dim*grid_dim; ++ii)
 	{
 		type_t g_x = grid_grad_x_LT[ii];
 		type_t g_y = grid_grad_y_LT[ii];
 		//
 		type_t c_x = grid_grad_x_gpu[ii];
 		type_t c_y = grid_grad_y_gpu[ii];
 		//
 		norm_x_gpu_LT += (grid_grad_x_LT[ii] - grid_grad_x_gpu[ii])*(grid_grad_x_LT[ii] - grid_grad_x_gpu[ii]);
 		norm_y_gpu_LT += (grid_grad_y_LT[ii] - grid_grad_y_gpu[ii])*(grid_grad_y_LT[ii] - grid_grad_y_gpu[ii]);
 	}
 	//
 #endif
 #endif
 	/*
 #ifdef __WITH_LENSTOOL
 	//
 	type_t norm_x_LT = 0.;
 	type_t norm_y_LT = 0.;
 	//
 	type_t sum_x_LT  = 0.;
 	type_t sum_y_LT  = 0.;
 	//
 	for (int ii = 0; ii < grid_dim*grid_dim; ++ii)
 	{
 	//
 	sum_x_LT += grid_grad_x_cpu[ii]*grid_grad_x_cpu[ii];
 	sum_y_LT += grid_grad_y_cpu[ii]*grid_grad_y_cpu[ii];
 	//
 	norm_x_LT += (grid_grad_x_cpu[ii] - grid_grad_x_LT[ii])*(grid_grad_x_cpu[ii] - grid_grad_x_LT[ii]);
 	norm_y_LT += (grid_grad_y_cpu[ii] - grid_grad_y_LT[ii])*(grid_grad_y_cpu[ii] - grid_grad_y_LT[ii]);
 	//
 	}
 #endif
 */
 	//type_t sum_x_cpu_2000 = 4761763143.24101;
 	//type_t sum_y_cpu_2000 = 5412618205.81843;
 	type_t sum_x_cpu  = 0.;
 	type_t sum_y_cpu  = 0.;
 	//
 #ifdef __WITH_GPU
 	type_t norm_x_gpu = 0.;
 	type_t norm_y_gpu = 0.;
 	//
 	type_t sum_x_gpu  = 0.;
 	type_t sum_y_gpu  = 0.;
 	//
 	for (int ii = 0; ii < grid_dim*grid_dim; ++ii)
 	{
 		//
 		//if (fabs(grid_grad_x_gpu[ii] - grid_grad_x_cpu[ii]) > 1e-10)
 		//	printf("Error = %.15f %.15f, %.15f\n", grid_grad_x_gpu[ii], grid_grad_x_cpu[ii], fabs(grid_grad_x_gpu[ii] - grid_grad_x_cpu[ii]));
 		sum_x_cpu += grid_grad_x_cpu[ii]*grid_grad_x_cpu[ii];
 		sum_y_cpu += grid_grad_y_cpu[ii]*grid_grad_y_cpu[ii];
 
 		sum_x_gpu += grid_grad_x_gpu[ii]*grid_grad_x_gpu[ii];
 		sum_y_gpu += grid_grad_y_gpu[ii]*grid_grad_y_gpu[ii];
 		//
 		norm_x_gpu += (grid_grad_x_cpu[ii] - grid_grad_x_gpu[ii])*(grid_grad_x_cpu[ii] - grid_grad_x_gpu[ii]);
 		norm_y_gpu += (grid_grad_y_cpu[ii] - grid_grad_y_gpu[ii])*(grid_grad_y_cpu[ii] - grid_grad_y_gpu[ii]);
 		//
 	}
 	//printf("norm_gpu = %f %f, sum = %f %f\n", norm_x_gpu, norm_x_gpu, sum_x_gpu, sum_y_gpu);
 
 #endif
 
 #ifdef __WITH_LENSTOOL
 	if (verbose) std::cout << "  l2 difference norm cpu-LT     = " << std::setprecision(15) << norm_x_LT << " " << std::setprecision(15) << norm_y_LT << std::endl;
 	if (verbose) std::cout << "  l2 difference norm cpu-LT AOS = " << std::setprecision(15) << norm_x_LT_AOS << " " << std::setprecision(15) << norm_y_LT_AOS << std::endl;
 #ifdef __WITH_GPU
 	if (verbose) std::cout << "  l2 difference norm gpu-LT     = " << std::setprecision(15) << norm_x_gpu_LT << " " << std::setprecision(15) << norm_y_gpu_LT << std::endl;
 	if (verbose) std::cout << "  l2 difference norm cpu-gpu    = " << std::setprecision(15) << norm_x_gpu << " " << std::setprecision(15) << norm_y_gpu << std::endl;
 #endif
 #endif
 	//if (verbose) std::cout << "Pour 2000  sum x cpu = " << std::setprecision(15) << sum_x_cpu_2000 << " sum_y_cpu = " << std::setprecision(15) << sum_y_cpu_2000 << std::endl;
 	if (verbose) std::cout << "  sum x cpu = " << std::setprecision(15) << sum_x_cpu << " sum_y_cpu = " << std::setprecision(15) << sum_y_cpu << std::endl;
 #ifdef __WITH_GPU
 	if (verbose) std::cout << "  sum x gpu = " << std::setprecision(15) << sum_x_gpu << " sum_y_gpu = " << std::setprecision(15) << sum_y_gpu << std::endl;
 #endif
 	//
 	if (verbose) std::cout << "Exiting..." << std::endl;
 	//
 #ifdef __WITH_MPI
 	MPI_Finalize();
 #endif
 
 #ifdef __WITH_GPU
 	cudaFree(lenses_SOA);
 	cudaFree(frame);
 #else
 	free(lenses_SOA);
 	free(frame);
 #endif
 }
diff --git a/src/module_readParameters_GPU.cu b/src/module_readParameters_GPU.cu
index aa64531..b21a80d 100644
--- a/src/module_readParameters_GPU.cu
+++ b/src/module_readParameters_GPU.cu
@@ -1,41 +1,71 @@
 #include "grid_gradient_GPU.cuh"
 #include "gradient.hpp"
 #include "gradient_GPU.cuh"
 #include "gradient.hpp"
 
 #include "module_readParameters.hpp"
 
 extern void cudasafe( cudaError_t error, const char* message);
 
 void checkCudaError(cudaError_t error)
 {
   if(error != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(error));
 }
 //
 //
 //
+/**
+ * @brief Prefetch the unified-memory lens data (Potential_SOA) to the current GPU.
+ *
+ * This function does not parse @p infile: the call to
+ * module_readParameters_PotentialSOA_noalloc() is disabled below, so @p lens_SOA
+ * must already have been filled by the caller. Without __WITH_UM the body is empty.
+ *
+ * The prefetch is skipped on devices that do not report
+ * cudaDevAttrConcurrentManagedAccess (e.g. GPUs with an SM architecture lower than
+ * 6.x), where cudaMemPrefetchAsync() fails with an "invalid device ordinal" error.
+ *
+ * @param infile       parameter file name (currently unused)
+ * @param lens_SOA     lens structure; it and its arrays are assumed to be managed allocations (TODO confirm)
+ * @param nhalos       number of elements prefetched from each per-lens array
+ * @param n_tot_halos  currently unused
+ * @param cosmology    currently unused
+ */
 void module_readParameters_PotentialSOA_noalloc_GPU(std::string infile, Potential_SOA *lens_SOA, int nhalos, int n_tot_halos, cosmo_param cosmology)
 {
-	module_readParameters_PotentialSOA_noalloc(infile, lens_SOA, nhalos, n_tot_halos, cosmology);
+	//module_readParameters_PotentialSOA_noalloc(infile, lens_SOA, nhalos, n_tot_halos, cosmology);
 #if defined(__WITH_UM)
-// let's prefetch the data in the GPU memory
+	// Let's prefetch the data in the GPU memory. __CUDA_ARCH__ is only defined while
+	// compiling device code, so a "#if (__CUDA_ARCH__ >= 600)" guard would always
+	// drop this host code: the capability has to be queried at run time instead.
 #warning "Prefetching data on GPU"
-	checkCudaError(cudaMemPrefetchAsync(lens_SOA 				, sizeof(Potential_SOA), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->position_x		, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->position_y		, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->anglecos			, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->anglesin			, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity_potential	, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->rcore			, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->rcut			, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->b0			, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity_angle		, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity		, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->vdisp			, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->z				, nhalos*sizeof(type_t), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->N_types			, nhalos*sizeof(int), 0));
-        checkCudaError(cudaMemPrefetchAsync(lens_SOA->type			, nhalos*sizeof(int), 0));
+	int device = 0;
+	int concurrentManagedAccess = 0;
+	checkCudaError(cudaGetDevice(&device));
+	checkCudaError(cudaDeviceGetAttribute(&concurrentManagedAccess, cudaDevAttrConcurrentManagedAccess, device));
+	if (concurrentManagedAccess != 0)
+	{
+		const size_t realBytes = nhalos*sizeof(type_t);
+		const size_t intBytes  = nhalos*sizeof(int);
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->position_x, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->position_y, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->anglecos, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->anglesin, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity_potential, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->rcore, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->rcut, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->b0, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity_angle, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->vdisp, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->z, realBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->N_types, intBytes, device));
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA->type, intBytes, device));
+		// The struct itself goes last: the host reads lens_SOA->... above, and each of
+		// those reads would pull the struct back to the host if it had been moved first.
+		checkCudaError(cudaMemPrefetchAsync(lens_SOA, sizeof(Potential_SOA), device));
+	}
 #endif
 
 }