diff --git a/Benchmarks/GridGradientBenchmark/main.cpp b/Benchmarks/GridGradientBenchmark/main.cpp index 4825916..82dbce7 100644 --- a/Benchmarks/GridGradientBenchmark/main.cpp +++ b/Benchmarks/GridGradientBenchmark/main.cpp @@ -1,644 +1,644 @@ /** * @file main.cpp * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch) * @date October 2016 * @brief Benchmark for gradhalo function */ #include #include #include #include #include #include #include #include // //#include #include // //#include #include #include "timer.h" #include "gradient.hpp" #include "chi_CPU.hpp" #include "module_cosmodistances.hpp" #include "module_readParameters.hpp" #include "allocation.hpp" #ifdef __WITH_MPI #include #endif #ifdef __WITH_GPU #warning "GPU support enabled" #include "cudafunctions.cuh" #include "grid_gradient_GPU.cuh" #include "assign_gpu_to_rank.hpp" #include "module_readParameters_GPU.cuh" #include "allocation_GPU.cuh" #endif // #include "grid_gradient_CPU.hpp" #include "setup.hpp" // #ifdef __WITH_LENSTOOL #warning "linking with lenstool..." 
#include #include #include #include // #include // struct g_mode M; struct g_pot P[NPOTFILE]; struct g_pixel imFrame, wFrame, ps, PSF; struct g_cube cubeFrame; struct g_dyn Dy; // //TV // struct g_source S; struct g_image I; struct g_grille G; struct g_msgrid H; // multi-scale grid struct g_frame F; struct g_large L; struct g_cosmo C; struct g_cline CL; struct g_observ O; struct pot lens[NLMAX]; struct pot lmin[NLMAX], lmax[NLMAX], prec[NLMAX]; struct g_cosmo clmin, clmax; /*cosmological limits*/ struct galaxie smin[NFMAX], smax[NFMAX]; // limits on source parameters struct ipot ip; struct MCarlo mc; struct vfield vf; struct vfield vfmin,vfmax; // limits on velocity field parameters struct cline cl[NIMAX]; lensdata *lens_table; // int block[NLMAX][NPAMAX]; /*switch for the lens optimisation*/ int cblock[NPAMAX]; /*switch for the cosmological optimisation*/ int sblock[NFMAX][NPAMAX]; /*switch for the source parameters*/ int vfblock[NPAMAX]; /*switch for the velocity field parameters*/ double excu [NLMAX][NPAMAX]; double excd [NLMAX][NPAMAX]; /* supplments tableaux de valeurs pour fonctions g pour Einasto * * Ce sont trois variables globales qu'on pourra utiliser dans toutes les fonctions du projet * */ #define CMAX 20 #define LMAX 80 float Tab1[LMAX][CMAX]; float Tab2[LMAX][CMAX]; float Tab3[LMAX][CMAX]; int nrline, ntline, flagr, flagt; long int narclet; struct point gimage[NGGMAX][NGGMAX], gsource_global[NGGMAX][NGGMAX]; struct biline radial[NMAX] , tangent[NMAX]; struct galaxie arclet[NAMAX], source[NFMAX], image[NFMAX][NIMAX]; struct galaxie cimage[NFMAX]; struct pointgal gianti[NPMAX][NIMAX]; struct point SC; double elix; double alpha_e; double *v_xx; double *v_yy; double **map_p; double **tmp_p; double **map_axx; double **map_ayy; #endif #define NTIMES 1 double timings[NTIMES]; void average_stdev(double* average, double* stdev) { *average = 0.; *stdev = 0.; if (NTIMES == 1) { *average = timings[0]; *stdev = 0.; } else { for (int iter = 0; iter < NTIMES; 
++iter) *average += timings[iter]; *average /= NTIMES; for (int iter = 0; iter < NTIMES; ++iter) *stdev += (timings[iter] - *average)*(timings[iter] - *average); // *stdev = sqrt(*stdev/(NTIMES - 1)); } } //void //gradient_grid_GPU_sorted(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells); // // int module_readCheckInput_readInput(int argc, char *argv[]) { /// check if there is a correct number of arguments, and store the name of the input file in infile char* infile; struct stat file_stat; // If we do not have 3 arguments, stop if ( argc != 3 ) { fprintf(stderr, "\nUnexpected number of arguments\n"); fprintf(stderr, "\nUSAGE:\n"); fprintf(stderr, "lenstool input_file output_directorypath [-n]\n\n"); exit(-1); } else if ( argc == 3 ) infile=argv[1]; std::ifstream ifile(infile,std::ifstream::in); // Open the file int ts = (int) time (NULL); char buffer[10]; std::stringstream ss; ss << ts; std::string trimstamp = ss.str(); // std::string outdir = argv[2]; outdir += "-"; outdir += trimstamp; std::cout << "Output dir: " << outdir << std::endl; // check whether the output directory already exists if (stat(outdir.c_str(), &file_stat) < 0){ mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH ); } else { printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]); exit(-1); } // check whether the input file exists. 
If it could not be opened (ifile = 0), it does not exist if (ifile) ifile.close(); else { printf("The file %s does not exist, please specify a valid file name\n",infile); exit(-1); } return 0; } // // // int main(int argc, char *argv[]) { // // Setting Up the problem // // This module function reads the terminal input when calling LENSTOOL and checks that it is correct // Otherwise it exits LENSTOOL // char cwd[1024]; if (getcwd(cwd, sizeof(cwd)) != NULL) fprintf(stdout, "Current working dir: %s\n", cwd); // int world_rank = 0, world_size=1; #ifdef __WITH_MPI MPI_Init(&argc, &argv); //MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); //MPI_Comm_size(MPI_COMM_WORLD, &world_size); char processor_name[MPI_MAX_PROCESSOR_NAME]; int name_len; MPI_Get_processor_name(processor_name, &name_len); //MPI_Barrier(MPI_COMM_WORLD); #endif int verbose = (world_rank == 0); int numthreads = 1; #ifdef _OPENMP #warning "using openmp" #pragma omp parallel numthreads = omp_get_num_threads(); #endif // if (verbose) printf("\n --- Lenstool-HPC --- \n\n"); fflush(stdout); // // #ifdef __WITH_MPI MPI_Barrier(MPI_COMM_WORLD); #endif printf("Hello world from processor %s, rank %d out of %d processors and %d threads per rank\n", processor_name, world_rank, world_size, numthreads); fflush(stdout); #ifdef __WITH_GPU //assign_gpu_to_local_rank(); #endif if (verbose) module_readCheckInput_readInput(argc, argv); // // This module function reads the cosmology parameters from the parameter file // Input: struct cosmologicalparameters cosmology, parameter file // Output: Initialized cosmology struct cosmo_param cosmology; // Cosmology struct to store the cosmology data from the file std::string inputFile = argv[1]; // Input file module_readParameters_readCosmology(inputFile, cosmology); // // This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file. // The runmode_param stores the information of what exactly the user wants to do with lenstool. 
struct runmode_param runmode; module_readParameters_readRunmode(inputFile, &runmode); runmode.debug = 1; if (runmode.debug) { module_readParameters_debug_cosmology(runmode.debug, cosmology); module_readParameters_debug_runmode(runmode.debug, runmode); } // //=== Declaring variables // // AOS int lsize = runmode.nhalos;//+ *runmode.npotfile - 1; struct Potential* lenses_AOS = (struct Potential*) malloc(lsize*sizeof(struct Potential)); //void module_readParameters_Potential(std::string infile, Potential lens[], int nhalos) // SOA struct Potential_SOA* lenses_SOA; struct cline_param cline; struct potfile_param potfile; // #ifdef __WITH_GPU struct grid_param* frame; cudaMallocManaged(&frame, sizeof(struct grid_param)); #else struct grid_param* frame = (struct grid_param*) malloc(sizeof(struct grid_param)); #endif //struct Potential potfilepotentials[runmode.npotfile]; // // // // This module function reads in the potential form and its parameters (e.g. NFW) // Input: input file // Output: Potentials and its parameters // #ifdef __WITH_GPU if (verbose) printf("Allocating Unified Memory... 
");fflush(stdout); //cudaMallocManaged(&lenses_SOA, sizeof(struct Potential_SOA)); PotentialSOAAllocation_GPU(&lenses_SOA, runmode.nhalos); if (verbose) printf("lenses_SOA = %p\n", lenses_SOA); - //module_readParameters_PotentialSOA_noalloc_GPU(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology); //invalid ordinal problem - module_readParameters_PotentialSOA_noalloc(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology); + module_readParameters_PotentialSOA_noalloc(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology); //use this line instead if the SM architecture is lower than 6.x + //module_readParameters_PotentialSOA_noalloc_GPU(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology); //invalid ordinal problem on GPUs with an SM architecture lower than 6.x printf("ok.\n"); fflush(stdout); #else if (verbose) printf("Allocating Memory... ");fflush(stdout); //Potential_SOA = (struct Potential_SOA) malloc(sizeof(struct Potential_SOA)); PotentialSOAAllocation(&lenses_SOA, runmode.nhalos); module_readParameters_PotentialSOA_noalloc(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology); printf("ok.\n"); fflush(stdout); #endif //if (runmode.debug) module_readParameters_debug_potential_SOA( lenses_SOA, runmode.n_tot_halos); //if (runmode.debug) module_readParameters_debug_potential_SOA( lenses_SOA, runmodlse.nhalos); //module_readParameters_PotentialSOA_direct(inputFile, lenses_SOA, runmode.nhalos, runmode.n_tot_halos, cosmology); //printf("before read parameters\n");fflush(stdout); //module_readParameters_Potential(inputFile, lenses_AOS, runmode.nhalos); //module_readParameters_Potential(inputFile, lenses_AOS, lsize); //printf("after read parameters\n");fflush(stdout); //Converts to SOA //module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.nhalos); //module_readParameters_debug_potential(runmode.debug, lenses_AOS, runmode.nhalos); 
//module_readParameters_debug_potential(runmode.debug, lenses , runmode.nhalos); // This module function reads in the potfiles parameters // Input: input file // Output: Potentials from potfiles and its parameters // if (runmode.potfile == 1 ) { module_readParameters_readpotfiles_param(inputFile, &potfile, cosmology); if (runmode.debug) module_readParameters_debug_potfileparam(runmode.debug, &potfile); module_readParameters_readpotfiles_SOA(&runmode, &cosmology,&potfile,lenses_SOA); if (runmode.debug) module_readParameters_debug_potential_SOA( lenses_SOA, runmode.n_tot_halos); } module_readParameters_lens_dslds_calculation(&runmode,&cosmology,lenses_SOA); // // This module function reads in the grid form and its parameters // Input: input file // Output: grid and its parameters // module_readParameters_Grid(inputFile, frame); // // // double t_1,t_2,t_3,t_4; // // Lenstool-CPU Grid-Gradient // //Setting Test: type_t dx, dy; int grid_dim = runmode.nbgridcells; // dx = (frame->xmax - frame->xmin)/(runmode.nbgridcells-1); dy = (frame->ymax - frame->ymin)/(runmode.nbgridcells-1); // #ifdef __WITH_LENSTOOL double *grid_grad_x_LT, *grid_grad_y_LT; if (world_rank == 0) { if (verbose) std::cout << "CPU Test Lenstool ...\t\t "; //if (verbose) printf(" Setting up lenstool using %d lenses...", runmode.n_tot_halos); fflush(stdout); convert_to_LT(lenses_SOA, runmode.n_tot_halos); //if (verbose) printf("ok\n"); struct point Grad; grid_grad_x_LT = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); grid_grad_y_LT = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); // double dlsds = 1.; double zs = 1.; double average = 0., stdev = 0.; for (int iter = 0; iter < NTIMES; ++iter) { memset(grid_grad_x_LT, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); memset(grid_grad_y_LT, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); // double t_lt = -myseconds(); #pragma omp parallel for 
for (int jj = 0; jj < runmode.nbgridcells; ++jj) for (int ii = 0; ii < runmode.nbgridcells; ++ii) { // (index < grid_dim*grid_dim) int index = jj*runmode.nbgridcells + ii; struct point image_point; image_point.x = frame->xmin + ii*dx; image_point.y = frame->ymin + jj*dy; G.nlens = runmode.n_tot_halos; #if 0 Grad = e_grad_pot(&image_point, lens); grid_grad_x_LT[index] = Grad.x; grid_grad_y_LT[index] = Grad.y; #else for (int lens = 0; lens < runmode.n_tot_halos; ++lens) { struct point Grad = e_grad_pot(&image_point, lens); // grid_grad_x_LT[index] += Grad.x; grid_grad_y_LT[index] += Grad.y; } #endif // } t_lt += myseconds(); timings[iter] = t_lt; } average_stdev(&average, &stdev); if (verbose) std::cout << " Average time = " << average << " stdev = " << stdev << std::endl; } #endif // // MPI definitions // int grid_size = runmode.nbgridcells; int loc_grid_size = runmode.nbgridcells/world_size; // double y_len = fabs(frame->ymax - frame->ymin); int y_len_loc = runmode.nbgridcells/world_size; int y_pos_loc = (int) world_rank*y_len_loc; int y_bound = y_len_loc; // // CPU SOA benchmark // type_t* grid_grad_x_cpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); type_t* grid_grad_y_cpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); // memset(grid_grad_x_cpu, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); memset(grid_grad_y_cpu, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); // if (verbose) std::cout << "CPU Test Lenstool_hpc...\t\t "; // { double average = 0., stdev = 0.; //__SSC_MARK(0x111); // start SDE tracing, note it uses 2 underscores //__itt_resume(); // start VTune, again use 2 underscores for (int iter = 0; iter < NTIMES; ++iter) { double t_1 = -myseconds(); gradient_grid_CPU(grid_grad_x_cpu, grid_grad_y_cpu, frame, lenses_SOA, runmode.n_tot_halos, runmode.nbgridcells); t_1 += myseconds(); std::cout << "CBLA...\t\t " << NTIMES << std::endl; 
timings[iter] = t_1; } //__itt_pause(); // stop VTune //__SSC_MARK(0x222); // stop SDE tracing average_stdev(&average, &stdev); if (verbose) std::cout << " Average time = " << average << " stdev = " << stdev << std::endl; } // // CPU AOS benchmark // // #ifdef __WITH_LENSTOOL_AOS convert_SOA_to_AOS(lenses_AOS, lenses_SOA, runmode.nhalos); type_t* grid_grad_x_cpu_AOS = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); type_t* grid_grad_y_cpu_AOS = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); // memset(grid_grad_x_cpu_AOS, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); memset(grid_grad_y_cpu_AOS, 0, (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); if (verbose) std::cout << "CPU Test Lenstool_hpc AOS...\t\t "; // { double average = 0., stdev = 0.; for (int iter = 0; iter < NTIMES; ++iter) { double t_1 = -myseconds(); gradient_grid_CPU_AOS(grid_grad_x_cpu_AOS, grid_grad_y_cpu_AOS, frame, &lenses_AOS[0], runmode.n_tot_halos, runmode.nbgridcells); t_1 += myseconds(); timings[iter] = t_1; } average_stdev(&average, &stdev); if (verbose) std::cout << " Average time = " << average << " stdev = " << stdev << std::endl; } #endif //if (verbose) std::cout << " Time = " << std::setprecision(15) << t_1 << std::endl; //type_t *grid_gradient_x, *grid_gradient_y; // // GPU section // #ifdef __WITH_GPU #warning "using GPUs..." // // GPU benchmark // if (verbose) std::cout << "GPU Test... 
"; fflush(stdout); // //type_t* grid_grad_x_gpu = (type_t *) malloc((int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); //type_t* grid_grad_y_gpu = (type_t *) malloc((int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); type_t* grid_grad_x_gpu; cudaMallocManaged(&grid_grad_x_gpu, (int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); type_t* grid_grad_y_gpu; cudaMallocManaged(&grid_grad_y_gpu, (int) (runmode.nbgridcells)*(runmode.nbgridcells)*sizeof(type_t)); // cudaMemset(grid_grad_x_gpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); cudaMemset(grid_grad_y_gpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); // //grid_grad_x_gpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); //grid_grad_y_gpu = (type_t *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(type_t)); // //printf("%d %d %d %d\n", runmode.n_tot_halos, grid_size, y_pos_loc, y_bound); t_2 = -myseconds(); for (int iter = 0; iter < NTIMES; ++iter) { gradient_grid_GPU_UM(grid_grad_x_gpu, grid_grad_y_gpu, frame, lenses_SOA, runmode.n_tot_halos, dx, dy, grid_size, y_bound, 0, y_pos_loc); cudaDeviceSynchronize(); } t_2 += myseconds(); t_2 /= NTIMES; // cudasafe(cudaGetLastError(), "gradient_grid_GPU_UM"); // if (verbose) std::cout << " Time " << std::setprecision(15) << t_2 << std::endl; // #endif std::ofstream myfile; #ifdef __WITH_LENSTOOL // type_t norm_x_LT = 0.; type_t norm_y_LT = 0.; type_t sum_x_LT = 0.; type_t sum_y_LT = 0.; // type_t norm_x_LT_AOS = 0.; type_t norm_y_LT_AOS = 0.; // if (world_rank == 0) { // for (int ii = 0; ii < grid_dim*grid_dim; ++ii) { type_t gx = grid_grad_x_LT[ii]; type_t gy = grid_grad_y_LT[ii]; // sum_x_LT += gx*gx; sum_y_LT += gy*gy; // type_t c_x = grid_grad_x_cpu[ii]; type_t c_y = grid_grad_y_cpu[ii]; // norm_x_LT += (gx - grid_grad_x_cpu[ii])*(gx - grid_grad_x_cpu[ii]); norm_y_LT += (gy - grid_grad_y_cpu[ii])*(gy - 
grid_grad_y_cpu[ii]); // norm_x_LT_AOS += (gx - grid_grad_x_cpu_AOS[ii])*(gx - grid_grad_x_cpu_AOS[ii]); norm_y_LT_AOS += (gy - grid_grad_y_cpu_AOS[ii])*(gy - grid_grad_y_cpu_AOS[ii]); //printf("%.15f = %.15f\n", gx, grid_grad_x_cpu[ii]); } // } #ifdef __WITH_GPU type_t norm_x_gpu_LT = 0.; type_t norm_y_gpu_LT = 0.; // for (int ii = 0; ii < grid_dim*grid_dim; ++ii) { type_t g_x = grid_grad_x_LT[ii]; type_t g_y = grid_grad_y_LT[ii]; // type_t c_x = grid_grad_x_gpu[ii]; type_t c_y = grid_grad_y_gpu[ii]; // norm_x_gpu_LT += (grid_grad_x_LT[ii] - grid_grad_x_gpu[ii])*(grid_grad_x_LT[ii] - grid_grad_x_gpu[ii]); norm_y_gpu_LT += (grid_grad_y_LT[ii] - grid_grad_y_gpu[ii])*(grid_grad_y_LT[ii] - grid_grad_y_gpu[ii]); } // #endif #endif /* #ifdef __WITH_LENSTOOL // type_t norm_x_LT = 0.; type_t norm_y_LT = 0.; // type_t sum_x_LT = 0.; type_t sum_y_LT = 0.; // for (int ii = 0; ii < grid_dim*grid_dim; ++ii) { // sum_x_LT += grid_grad_x_cpu[ii]*grid_grad_x_cpu[ii]; sum_y_LT += grid_grad_y_cpu[ii]*grid_grad_y_cpu[ii]; // norm_x_LT += (grid_grad_x_cpu[ii] - grid_grad_x_LT[ii])*(grid_grad_x_cpu[ii] - grid_grad_x_LT[ii]); norm_y_LT += (grid_grad_y_cpu[ii] - grid_grad_y_LT[ii])*(grid_grad_y_cpu[ii] - grid_grad_y_LT[ii]); // } #endif */ //type_t sum_x_cpu_2000 = 4761763143.24101; //type_t sum_y_cpu_2000 = 5412618205.81843; type_t sum_x_cpu = 0.; type_t sum_y_cpu = 0.; // #ifdef __WITH_GPU type_t norm_x_gpu = 0.; type_t norm_y_gpu = 0.; // type_t sum_x_gpu = 0.; type_t sum_y_gpu = 0.; // for (int ii = 0; ii < grid_dim*grid_dim; ++ii) { // //if (fabs(grid_grad_x_gpu[ii] - grid_grad_x_cpu[ii]) > 1e-10) // printf("Error = %.15f %.15f, %.15f\n", grid_grad_x_gpu[ii], grid_grad_x_cpu[ii], fabs(grid_grad_x_gpu[ii] - grid_grad_x_cpu[ii])); sum_x_cpu += grid_grad_x_cpu[ii]*grid_grad_x_cpu[ii]; sum_y_cpu += grid_grad_y_cpu[ii]*grid_grad_y_cpu[ii]; sum_x_gpu += grid_grad_x_gpu[ii]*grid_grad_x_gpu[ii]; sum_y_gpu += grid_grad_y_gpu[ii]*grid_grad_y_gpu[ii]; // norm_x_gpu += (grid_grad_x_cpu[ii] - 
grid_grad_x_gpu[ii])*(grid_grad_x_cpu[ii] - grid_grad_x_gpu[ii]); norm_y_gpu += (grid_grad_y_cpu[ii] - grid_grad_y_gpu[ii])*(grid_grad_y_cpu[ii] - grid_grad_y_gpu[ii]); // } //printf("norm_gpu = %f %f, sum = %f %f\n", norm_x_gpu, norm_x_gpu, sum_x_gpu, sum_y_gpu); #endif #ifdef __WITH_LENSTOOL if (verbose) std::cout << " l2 difference norm cpu-LT = " << std::setprecision(15) << norm_x_LT << " " << std::setprecision(15) << norm_y_LT << std::endl; if (verbose) std::cout << " l2 difference norm cpu-LT AOS = " << std::setprecision(15) << norm_x_LT_AOS << " " << std::setprecision(15) << norm_y_LT_AOS << std::endl; #ifdef __WITH_GPU if (verbose) std::cout << " l2 difference norm gpu-LT = " << std::setprecision(15) << norm_x_gpu_LT << " " << std::setprecision(15) << norm_y_gpu_LT << std::endl; if (verbose) std::cout << " l2 difference norm cpu-gpu = " << std::setprecision(15) << norm_x_gpu << " " << std::setprecision(15) << norm_y_gpu << std::endl; #endif #endif //if (verbose) std::cout << "Pour 2000 sum x cpu = " << std::setprecision(15) << sum_x_cpu_2000 << " sum_y_cpu = " << std::setprecision(15) << sum_y_cpu_2000 << std::endl; if (verbose) std::cout << " sum x cpu = " << std::setprecision(15) << sum_x_cpu << " sum_y_cpu = " << std::setprecision(15) << sum_y_cpu << std::endl; #ifdef __WITH_GPU if (verbose) std::cout << " sum x gpu = " << std::setprecision(15) << sum_x_gpu << " sum_y_gpu = " << std::setprecision(15) << sum_y_gpu << std::endl; #endif // if (verbose) std::cout << "Exiting..." 
<< std::endl;
 	//
 #ifdef __WITH_MPI
 	MPI_Finalize();
 #endif
 #ifdef __WITH_GPU
 	cudaFree(lenses_SOA);
 	cudaFree(frame);
 #else
 	free(lenses_SOA);
 	free(frame);
 #endif
 }
diff --git a/src/module_readParameters_GPU.cu b/src/module_readParameters_GPU.cu
index aa64531..b21a80d 100644
--- a/src/module_readParameters_GPU.cu
+++ b/src/module_readParameters_GPU.cu
@@ -1,41 +1,64 @@
 #include "grid_gradient_GPU.cuh"
 #include "gradient.hpp"
 #include "gradient_GPU.cuh"
 #include "gradient.hpp"
 #include "module_readParameters.hpp"
 extern void cudasafe( cudaError_t error, const char* message);
+// Report (but do not abort on) a failed CUDA runtime call.
 void checkCudaError(cudaError_t error)
 {
 	if(error != cudaSuccess)
 		printf("Error: %s\n", cudaGetErrorString(error));
 }
 //
 //
 //
+/**
+ * @brief Prefetch the lens arrays to GPU 0 when that device supports it.
+ *
+ * The parameter file is NOT read here (that call is commented out below), so
+ * lens_SOA is prefetched as the caller left it. Without __WITH_UM this is a no-op.
+ *
+ * @param infile      unused
+ * @param lens_SOA    lens structure whose arrays are prefetched; presumably
+ *                    allocated as managed memory by the caller -- TODO confirm
+ * @param nhalos      number of elements prefetched from each array
+ * @param n_tot_halos unused
+ * @param cosmology   unused
+ */
 void module_readParameters_PotentialSOA_noalloc_GPU(std::string infile, Potential_SOA *lens_SOA, int nhalos, int n_tot_halos, cosmo_param cosmology)
 {
-	module_readParameters_PotentialSOA_noalloc(infile, lens_SOA, nhalos, n_tot_halos, cosmology);
+	//module_readParameters_PotentialSOA_noalloc(infile, lens_SOA, nhalos, n_tot_halos, cosmology);
 #if defined(__WITH_UM)
+	// __CUDA_ARCH__ is only defined while nvcc compiles device code, so it cannot
+	// guard host-side API calls (the host pass would always drop them). Query the
+	// destination device at run time instead: cudaMemPrefetchAsync needs a device
+	// that reports concurrentManagedAccess, otherwise it fails ("invalid ordinal").
+	int concurrentManagedAccess = 0;
+	checkCudaError(cudaDeviceGetAttribute(&concurrentManagedAccess, cudaDevAttrConcurrentManagedAccess, 0));
+	if (concurrentManagedAccess)
+	{
 	// let's prefetch the data in the GPU memory
 #warning "Prefetching data on GPU"
-	checkCudaError(cudaMemPrefetchAsync(lens_SOA , sizeof(Potential_SOA), 0));
+	checkCudaError(cudaMemPrefetchAsync(lens_SOA , sizeof(Potential_SOA), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->position_x , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->position_y , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->anglecos , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->anglesin , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity_potential , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->rcore , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->rcut , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->b0 , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity_angle , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->ellipticity , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->vdisp , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->z , nhalos*sizeof(type_t), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->N_types , nhalos*sizeof(int), 0));
 	checkCudaError(cudaMemPrefetchAsync(lens_SOA->type , nhalos*sizeof(int), 0));
+	}
 #endif
 }