diff --git a/Benchmarks/ChiBenchmark/main.cpp b/Benchmarks/ChiBenchmark/main.cpp index 9eca507..96328e9 100644 --- a/Benchmarks/ChiBenchmark/main.cpp +++ b/Benchmarks/ChiBenchmark/main.cpp @@ -1,354 +1,354 @@ /** * @file main.cpp * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch) * @date October 2016 * @brief Benchmark for gradhalo function */ #include <iostream> #include <iomanip> #include <string.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <sys/stat.h> #include <unistd.h> // #include <mm_malloc.h> // #include <structure_hpc.hpp> #include "timer.h" #include "gradient.hpp" #include "chi_CPU.hpp" #include "module_cosmodistances.hpp" #include "module_readParameters.hpp" #include<omp.h> //#define __WITH_LENSTOOL 0 #ifdef __WITH_LENSTOOL #warning "linking with libtool..." #include <fonction.h> #include <constant.h> #include <dimension.h> #include <structure.h> #include <setup.hpp> #endif #ifdef __WITH_LENSTOOL struct g_mode M; struct g_pot P[NPOTFILE]; struct g_pixel imFrame, wFrame, ps, PSF; struct g_cube cubeFrame; struct g_dyn Dy; // //TV struct g_source S; struct g_image I; struct g_grille G; struct g_msgrid H; // multi-scale grid struct g_frame F; struct g_large L; struct g_cosmo C; struct g_cline CL; struct g_observ O; struct pot lens[NLMAX]; struct pot lmin[NLMAX], lmax[NLMAX], prec[NLMAX]; struct g_cosmo clmin, clmax; /*cosmological limits*/ struct galaxie smin[NFMAX], smax[NFMAX]; // limits on source parameters struct ipot ip; struct MCarlo mc; struct vfield vf; struct vfield vfmin,vfmax; // limits on velocity field parameters struct cline cl[NIMAX]; lensdata *lens_table; int block[NLMAX][NPAMAX]; /*switch for the lens optimisation*/ int cblock[NPAMAX]; /*switch for the cosmological optimisation*/ int sblock[NFMAX][NPAMAX]; /*switch for the source parameters*/ int vfblock[NPAMAX]; /*switch for the velocity field parameters*/ double excu[NLMAX][NPAMAX]; double excd[NLMAX][NPAMAX]; /* supplments tableaux de valeurs pour fonctions g pour Einasto * * Ce sont trois variables globales qu'on pourra utiliser dans toutes les fonctions du projet * */ #define CMAX 20 #define LMAX 80 float Tab1[LMAX][CMAX]; float Tab2[LMAX][CMAX]; float Tab3[LMAX][CMAX]; int nrline, ntline, flagr, flagt; long int narclet; struct point gimage[NGGMAX][NGGMAX], gsource_global[NGGMAX][NGGMAX]; struct biline radial[NMAX], tangent[NMAX]; struct galaxie arclet[NAMAX], source[NFMAX], image[NFMAX][NIMAX]; struct galaxie cimage[NFMAX]; struct pointgal gianti[NPMAX][NIMAX]; struct point SC; double elix; double alpha_e; double *v_xx; double *v_yy; double **map_p; double **tmp_p; double **map_axx; double **map_ayy; #endif void chi_bruteforce_SOA_CPU_grid_gradient(double *chi, int *error, runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, galaxy *images); int module_readCheckInput_readInput(int argc, char *argv[]) { /// check if there is a correct number of arguments, and store the name of the input file in infile char* infile; struct stat file_stat; // If we do not have 3 arguments, stop if ( argc != 3 ) { fprintf(stderr, "\nUnexpected number of arguments\n"); fprintf(stderr, "\nUSAGE:\n"); fprintf(stderr, "lenstool input_file output_directorypath [-n]\n\n"); exit(-1); } else if ( argc == 3 ) infile=argv[1]; std::ifstream ifile(infile,std::ifstream::in); // Open the file int ts = (int) time (NULL); char buffer[10]; std::stringstream ss; ss << ts; std::string trimstamp = ss.str(); // std::string outdir = argv[2]; outdir += "-"; outdir += trimstamp; std::cout << outdir << std::endl; // check whether the output directory already exists if (stat(outdir.c_str(), &file_stat) < 0){ mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH ); } else { printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]); exit(-1); } // check whether the input file exists. If it could not be opened (ifile = 0), it does not exist if(ifile){ ifile.close(); } else{ printf("The file %s does not exist, please specify a valid file name\n",infile); exit(-1); } return 0; } int main(int argc, char *argv[]) { double wallclock = myseconds(); printf("Reading parameter file at time %f s...\n", myseconds() - wallclock); // Setting Up the problem //=========================================================================================================== // This module function reads the terminal input when calling LENSTOOL and checks that it is correct // Otherwise it exits LENSTOOL module_readCheckInput_readInput(argc, argv); // This module function reads the cosmology parameters from the parameter file // Input: struct cosmologicalparameters cosmology, parameter file // Output: Initialized cosmology struct cosmo_param cosmology; // Cosmology struct to store the cosmology data from the file std::string inputFile = argv[1]; // Input file module_readParameters_readCosmology(inputFile, cosmology); // This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file. // The runmode_param stores the information of what exactly the user wants to do with lenstool. struct runmode_param runmode; module_readParameters_readRunmode(inputFile, &runmode); module_readParameters_debug_cosmology(runmode.debug, cosmology); module_readParameters_debug_runmode(runmode.debug, runmode); //=== Declaring variables struct grid_param frame; struct galaxy images[runmode.nimagestot]; struct galaxy sources[runmode.nsets]; struct Potential lenses[runmode.nhalos+runmode.npotfile-1]; struct Potential_SOA lenses_SOA_table[NTYPES]; struct Potential_SOA lenses_SOA; struct cline_param cline; struct potfile_param potfile; struct Potential potfilepotentials[runmode.npotfile]; struct potentialoptimization host_potentialoptimization[runmode.nhalos]; int nImagesSet[runmode.nsets]; // Contains the number of images in each set of imagesnano // This module function reads in the potential form and its parameters (e.g. NFW) // Input: input file // Output: Potentials and its parameters module_readParameters_Potential(inputFile, lenses, runmode.nhalos); //Converts to SOA //module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.Nlens); module_readParameters_PotentialSOA(inputFile, lenses, &lenses_SOA, runmode.nhalos); //module_readParameters_PotentialSOA_nonsorted(inputFile, lenses, &lenses_SOA_nonsorted, runmode.nhalos); module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos); //std::cerr << lenses_SOA[1].b0[0] << " " << lenses[0].b0 << std::endl; // This module function reads in the potfiles parameters // Input: input file // Output: Potentials from potfiles and its parameters if (runmode.potfile == 1 ){ module_readParameters_readpotfiles_param(inputFile, &potfile,cosmology); module_readParameters_debug_potfileparam(runmode.debug, &potfile); module_readParameters_readpotfiles(&runmode,&potfile,lenses); module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos+runmode.npotfile); } // This module function reads in the grid form and its parameters // Input: input file // Output: grid and its parameters #if 1 module_readParameters_Grid(inputFile, &frame); if (runmode.image == 1 or runmode.inverse == 1 or runmode.time > 0){ // This module function reads in the strong lensing images module_readParameters_readImages(&runmode, images, nImagesSet); //runmode.nsets = runmode.nimagestot; for(int i = 0; i < runmode.nimagestot; ++i){ images[i].dls = module_cosmodistances_objectObject(lenses[0].z, images[i].redshift, cosmology); images[i].dos = module_cosmodistances_observerObject(images[i].redshift, cosmology); images[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, images[i].redshift, cosmology); } module_readParameters_debug_image(runmode.debug, images, nImagesSet,runmode.nsets); } if (runmode.inverse == 1){ // This module function reads in the potential optimisation limits module_readParameters_limit(inputFile,host_potentialoptimization,runmode.nhalos); module_readParameters_debug_limit(runmode.debug, host_potentialoptimization[0]); } if (runmode.source == 1) { //Initialisation to default values.(Setting sources to z = 1.5 default value) for(int i = 0; i < runmode.nsets; ++i) { sources[i].redshift = 1.5; } // This module function reads in the strong lensing sources module_readParameters_readSources(&runmode, sources); //Calculating cosmoratios for(int i = 0; i < runmode.nsets; ++i) { sources[i].dls = module_cosmodistances_objectObject(lenses[0].z, sources[i].redshift, cosmology); sources[i].dos = module_cosmodistances_observerObject(sources[i].redshift, cosmology); sources[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, sources[i].redshift, cosmology); } module_readParameters_debug_source(runmode.debug, sources, runmode.nsets); } // std::cout << "--------------------------" << std::endl << std::endl; #endif - // + // Lenstool Bruteforce //=========================================================================================================== #ifdef __WITH_LENSTOOLL { printf("Calling lenstool at time %f s\n", myseconds() - wallclock); setup_lenstool(); // double chi2; double lhood0(0); int error(0); double time; if ( M.ichi2 != 0 ) { int error; //NPRINTF(stdout, "INFO: compute chires.dat\n"); readConstraints(); o_chires("chires.dat"); time = -myseconds(); error = o_chi_lhood0(&chi2, &lhood0, NULL); time += myseconds(); printf("INFO: chi2 %lf Lhood %lf\n", chi2, -0.5 * ( chi2 + lhood0 ) ); o_global_free(); } std::cout << "Lenstool 6.8.1 chi Benchmark: "; std::cout << " Chi : " << std::setprecision(15) << chi2 ; std::cout << " Time " << std::setprecision(15) << time << std::endl; o_global_free(); } #endif // Lenstool-GPU Bruteforce //=========================================================================================================== #if 0 { printf("Calling lenstoolhpc orig at time %f s\n", myseconds() - wallclock); std::cout << "LenstoolHPC dist chi Benchmark:\n "; double chi2; double time; int error; time = -myseconds(); mychi_bruteforce_SOA_CPU_grid_gradient_orig(&chi2, &error, &runmode, &lenses_SOA, &frame, nImagesSet, images); time += myseconds(); std::cout << " Chi : " << std::setprecision(15) << chi2; std::cout << " Time " << std::setprecision(15) << time << std::endl; } #endif #if 0 { //std::cout << "MylenstoolHPC chi Benchmark:\n "; printf("Calling lenstoolhpc at time %f s\n", myseconds() - wallclock); double chi2; double time; int error; time = -myseconds(); mychi_bruteforce_SOA_CPU_grid_gradient(&chi2, &error, &runmode, &lenses_SOA, &frame, nImagesSet, images); time += myseconds(); std::cout << " Chi : " << std::setprecision(15) << chi2; std::cout << " Time " << std::setprecision(15) << time << std::endl; } #endif printf("Ending execution at time %f s\n", myseconds() - wallclock); } diff --git a/Benchmarks/GradientBenchmark/main.cpp b/Benchmarks/GradientBenchmark/main.cpp index 0b1b482..1c9fb35 100644 --- a/Benchmarks/GradientBenchmark/main.cpp +++ b/Benchmarks/GradientBenchmark/main.cpp @@ -1,249 +1,250 @@ /** * @file main.cpp * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch) * @date October 2016 * @brief Benchmark for gradhalo function */ #include <iostream> #include <iomanip> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> // // #include <mm_malloc.h> //#define __WITH_LENSTOOL 0 // #ifdef __WITH_LENSTOOL #warning "linking with libtool..." #include<fonction.h> #include<constant.h> #include<dimension.h> #include<structure.h> //#include <liblenstool> #endif //#include "../../../Projects/lenstool-6.8.1/include/structure.h" //#include "structure.h" #include "structure_hpc.hpp" #include "timer.h" #include "gradient.hpp" #include "gradient_avx.hpp" #include "gradient_avx512f.hpp" #include "setup.hpp" #include <type.h> // #define NN 100000 + // // // #ifdef __WITH_LENSTOOL struct g_mode M; struct g_pot P[NPOTFILE]; struct g_pixel imFrame, wFrame, ps, PSF; struct g_cube cubeFrame; struct g_dyn Dy; // //TV struct g_source S; struct g_image I; struct g_grille G; struct g_msgrid H; // multi-scale grid struct g_frame F; struct g_large L; struct g_cosmo C; struct g_cline CL; struct g_observ O; struct pot lens[NLMAX]; struct pot lmin[NLMAX], lmax[NLMAX], prec[NLMAX]; struct g_cosmo clmin, clmax; /*cosmological limits*/ struct galaxie smin[NFMAX], smax[NFMAX]; // limits on source parameters struct ipot ip; struct MCarlo mc; struct vfield vf; struct vfield vfmin,vfmax; // limits on velocity field parameters struct cline cl[NIMAX]; lensdata *lens_table; int block[NLMAX][NPAMAX]; /*switch for the lens optimisation*/ int cblock[NPAMAX]; /*switch for the cosmological optimisation*/ int sblock[NFMAX][NPAMAX]; /*switch for the source parameters*/ int vfblock[NPAMAX]; /*switch for the velocity field parameters*/ double excu[NLMAX][NPAMAX]; double excd[NLMAX][NPAMAX]; /* supplments tableaux de valeurs pour fonctions g pour Einasto * * Ce sont trois variables globales qu'on pourra utiliser dans toutes les fonctions du projet * */ #define CMAX 20 #define LMAX 80 float Tab1[LMAX][CMAX]; float Tab2[LMAX][CMAX]; float Tab3[LMAX][CMAX]; int nrline, ntline, flagr, flagt; long int narclet; struct point gimage[NGGMAX][NGGMAX], gsource_global[NGGMAX][NGGMAX]; struct biline radial[NMAX], tangent[NMAX]; struct galaxie arclet[NAMAX], source[NFMAX], image[NFMAX][NIMAX]; struct galaxie cimage[NFMAX]; struct pointgal gianti[NPMAX][NIMAX]; struct point SC; double elix; double alpha_e; double *v_xx; double *v_yy; double **map_p; double **tmp_p; double **map_axx; double **map_ayy; #endif //struct pot lens_ref[NLMAX]; // //#include <iitnotify.h> int main() { double t0, t1, t2, t3; // //Variable creation // point image; // //Initialisation // int nlenses; type_t x, y; type_t sol_grad_x, sol_grad_y; point grad; // store the result //pot* lens; #ifdef __WITH_LENSTOOL setup_jauzac_LT(&nlenses, &image.x, &image.y, &sol_grad_x, &sol_grad_y); // struct point grad_lt, Grad; //printf("Number lenses = %d, b0 = %f\n", nlenses, lens[0].b0); // t0 = -myseconds(); for (int ii = 0; ii < NN; ++ii) { grad_lt.x = grad_lt.y = 0.; for (long int jj = 0; jj < nlenses; ++jj) { //printf("%f\n", lens[ii].b0); Grad = e_grad_pot(&image, jj); // grad_lt.x += Grad.x; grad_lt.y += Grad.y; //printf("%f %f\n", grad_lt.x, grad_lt.y); } } t0 += myseconds(); #endif // // //setup_jauzac(Potential** lens, int* nlenses, double* x, double* y, double* sol_grad_x, double* sol_grad_y) // // Potential* lens_aos; setup_jauzac(&lens_aos, &nlenses, &image.x, &image.y, &sol_grad_x, &sol_grad_y); t1 = -myseconds(); for (int ii = 0; ii < NN; ++ii) grad = module_potentialDerivatives_totalGradient(nlenses, &image, lens_aos); t1 += myseconds(); //printf("---> grad = %f %f\n", grad.x, grad.y); // // Setting up the AOS potential // point image_soa; int nlenses_soa; double x_soa, y_soa; type_t sol_grad_x_soa, sol_grad_y_soa; // Potential_SOA lens_soa; setup_jauzac_SOA(&lens_soa, &nlenses_soa, &image_soa.x, &image_soa.y, &sol_grad_x_soa, &sol_grad_y_soa); // std::cout << "Benchmark for Gradient Calculation using " << nlenses_soa << " lenses with type " << lens_soa.type[0] << ", image: " << image_soa.x << " " << image_soa.y << std::endl; // // AVX version // #if 0 point grad_soa_novec; double t21 = -myseconds(); for (int ii = 0; ii < NN; ++ii) { //grad_soa = module_potentialDerivatives_totalGradient_SOA_AVX512(&image_soa, &lens_soa, nlenses_soa); //grad_soa = module_potentialDerivatives_totalGradient_8_SOA_AVX(&image_soa, &lens_soa, nlenses_soa); // //grad_soa_avx = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, 0, nlenses_soa); grad_soa_novec = module_potentialDerivatives_totalGradient_SOA_novec(&image_soa, &lens_soa, nlenses_soa); //grad_soa = module_potentialDerivatives_totalGradient_SOA(&image_soa, &lens_soa, 0, nlenses_soa); } t21 += myseconds(); std::cout << "1" << std::endl; #endif // // no vectorization // #if 1 point grad_soa; // store the result t2 = -myseconds(); for (int ii = 0; ii < NN; ++ii) { //grad_soa = module_potentialDerivatives_totalGradient_SOA_AVX512(&image_soa, &lens_soa, nlenses_soa); //grad_soa = module_potentialDerivatives_totalGradient_8_SOA_AVX(&image_soa, &lens_soa, nlenses_soa); // //grad_soa_avx = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, 0, nlenses_soa); grad_soa = module_potentialDerivatives_totalGradient_SOA(&image_soa, &lens_soa, nlenses_soa); //grad_soa = module_potentialDerivatives_totalGradient_SOA(&image_soa, &lens_soa, 0, nlenses_soa); } t2 += myseconds(); #endif //__SSC_MARK(0x222); // // autovectorized version // point grad_soa_avx; t3 = -myseconds(); for (int ii = 0; ii < NN; ++ii) { //grad_soa_avx = module_potentialDerivatives_totalGradient_SOA_AVX512(&image_soa, &lens_soa, nlenses_soa); // grad_soa = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, nlenses_soa); // //grad_soa = module_potentialDerivatives_totalGradient_81_SOA_AVX(&image_soa, &lens_soa, nlenses_soa); //grad_soa = module_potentialDerivatives_totalGradient_81_SOA(&image_soa, &lens_soa, 0, nlenses_soa); //grad_soa = module_potentialDerivatives_totalGradient_8_SOA(&image_soa, &lens_soa, 0, nlenses_soa); #ifdef _double grad_soa_avx = module_potentialDerivatives_totalGradient_SOA_AVX(&image_soa, &lens_soa, nlenses_soa); #endif } t3 += myseconds(); // #ifdef _double std::cout << " Double calculation = " << std::endl; #else std::cout << " Float calculation = " << std::endl; #endif std::cout << " ref sol = " << std::setprecision(15) << sol_grad_x << " " << std::setprecision(15) << sol_grad_y << std::endl; // #ifdef __WITH_LENSTOOL std::cout << " Lenstool sol = " << std::setprecision(15) << grad_lt.x << " " << std::setprecision(15) << grad_lt.y << ", time = " << t0 << " s." << std::endl; #endif // std::cout << " grad = " << std::setprecision(15) << grad.x << " " << std::setprecision(15) << grad.y << ", time = " << t1 << " s., speedup = " << (double) t0/t1 << std::endl; // //std::cout << " grad novec = " << std::setprecision(15) << grad.x << " " << std::setprecision(15) << grad.y << ", time = " << t21 << " s., speedup = " << (double) t0/t21 << std::endl; // std::cout << " grad SIMD = " << std::setprecision(15) << grad_soa.x << " " << std::setprecision(15) << grad_soa.y << ", time = " << t2 << " s., speedup = " << (double) t0/t2 << std::endl; // std::cout << " grad handcoded = " << std::setprecision(15) << grad_soa_avx.x << " " << std::setprecision(15) << grad_soa_avx.y << ", time = " << t3 << " s. speedup = " << (double) t0/t3 << std::endl; // } diff --git a/Benchmarks/GradientBenchmark/setup.cpp b/Benchmarks/GradientBenchmark/setup.cpp index 2aadf99..0c1c37a 100644 --- a/Benchmarks/GradientBenchmark/setup.cpp +++ b/Benchmarks/GradientBenchmark/setup.cpp @@ -1,294 +1,295 @@ #include <iostream> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <immintrin.h> //#include "simd_math.h" //#define __WITH_LENSTOOL //#include "structure.h" #include "structure_hpc.hpp" #include <type.h> //#ifdef __WITH_LENSTOOL //#endif //#include "setup.hpp" extern int num_lenses = 95; // extern type_t x_c = -44.094528948812552; extern type_t y_c = -36.928385887965803; // //extern int lense_type[] = {81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81,81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81}; extern int lense_type[] = {8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; //extern int lense_type[] = {5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; + // extern type_t pos_x[] = {0.000000000000000, 35.000000000000000, 65.000000000000000, 130.000000000000000, -19.399999999999999, 11.025931099982012, -2.671581400009952, 8.706480300003618, -46.071898399997828, -10.335049000010734, -25.638072299983587, -21.186195800000924, 47.770210000017265, 20.889859399994865, -9.947619900012000, 28.559020299985104, -11.757235600015839, 44.290179900007189, 49.190403899989448, 36.260348400000979, -47.958373599980469, 42.325706400014852, 37.455656899984845, 35.637785899984095, 48.099282799982589, 35.578290900001157, -3.050470599984273, 3.330580999983544, 43.051886999993762, 12.070367100002324, -0.012525300009348, 0.389991100006194, 18.864752299989611, 3.278487299995037, 10.076003800004774, 11.586436500008050, 37.425482399992134, 0.486777199988778, 25.632663699981624, 64.980204499991984, 18.813228000016544, 3.462950200001255, 28.896917700002792, -21.734176000018063, -0.928577399996435, -0.731873800008366, 78.370563400014419, 39.869047099991327, 10.636794000018757, 81.825827699993738, 11.692331899990036, 65.849002200017992, 12.142102700007115, 7.184091700006435, 92.001179599988561, 9.130061900012469, 1.379771499990539, 1.090836400019363, 63.261966300009220, -26.291947899982713, 64.747063800006757, 65.182885999997012, 69.098453600009847, 63.422517399987171, 83.228941699997549, 62.814757499999601, 58.726967500001948, 109.445168699990987, 65.110011700002232, 17.873548699988238, 108.654938499989214, 4.474934500009342, 3.973924000006243, 88.196061999997639, 61.916070000008830, 87.056832499986754, 108.613092799990099, 61.803057900019461, 79.079379299996546, 75.673077399983285, 129.107267799989302, 132.913239399982729, 124.503095399984701, 143.243164500019446, 128.260958600014533, 138.446558499998758, 130.452594800015163, 134.181991500016807, 131.942247000012628, 130.527746399989439, 132.704864599988241, 118.334119300008808, 147.068493300001194, 150.936237199997493, 142.130408800003835}; extern type_t pos_y[] = {0.000000000000000, -10.000000000000000, 40.000000000000000, 80.000000000000000, -21.699999999999999, -59.119920000003390, -58.477319999988708, -50.961599999999407, -45.297360000006393, -42.065999999999804, -46.933559999993690, -46.580759999989141, -37.085399999989477, -38.415960000011751, -38.951640000004772, -33.509519999998361, -35.354879999999866, -22.304880000001504, -22.674599999999145, -9.754919999988942, -22.953600000010965, -21.260159999994244, -20.612880000001610, -18.947520000006079, -7.068240000003811, -1.161000000010404, -1.597679999997581, -1.477079999995112, 5.219999999994229, -3.917879999997353, 0.001799999995455, -2.880720000001702, -0.453960000010056, 2.791800000011335, 1.789919999993117, 0.972720000009986, 2.251080000010575, 20.615760000009686, 20.495160000007218, 19.365120000000502, 16.301160000006121, 17.766359999990300, 18.265679999993267, 21.366720000011696, 32.603399999999283, 21.648600000006013, 23.630759999994666, 22.818239999989487, 21.551399999995624, 22.820040000010522, 25.769519999997215, 33.740639999987820, 25.422120000004611, 23.979960000008305, 33.112080000009314, 33.039720000007833, 32.423399999993308, 33.522119999992128, 33.943680000007248, 39.039480000008098, 33.088679999991655, 36.841680000006249, 40.110120000011307, 36.187200000006214, 43.166519999994080, 37.808640000000082, 41.374079999997093, 42.308279999991782, 39.722400000007951, 43.650719999999410, 45.423719999999435, 47.048760000009793, 48.891600000007429, 45.961559999992119, 53.340119999990065, 47.024999999987926, 46.107720000011909, 52.768080000006989, 58.089600000010932, 62.472239999991075, 77.337719999999877, 72.065520000009542, 74.942999999998960, 76.529879999998229, 77.375160000002552, 85.891679999997450, 83.807999999999083, 87.973920000004568, 87.793919999998593, 95.139359999993189, 95.914439999995693, 101.069640000000049, 101.447279999987927, 107.222760000001927, 106.741440000004673}; extern type_t theta[] = {1.047197551196598, 1.047197551196598, 1.047197551196598, 1.047197551196598, -0.698131700797732, -1.408480706359424, 0.359537825910832, 0.027925268031909, -0.991347015132779, -0.055850536063819, -0.991347015132779, 0.036651914291881, 0.687659725285766, 0.561996019142174, 0.907571211037051, 0.054105206811824, 0.202458193231342, -1.343903524035634, 0.420624349730633, 0.246091424531200, 0.150098315671512, -0.521853446346304, -0.040142572795870, 0.980875039620813, -0.884881930761125, -1.509709802975095, 0.666715774261834, -0.514872129338327, 1.174606586592184, -0.282743338823081, 1.457349925415265, 0.260054058547155, -1.164134611080218, -0.654498469497874, 1.038470904936626, -0.453785605518526, -0.886627260013119, -0.003490658503989, 1.549852375770965, 0.134390352403563, 0.293215314335047, -1.453859266911276, 0.109955742875643, 0.823795406941324, -0.349065850398866, 1.096066770252439, -0.527089434102287, -0.766199541625511, -1.186823891356144, 1.368338133563554, 0.748746249105567, 0.066322511575785, -0.651007810993885, -1.052433538952581, 0.256563400043166, 1.321214243759707, 1.110029404268394, 0.794124809657420, 0.134390352403563, -1.132718684544320, -0.961676417848876, -1.078613477732496, -1.530653753999027, -0.191986217719376, -0.127409035395586, -0.366519142918809, -0.815068760681352, -1.469567230179226, 0.026179938779915, 1.448623279155294, -0.146607657167524, 1.118756050528365, 0.837758040957278, -0.422369678982628, 1.024508270920671, -0.464257581030492, 0.226892802759263, 1.364847475059566, 0.249582083035189, -1.115265392024376, 1.418952681871390, -0.031415926535898, -1.127482696788337, 1.083849465488479, -0.047123889803847, -0.855211333477221, -0.867428638241182, 0.841248699461267, 0.289724655831059, 1.075122819228507, 0.099483767363677, -1.197295866868110, -0.534070751110265, 0.481710873550435, -0.174532925199433}; // extern type_t epot[] = {0.075426688904937, 0.075426688904937, 0.075426688904937, 0.075426688904937, 0.116562483442831, 0.344322344322344,0.075409836065574, 0.034285714285715, 0.158045977011495, 0.064297800338409, 0.180914512922465, 0.085616438356165, 0.449781659388646, 0.198529411764706, 0.179611650485437, 0.160975609756097, 0.492772667542707, 0.216589861751152, 0.031496062992126, 0.043407043407043, 0.376311844077961, 0.157622739018088, 0.396428571428571, 0.373650107991361, 0.241081081081081, 0.364293085655315, 0.528824833702883, 0.317901234567901, 0.338345864661654, 0.129963898916967, 0.175792507204611, 0.151898734177215, 0.059829059829059, 0.208333333333333, 0.340163934426229, 0.590493601462523, 0.039062500000001, 0.015317286652079, 0.126005361930295, 0.123473541383989, 0.308584686774942, 0.200913242009132, 0.035587188612100, 0.096280087527352, 0.167197452229299, 0.109909909909910, 0.104109589041096, 0.051409618573797, 0.087431693989071, 0.176470588235294, 0.097178683385580, 0.045769764216366, 0.074235807860262, 0.131944444444444, 0.101449275362319, 0.285714285714286, 0.367619047619048, 0.239263803680982, 0.132132132132132, 0.318295739348371, 0.078817733990148, 0.108695652173913, 0.045143638850889, 0.381165919282511, 0.099236641221374,0.240437158469945, 0.334513274336283, 0.103594080338266, 0.186246418338109, 0.328918322295806, 0.098844672657253, 0.024911032028470, 0.102719033232628, 0.462857142857143, 0.039920159680638, 0.403565640194490, 0.023391812865496, 0.315315315315315, 0.274418604651163, 0.161987041036717, 0.137254901960784, 0.044673539518900, 0.057902973395931, 0.160283687943262, 0.122340425531915, 0.520395550061805, 0.021645021645023, 0.094555873925501, 0.142045454545455, 0.145907473309608, 0.194915254237288, 0.282904689863843, 0.450331125827815, 0.135245901639344, 0.340425531914894}; // extern type_t rcore[] = {10.000000000000000, 5.000000000000000, 10.000000000000000, 10.000000000000000, 1.000000000000000, 0.034052339117993, 0.030349179184560, 0.046574365790645, 0.038383710030628, 0.068256961228370, 0.020804005013518, 0.028193367328818, 0.055481341027049, 0.039097316478764, 0.031633478431847, 0.039641214329569, 0.048322227956496, 0.041128886304460, 0.028585576136945, 0.080565099864725, 0.039097316478764, 0.027425019798032, 0.034367421532497, 0.025594517675956, 0.049905332073874, 0.073815395284787, 0.052984269343359, 0.048322227956496,0.067320440611066, 0.044683475951999, 0.067320440611066, 0.022811137889776, 0.036320092822069, 0.039277781492078, 0.037857065785625, 0.021288592541707, 0.024668738981743, 0.069206510145232, 0.053474526751449, 0.048322227956496, 0.025359865727459, 0.029117021675657, 0.031199450836488, 0.051067776588639, 0.099116726380952, 0.029386438160114, 0.050833141468429, 0.034367421532497, 0.021784467563449, 0.027934888983489, 0.044273815382088, 0.052498506642202, 0.035821762466628, 0.013432193817836, 0.052257297914513, 0.037337647370553, 0.023342477548558, 0.043266020305102, 0.039641214329569, 0.061965070730691, 0.021093417552129, 0.028717521161380, 0.051067776588639, 0.032669835654398, 0.052498506642202, 0.017707080013958, 0.037509990120305, 0.030349179184560, 0.013936282772780, 0.028454237344829, 0.057829168758607, 0.036825355643309, 0.040192678553178, 0.029522079795690, 0.056512816167846, 0.034685419368686, 0.023886193697910, 0.035821762466628, 0.038738870126742, 0.034685419368686, 0.083588580432868, 0.032221588704875, 0.042672388251826, 0.052017197440259, 0.033124318340140, 0.048100207486392, 0.026677611870563, 0.023996447358569, 0.016000993535772, 0.057563467863253, 0.034526054342700, 0.045935341535597, 0.039459079494046, 0.031488135800734, 0.039277781492078}; // extern type_t rcut[] = {500.000000000000000, 500.000000000000000, 500.000000000000000, 500.000000000000000, 150.000000000000000, 5.107850867699009, 4.552376877684008, 6.986154868596703, 5.757556504594200, 10.238544184255536, 3.120600752027715, 4.229005099322641, 8.322201154057367, 5.864597471814614, 4.745021764777110, 5.946182149435385, 7.248334193474441, 6.169332945669035, 4.287836420541705, 12.084764979708812, 5.864597471814614, 4.113752969704730, 5.155113229874521, 3.839177651393335, 7.485799811081068, 11.072309292717980, 7.947640401503818, 7.248334193474441, 10.098066091659968, 6.702521392799788, 10.098066091659968, 3.421670683466420, 5.448013923310371, 5.891667223811720, 5.678559867843796, 3.193288881256013, 3.700310847261404, 10.380976521784758, 8.021179012717386, 7.248334193474441, 3.803979859118900, 4.367553251348570, 4.679917625473273, 7.660166488295854, 14.867508957142872, 4.407965724017149, 7.624971220264309, 5.155113229874521, 3.267670134517328, 4.190233347523285, 6.641072307313197, 7.874775996330366, 5.373264369994223, 2.014829072675421, 7.838594687177010, 5.600647105583010, 3.501371632283681, 6.489903045765340, 5.946182149435385, 9.294760609603639, 3.164012632819346, 4.307628174206992, 7.660166488295854, 4.900475348159637, 7.874775996330366, 2.656062002093733, 5.626498518045686, 4.552376877684008, 2.090442415917007, 4.268135601724389, 8.674375313791064, 5.523803346496345, 6.028901782976731, 4.428311969353570, 8.476922425176864, 5.202812905302965, 3.582929054686518, 5.373264369994223, 5.810830519011283, 5.202812905302965, 12.538287064930271, 4.833238305731319, 6.400858237773873, 7.802579616038900, 4.968647751020976, 7.215031122958762, 4.001641780584524, 3.599467103785322, 2.400149030365801, 8.634520179487971, 5.178908151405005, 6.890301230339541, 5.918861924106924, 4.723220370110043, 5.891667223811720}; // type_t b0[] = {27.686284800000006, 15.573535200000000, 21.197311800000001, 43.259820000000005, 1.401618168000000, 0.701887044377384, 0.625557486765266, 0.959991142907221, 0.791165290944736, 1.406912946824478, 0.428812292146906, 0.581121878203467, 1.143581776765294, 0.805874151883638, 0.652029471542907, 0.817084978065710, 0.996018089699853, 0.847748882881212, 0.589206088811179, 1.660608383702515, 0.805874151883638, 0.565284693698478, 0.708381525237709, 0.527554371568815, 1.028649042482203, 1.521483427216619, 1.092112118320076, 0.996018089699853, 1.387609377521935, 0.921016108754891, 1.387609377521935, 0.470183328577265, 0.748629999074688, 0.809593897959512, 0.780310096202241, 0.438800613557777, 0.508472214857728, 1.426485055525682, 1.102217307333858, 0.996018089699853, 0.522717724018210, 0.600160247852116, 0.643083291809730, 1.052609356688346, 2.042994635018428, 0.605713462253246, 1.047773056002485, 0.708381525237709, 0.449021592862168, 0.575794120800079, 0.912572182315177, 1.082099574236042, 0.738358410415477, 0.276864469886694, 1.077127778308336, 0.769603840315186, 0.481135305220354, 0.891799503367263, 0.817084978065710, 1.277224457300750, 0.434777665351592, 0.591925740547885, 1.052609356688346, 0.673390873628514, 1.082099574236042, 0.364978453094982, 0.773156170239529, 0.625557486765266, 0.287254754837836, 0.586498932739152, 1.191975217859029, 0.759044479765867, 0.828451762375228, 0.608509308563808, 1.164842549348754, 0.714936098789562, 0.492342386170850, 0.738358410415477, 0.798485853249674, 0.714936098789562, 1.722928385636979, 0.664151604471083, 0.879563555467234, 1.072178825707933, 0.682758673823120, 0.991441802267353, 0.549879116438807, 0.494614935370580, 0.329812587059260, 1.186498589205643, 0.711651265795296, 0.946819570637053, 0.813330813603161, 0.649033668246664, 0.80959389795951}; // extern type_t grad_x = -77.124413458712169; extern type_t grad_y = -46.092652444451353; // // // /**** usefull functions for PIEMD profile : see old lenstool ****/ /** I*w,v=0.5 Kassiola & Kovner, 1993 PIEMD, paragraph 4.1 * * Global variables used : * - none */ /** @brief This module function calculates profile depended information like the impactparameter b0 and the potential ellipticity epot * * @param lens: mass distribution for which to calculate parameters */ void module_readParameters_calculatePotentialparameter(Potential *lens) { //std::cout << "module_readParameters_calculatePotentialparameter..." << std::endl; switch (lens->type) { case(5): /*Elliptical Isothermal Sphere*/ //impact parameter b0 lens->b0 = 4* pi_c2 * lens->vdisp * lens->vdisp ; //ellipticity_potential lens->ellipticity_potential = lens->ellipticity/3 ; break; case(8): /* PIEMD */ //impact parameter b0 lens->b0 = 6.*pi_c2 * lens->vdisp * lens->vdisp; //ellipticity_parameter if ( lens->ellipticity == 0. && lens->ellipticity_potential != 0. ){ // emass is (a2-b2)/(a2+b2) lens->ellipticity = 2.*lens->ellipticity_potential / (1. + lens->ellipticity_potential * lens->ellipticity_potential); //printf("1 : %f %f \n",lens->ellipticity,lens->ellipticity_potential); } else if ( lens->ellipticity == 0. && lens->ellipticity_potential == 0. ){ lens->ellipticity_potential = 0.00001; //printf("2 : %f %f \n",lens->ellipticity,lens->ellipticity_potential); } else{ // epot is (a-b)/(a+b) lens->ellipticity_potential = (1. - sqrt(1 - lens->ellipticity * lens->ellipticity)) / lens->ellipticity; //printf("3 : %f %f \n",lens->ellipticity,lens->ellipticity_potential); } break; default: std::cout << "ERROR: LENSPARA profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl; //printf( "ERROR: LENSPARA profil type of clump %s unknown : %d\n",lens->name, lens->type); break; }; } // // // void module_readParameters_calculatePotentialparameter_SOA(Potential_SOA *lens, int ii) { //std::cout << "module_readParameters_calculatePotentialparameter..." << std::endl; switch (lens->type[ii]) { case(5): /*Elliptical Isothermal Sphere*/ //impact parameter b0 lens->b0[ii] = 4* pi_c2 * lens->vdisp[ii] * lens->vdisp[ii] ; //ellipticity_potential lens->ellipticity_potential[ii] = lens->ellipticity[ii]/3 ; break; case(8): /* PIEMD */ //impact parameter b0 lens->b0[ii] = 6.*pi_c2 * lens->vdisp[ii] * lens->vdisp[ii]; //ellipticity_parameter if ( lens->ellipticity[ii] == 0. && lens->ellipticity_potential[ii] != 0. ){ // emass is (a2-b2)/(a2+b2) lens->ellipticity[ii] = 2.*lens->ellipticity_potential[ii] / (1. + lens->ellipticity_potential[ii] * lens->ellipticity_potential[ii]); //printf("1 : %f %f \n",lens->ellipticity[ii],lens->ellipticity_potential[ii]); } else if ( lens->ellipticity[ii] == 0. && lens->ellipticity_potential[ii] == 0. ){ lens->ellipticity_potential[ii] = 0.00001; //printf("2 : %f %f \n",lens->ellipticity[ii],lens->ellipticity_potential[ii]); } else{ // epot is (a-b)/(a+b) lens->ellipticity_potential[ii] = (1. - sqrt(1 - lens->ellipticity[ii] * lens->ellipticity[ii])) / lens->ellipticity[ii]; //printf("3 : %f %f \n",lens->ellipticity[ii],lens->ellipticity_potential[ii]); } break; default: std::cout << "ERROR: LENSPARA profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl; //printf( "ERROR: LENSPARA profil type of clump %s unknown : %d\n",lens->name, lens->type); break; }; } // // // void setup_jauzac_SOA(Potential_SOA *lens_soa, int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y) { *nlenses = num_lenses; *sol_grad_x = grad_x; *sol_grad_y = grad_y; *x = x_c; *y = y_c; // //Potential_SOA lens_soa // for (int i = 0; i < *nlenses; ++i) { lens_soa->type = (int*) malloc(*nlenses*sizeof(int)); lens_soa->vdisp = (type_t*) malloc(*nlenses*sizeof(type_t)); // lens_soa->position_x = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->position_y = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->weight = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->b0 = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->vdisp = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->ellipticity_angle = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->anglecos = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->anglesin = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->ellipticity = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->ellipticity_potential = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->rcore = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->rcut = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->rscale = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->exponent = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->alpha = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->einasto_kappacritic = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->z = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->mag = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->lum = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->theta = (type_t*) malloc(*nlenses*sizeof(type_t)); lens_soa->sigma = (type_t*) malloc(*nlenses*sizeof(type_t)); } // for (int i = 0; i < *nlenses; ++i) { //ilens = &lens[i]; // lens_soa->vdisp[i] = 1.; lens_soa->position_x[i] = pos_x[i]; lens_soa->position_y[i] = pos_y[i]; lens_soa->type[i] = lense_type[i]; lens_soa->ellipticity[i] = 0.11; lens_soa->ellipticity_potential[i] = epot[i]; lens_soa->ellipticity_angle[i] = theta[i]; lens_soa->anglecos[i] = cos(theta[i]); lens_soa->anglesin[i] = sin(theta[i]); lens_soa->rcut[i] = rcut[i]; lens_soa->rcore[i] = rcore[i]; lens_soa->b0[i] = b0[i]; lens_soa->weight[i] = 0; lens_soa->rscale[i] = 0; lens_soa->exponent[i] = 0; lens_soa->alpha[i] = 0.; lens_soa->einasto_kappacritic[i] = 0; lens_soa->z[i] = 0.4; } } void setup_jauzac(Potential** lens, int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y) { *nlenses = num_lenses; *sol_grad_x = grad_x; *sol_grad_y = grad_y; *x = x_c; *y = y_c; // *lens = (Potential*) malloc(sizeof(Potential)*(*nlenses)); // for (int i = 0; i < *nlenses; ++i) { //ilens = &lens[i]; // (*lens)[i].position.x = pos_x[i]; (*lens)[i].position.y = pos_y[i]; // (*lens)[i].vdisp = 1.; (*lens)[i].type = lense_type[i]; (*lens)[i].ellipticity = 0.11; (*lens)[i].ellipticity_potential = epot[i]; (*lens)[i].ellipticity_angle = theta[i]; (*lens)[i].rcut = rcut[i]; (*lens)[i].rcore = rcore[i]; (*lens)[i].b0 = b0[i]; (*lens)[i].weight = 0; (*lens)[i].rscale = 0; (*lens)[i].exponent = 0; (*lens)[i].alpha = 0.; (*lens)[i].einasto_kappacritic = 0; (*lens)[i].z = 0.4; } std::cout << "Setup done..." << std::endl; } #ifdef __WITH_LENSTOOL extern struct pot lens[NLMAX]; void //setup_jauzac_LT(struct pot** lens, int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y) setup_jauzac_LT( int* nlenses, type_t* x, type_t* y, type_t* sol_grad_x, type_t* sol_grad_y) { *nlenses = num_lenses; *sol_grad_x = grad_x; *sol_grad_y = grad_y; *x = x_c; *y = y_c; // //*lens = (struct pot*) malloc(sizeof(struct pot)*(*nlenses)); // for (int i = 0; i < *nlenses; ++i) { // lens[i].C.x = pos_x[i]; lens[i].C.y = pos_y[i]; // lens[i].sigma = 1.; // disp if (lense_type[i] == 5) lens[i].type = 1; else lens[i].type = lense_type[i]; lens[i].emass = 0.11; lens[i].epot = epot[i]; lens[i].theta = theta[i]; lens[i].rcut = rcut[i]; lens[i].rc = rcore[i]; lens[i].b0 = b0[i]; lens[i].masse = 0; // weight //lens[i].rc = 0; // rscale // (&lens)[i].exponent = 0; lens[i].alpha = 0.; // (&lens)[i].einasto_kappacritic = 0; lens[i].z = 0.4; } std::cout << "Setup done..." << std::endl; } #endif diff --git a/src/gradient.cpp b/src/gradient.cpp index cfccd6b..5cd12be 100644 --- a/src/gradient.cpp +++ b/src/gradient.cpp @@ -1,967 +1,965 @@ #include <iostream> #include <iomanip> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <immintrin.h> #include <map> /* #ifdef __AVX__ #include "simd_math_avx.h" #endif #ifdef __AVX512F__ #include "simd_math_avx512f.h" #endif */ #include "structure_hpc.hpp" #include "gradient.hpp" #include "utils.hpp" //#include "iacaMarks.h" // // // /**@brief Return the gradient of the projected lens potential for one clump * !!! You have to multiply by dlsds to obtain the true gradient * for the expressions, see the papers : JP Kneib & P Natarajan, Cluster Lenses, The Astronomy and Astrophysics Review (2011) for 1 and 2 * and JP Kneib PhD (1993) for 3 * * @param pImage point where the result is computed in the lens plane * @param lens mass distribution */ // /// Useful functions // complex piemd_1derivatives_ci05(type_t x, type_t y, type_t eps, type_t rc) { type_t sqe, cx1, cxro, cyro, rem2; complex zci, znum, zden, zis, zres; type_t norm; // //std::cout << "piemd_lderivatives" << std::endl; sqe = sqrt(eps); cx1 = (1. - eps) / (1. + eps); cxro = (1. + eps) * (1. + eps); cyro = (1. - eps) * (1. - eps); // rem2 = x * x / cxro + y * y / cyro; /*zci=cpx(0.,-0.5*(1.-eps*eps)/sqe); znum=cpx(cx1*x,(2.*sqe*sqrt(rc*rc+rem2)-y/cx1)); zden=cpx(x,(2.*rc*sqe-y)); zis=pcpx(zci,lncpx(dcpx(znum,zden))); zres=pcpxflt(zis,b0);*/ // --> optimized code zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; znum.re = cx1 * x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; zden.re = x; zden.im = 2.*rc * sqe - y; norm = zden.re * zden.re + zden.im * zden.im; // zis = znum/zden zis.re = (znum.re * zden.re + znum.im * zden.im) / norm; zis.im = (znum.im * zden.re - znum.re * zden.im) / norm; norm = zis.re; zis.re = log(sqrt(norm * norm + zis.im * zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres.re = zci.re * zis.re - zci.im * zis.im; // Re( zci*ln(zis) ) zres.im = zci.im * zis.re + zis.im * zci.re; // Im( zci*ln(zis) ) // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // return(zres); } // //// changes the coordinates of point P into a new basis (rotation of angle theta) //// y' y x' //// * | / //// * | / theta //// * | / //// *|--------->x /* inline struct point rotateCoordinateSystem(struct point P, double theta) { struct point Q; Q.x = P.x*cos(theta) + P.y*sin(theta); Q.y = P.y*cos(theta) - P.x*sin(theta); return(Q); } */ // // struct point grad_halo(const struct point *pImage, const struct Potential *lens) { struct point true_coord, true_coord_rot, result; type_t X, Y, R, angular_deviation, u; complex zis; // result.x = result.y = 0.; // /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord.x = pImage->x - lens->position.x; true_coord.y = pImage->y - lens->position.y; //printf("x, y = %f, %f\n", lens->position.x, lens->position.y); // true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle); // switch (lens->type) { case(5): /*Elliptical Isothermal Sphere*/ /*rotation of the coordiante axes to match the potential axes*/ //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle); // R=sqrt(true_coord_rot.x*true_coord_rot.x*(1 - lens->ellipticity_potential) + true_coord_rot.y*true_coord_rot.y*(1 + lens->ellipticity_potential)); //ellippot = ellipmass/3 result.x = (1 - lens->ellipticity_potential)*lens->b0*true_coord_rot.x/(R); result.y = (1 + lens->ellipticity_potential)*lens->b0*true_coord_rot.y/(R); break; case(8): /* PIEMD */ /*rotation of the coordiante axes to match the potential axes*/ /*Doing something....*/ complex zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore); // result.x = lens->b0*zis.re; result.y = lens->b0*zis.im; break; case(81): //PIEMD Kassiola & Kovner,1993 I0.5c-I0.5cut type_t t05; if ( lens->ellipticity_potential > 2E-4 ) { //printf("1 "); t05 = lens->b0*lens->rcut/(lens->rcut - lens->rcore); complex zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore); complex zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcut); result.x = t05 * (zis.re - zis_cut.re); result.y = t05 * (zis.im - zis_cut.im); //printf(" g = %f %f\n", result.x, result.y); } else if (( u = true_coord_rot.x*true_coord_rot.x + true_coord_rot.y*true_coord_rot.y) > 0. ) { //printf("2 "); // Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05 X = lens->rcore; Y = lens->rcut; t05 = sqrt(u + X * X) - X - sqrt(u + Y * Y) + Y; // Faster and equiv to Elliasdottir (see Golse PhD) t05 *= lens->b0 * Y / (Y - X) / u; // 1/u because t05/sqrt(u) and normalised Q/sqrt(u) result.x = t05*true_coord_rot.x; result.y = t05*true_coord_rot.y; } else { //printf("3 "); result.x = 0.; result.y = 0.; } break; default: std::cout << "ERROR: Grad 1 profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl; break; }; result = rotateCoordinateSystem(result, -lens->ellipticity_angle); //printf(" rot grad = %f %f\n", result.x, result.y); return result; } // // // struct point module_potentialDerivatives_totalGradient(const int nhalos, const struct point* pImage, const struct Potential* lens) { struct point grad, clumpgrad; // grad.x = 0; grad.y = 0; // for(int i = 0; i < nhalos; i++) { clumpgrad = grad_halo(pImage, &lens[i]); //compute gradient for each clump separately //std::cout << clumpgrad.x << " " << clumpgrad.y << std::endl; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) { // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; } } // return(grad); } // // SOA versions, vectorizable // struct point module_potentialDerivatives_totalGradient_5_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA begins\n"); // struct point grad, result; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t ell_pot = lens->ellipticity_potential[i]; // type_t R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - ell_pot) + true_coord_rotation.y*true_coord_rotation.y*(1 + ell_pot)); // result.x = (1 - ell_pot)*lens->b0[i]*true_coord_rotation.x/R; result.y = (1 + ell_pot)*lens->b0[i]*true_coord_rotation.y/R; // result = rotateCoordinateSystem(result, -lens->ellipticity_angle[i]); // grad.x += result.x; grad.y += result.y; } return grad; } // // // struct point module_potentialDerivatives_totalGradient_5_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point grad, result; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { - // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t ell_pot = lens->ellipticity_potential[i]; // type_t R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot)); // result.x = (1 - ell_pot)*lens->b0[i]*x/R; result.y = (1 + ell_pot)*lens->b0[i]*y/R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; //grad.x = x/R; //grad.y = y/R; } return grad; } struct point module_potentialDerivatives_totalGradient_5_SOA_print(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int index) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point grad, result; std::ofstream myfile; grad.x = 0; grad.y = 0; #ifdef _double std::string name = "Double_"; #else std::string name = "Float_"; #endif for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //// /* myfile.open (name + "pImage->x_1.txt", std::ios_base::app); myfile << index << " " << pImage->x << std::setprecision(7) << " " << std::endl; myfile.close(); /** myfile.open (name + "pImage->y_1.txt", std::ios_base::app); myfile << index << " " << pImage->y << std::setprecision(7) << " " << std::endl; myfile.close(); */ /* myfile.open (name + "true_coord.x_2.txt", std::ios_base::app); myfile << index << " " << true_coord.x << std::setprecision(7) << " " << std::endl; myfile.close(); /* myfile.open (name + "true_coord.y_2.txt", std::ios_base::app); myfile << index << " " << true_coord.y << std::setprecision(7) << " " << std::endl; myfile.close(); */// type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; //// myfile.open (name + "lens->anglecos[i]_3.txt", std::ios_base::app); myfile << index << " " << lens->anglecos[i] << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "lens->anglesin[i]_3.txt", std::ios_base::app); myfile << index << " " << lens->anglesin[i] << std::setprecision(7) << " " << std::endl; myfile.close(); //// type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; //// /* myfile.open (name + "x_4.txt", std::ios_base::app); myfile << index << " " << x << std::setprecision(7) << " " << std::endl; myfile.close(); /* myfile.open (name + "y_4.txt", std::ios_base::app); myfile << index << " " << y << std::setprecision(7) << " " << std::endl; myfile.close(); *//// type_t ell_pot = lens->ellipticity_potential[i]; // type_t R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot)); // result.x = (1 - ell_pot)*lens->b0[i]*x/R; result.y = (1 + ell_pot)*lens->b0[i]*y/R; //// /* myfile.open (name + "ell_pot_5.txt", std::ios_base::app); myfile << index << " " << ell_pot << std::setprecision(7) << " " << std::endl; myfile.close(); // myfile.open (name + "R_5.txt", std::ios_base::app); myfile << index << " " << R << std::setprecision(7) << " " << std::endl; myfile.close(); myfile.open (name + "x_6.txt", std::ios_base::app); myfile << index << " " << x << std::setprecision(7) << " " << std::endl; myfile.close(); /* myfile.open (name + "y_6.txt", std::ios_base::app); myfile << index << " " << y << std::setprecision(7) << " " << std::endl; myfile.close(); */// grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; //// /* myfile.open (name + "grad.x_7.txt", std::ios_base::app); myfile << index << " " << grad.x << std::setprecision(7) << " " << std::endl; myfile.close(); /* myfile.open (name + "grad.y_7.txt", std::ios_base::app); myfile << index << " " << grad.y << std::setprecision(7) << " " << std::endl; myfile.close(); *//// } return grad; } - - // // // struct point module_potentialDerivatives_totalGradient_5_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); // struct point grad, result; grad.x = 0; grad.y = 0; #pragma novec for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; //: type_t R = sqrt(x*x*(1 - lens->ellipticity[i]/3.) + y*y*(1 + lens->ellipticity[i]/3.)); + result.x = (1 - lens->ellipticity[i]/3.)*lens->b0[i]*x/R; result.y = (1 + lens->ellipticity[i]/3.)*lens->b0[i]*y/R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; } return grad; } // // // struct point module_potentialDerivatives_totalGradient_8_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // //@@printf("image_x = %f image_y = %f\n", pImage->x, pImage->y); true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //printf("x = %f y = %f\n", true_coord.x, true_coord.y); /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t x = true_coord_rot.x; type_t y = true_coord_rot.y; //@@printf("x = %f y = %f\n", x, y); type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; // //std::cout << "piemd_lderivatives" << std::endl; // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; //@@printf("zci = %f %f\n", zci.re, zci.im); // znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden //@@printf("norm = %f\n", norm); // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; //@@printf("zis = %f %f\n", zis.re, zis.im); norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); //@@printf("y,x = %f %f\n", zis.im, norm); // norm = zis.re; zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // //@@printf("zres: %f %f\n", zres.re, zres.im); // zis.re = zres.re; zis.im = zres.im; // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // clumpgrad.x = lens->b0[i]*clumpgrad.x; clumpgrad.y = lens->b0[i]*clumpgrad.y; // //clumpgrad.x = lens->b0[i]*zis.re; //clumpgrad.y = lens->b0[i]*zis.im; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) //{ // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; //@@printf("grad: %f %f\n", grad.x, grad.y); //@@std::cout << "grad.x = " << grad.x << " grad.y = " << grad.y << std::endl; //} } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_8_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins"); //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl; // // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; //printf("%d %d\n", shalos, nhalos); // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; complex zis; type_t b0 = lens->b0[i]; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // KERNEL(rc, zres) // grad.x += b0*(zres.re*cose - zres.im*sine); grad.y += b0*(zres.im*cose + zres.re*sine); // } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_8_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens , int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_8_SOA_v2 begins"); //std::cout << "# module_potentialDerivatives_totalGradient_8_SOA_v2 begins" << std::endl; // // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; //printf("%d %d\n", shalos, nhalos); // #pragma novector for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; complex zis; type_t b0 = lens->b0[i]; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // KERNEL(rc, zres) // grad.x += b0*(zres.re*cose - zres.im*sine); grad.y += b0*(zres.im*cose + zres.re*sine); // } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // type_t x = true_coord_rot.x; type_t y = true_coord_rot.y; type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; //} } //IACA_END; // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA_v2(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation grad.x += (zis.re*cose - zis.im*sine); grad.y += (zis.im*cose + zis.re*sine); // } // return(grad); } // // // struct point module_potentialDerivatives_totalGradient_81_SOA_v2_novec(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 struct point grad, clumpgrad; grad.x = 0; grad.y = 0; #pragma novector for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation grad.x += (zis.re*cose - zis.im*sine); grad.y += (zis.im*cose + zis.re*sine); // } // return(grad); } // // // typedef struct point (*halo_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); halo_func_t halo_func[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_v2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct point module_potentialDerivatives_totalGradient_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; // //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); //return; /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); */ //quicksort(lens_type, nhalos); // while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << " " << " func = " << halo_func[lens_type] << std::endl; // clumpgrad = (*halo_func[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } typedef struct point (*halo_func_t_novec) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); halo_func_t_novec halo_func_novec[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_v2_novec, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_v2_novec, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_v2_novec, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct point module_potentialDerivatives_totalGradient_SOA_novec(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; // //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); //return; /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); */ //quicksort(lens_type, nhalos); // while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; // clumpgrad = (*halo_func_novec[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } diff --git a/src/gradient_GPU.cu b/src/gradient_GPU.cu index a21248d..ddfecc5 100644 --- a/src/gradient_GPU.cu +++ b/src/gradient_GPU.cu @@ -1,946 +1,949 @@ #include <fstream> #include "grid_gradient_GPU.cuh" #include "gradient_GPU.cuh" #include "gradient.hpp" #define BLOCK_SIZE_X 8 #define BLOCK_SIZE_Y 16 //#define ROT #define _SHARED_MEM #ifdef _SHARED_MEM #define SHARED __shared__ #warning "shared memory" extern __shared__ type_t shared[]; #else #define SHARED #endif #define Nx 1 #define Ny 0 extern "C" { double myseconds(); } void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos); void module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos); void calculate_cossin_values(type_t *theta_cos, type_t *theta_sin, type_t *angles, int nhalos ){ for(int i = 0 ; i < nhalos; i++) { theta_cos[i] = cos(angles[i]); theta_sin[i] = sin(angles[i]); } } __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_cur(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad, image_point; // grad.x = 0; grad.y = 0; // int col = blockIdx.x*blockDim.x + threadIdx.x; int row = blockIdx.y*blockDim.y + threadIdx.y; // if ((row < nbgridcells) && (col < nbgridcells)) { // int index = row*nbgridcells + col; // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // image_point.x = frame->xmin + col*dx; image_point.y = frame->ymin + row*dy; // for(int i = shalos; i < shalos + nhalos; i++) { struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // // positionning at the potential center // Change the origin of the coordinate system to the center of the clump // //@@if ((row == Ny) && (col == Nx)) printf("image_x = %f, %f image_y = %f, %f\n", image_point.x, frame->xmin, image_point.y,frame->ymin); type_t true_coord_x = image_point.x - __ldg(&lens->position_x[i]); type_t true_coord_y = image_point.y - __ldg(&lens->position_y[i]); //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n", true_coord_x, true_coord_y); // type_t cosi = __ldg(&lens->anglecos[i]); type_t sinu = __ldg(&lens->anglesin[i]); // type_t x = true_coord_x*cosi + true_coord_y*sinu; type_t y = true_coord_y*cosi - true_coord_x*sinu; // //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n", x, y); // type_t eps = __ldg(&lens->ellipticity_potential[i]); // type_t sqe = sqrt(eps); // type_t rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); // complex zci; complex znum, zden, zres; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zci.re, zci.im); // type_t rc = __ldg(&lens->rcore[i]); type_t cx1 = (1. - eps)/(1. + eps); znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden //@@if ((col == Nx) && (row == Ny)) printf("norm = %f\n", norm); // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im); // norm = zis.re; // zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im); // zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zres: %f %f\n", row, col, zres.re, zres.im); // type_t b0 = __ldg(&lens->b0[i]); grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); //@@if ((col == Nx) && (row == Ny)) printf("grad: %f %f\n", grad.x, grad.y); } //IACA_END; // grid_grad_x[index] = grad.x; grid_grad_y[index] = grad.y; //if ((row == 0) && (col == 9)) //printf("%f %f: %f %f\n", image_point.x, image_point.y, grid_grad_x[index], grid_grad_y[index]); } } __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_SM2(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { // //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // type_t grad_x, grad_y; type_t clumpgrad_x, clumpgrad_y; type_t image_point_x, image_point_y; // SHARED type_t cosi [200]; SHARED type_t sinu [200]; SHARED type_t rc [200]; SHARED type_t b0 [200]; SHARED type_t epsi [200]; SHARED type_t position_x[200]; SHARED type_t position_y[200]; SHARED type_t rsqe [200]; SHARED type_t sonepeps [200]; SHARED type_t sonemeps [200]; // grad_x = 0; grad_y = 0; // int col = blockIdx.x*blockDim.x + threadIdx.x; int row = blockIdx.y*blockDim.y + threadIdx.y; int ithread = threadIdx.y*blockDim.x + threadIdx.x; // int index = row*nbgridcells + col; // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // image_point_x = frame->xmin + col*dx; image_point_y = frame->ymin + row*dy; // int i = ithread; if (i < nhalos) { cosi[i] = __ldg(&lens->anglecos [shalos + i]); sinu[i] = __ldg(&lens->anglesin [shalos + i]); position_x[i] = __ldg(&lens->position_x [shalos + i]); position_y[i] = __ldg(&lens->position_y [shalos + i]); rc[i] = __ldg(&lens->rcore [shalos + i]); b0[i] = __ldg(&lens->b0 [shalos + i]); epsi[i] = __ldg(&lens->ellipticity_potential[shalos + i]); //sonemeps[i] = 1 - epsi[i]; //sonepeps[i] = 1 + epsi[i]; rsqe[i] = sqrt(epsi[i]); } __syncthreads(); // if ((row < nbgridcells) && (col < nbgridcells)) { for(int i = 0; i < nhalos; i++) { // type_t true_coord_x = image_point_x - position_x[i]; type_t true_coord_y = image_point_y - position_y[i]; // type_t x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; type_t y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; // type_t eps = epsi[i]; //type_t onemeps = 1 - eps; //type_t onepeps = 1 + eps; // //type_t eps = epsi[i]; type_t onemeps = sonemeps[i]; type_t onepeps = sonepeps[i]; // //type_t sqe = sqrt(eps); type_t sqe = rsqe[i]; type_t rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps); // complex zis; // type_t znum_re, znum_im; type_t zres_re, zres_im; type_t norm; type_t zden_re, zden_im; type_t zis_re, zis_im; // type_t zci_im = -0.5*(1. - eps*eps)/sqe; // type_t cx1 = onemeps/onepeps; // znum_re = cx1*x; znum_im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1; // zden_re = x; zden_im = 2.*rc[i]*sqe - y; // norm = (x*x + zden_im*zden_im); // zis = znum/zden zis.re = (znum_re*x + znum_im*zden_im)/norm; zis.im = (znum_im*x - znum_re*zden_im)/norm; // norm = zis.re; // zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // zres_re = - zci_im*zis.im; // Re( zci*ln(zis) ) zres_im = zci_im*zis.re; // Im( zci*ln(zis) ) // grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]); grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]); } // grid_grad_x[index] = grad_x; grid_grad_y[index] = grad_y; //__syncthreads(); } } // // // __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { // //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // type_t grad_x, grad_y; type_t clumpgrad_x, clumpgrad_y; type_t image_point_x, image_point_y; // SHARED type_t cosi [200]; SHARED type_t sinu [200]; SHARED type_t rci [200]; SHARED type_t b0 [200]; SHARED type_t epsi [200]; SHARED type_t position_x[200]; SHARED type_t position_y[200]; SHARED type_t rsqe [200]; //SHARED type_t sgrad_x [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; //SHARED type_t sgrad_y [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; //SHARED type_t sonepeps [200]; //SHARED type_t sonemeps [200]; // grad_x = 0; grad_y = 0; // int row = blockIdx.y*blockDim.y + threadIdx.y; int col = blockIdx.x*blockDim.x + threadIdx.x; // //int loc_row = threadIdx.x; //int loc_col = threadIdx.y*blockDim.x + threadIdx.x; // //int grid_size = nbgridcells/blockDim.y; // //if (threadIdx.x == 0) printf("%d %d %d: row = %d, col = %d, grid_size = %d\n", blockIdx.y, gridDim.y, threadIdx.y, row, col, grid_size); // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); //if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy); // image_point_x = frame->xmin + col*dx; image_point_y = frame->ymin + row*dy; // //int iloc = threadIdx.x*blockDim.y + threadIdx.y; int iglob = row*nbgridcells + col; int numThreads = blockDim.x*blockDim.y; // for (int i = 0; i < (nhalos + numThreads - 1)/numThreads; ++i) { int iloc = threadIdx.y*blockDim.x + threadIdx.x + i*numThreads; if (iloc < nhalos) { cosi[iloc] = __ldg(&lens->anglecos [shalos + iloc]); sinu[iloc] = __ldg(&lens->anglesin [shalos + iloc]); position_x[iloc] = __ldg(&lens->position_x [shalos + iloc]); position_y[iloc] = __ldg(&lens->position_y [shalos + iloc]); rci[iloc] = __ldg(&lens->rcore [shalos + iloc]); b0[iloc] = __ldg(&lens->b0 [shalos + iloc]); epsi[iloc] = __ldg(&lens->ellipticity_potential[shalos + iloc]); rsqe[iloc] = sqrt(epsi[iloc]); } } __syncthreads(); // if ((row < nbgridcells) && (col < nbgridcells)) { // for(int i = 0; i < nhalos; i++) { //int index = iloc; #if 1 type_t rc = rci[i]; type_t eps = epsi[i]; type_t onemeps = 1 - eps; type_t onepeps = 1 + eps; // type_t sqe = rsqe[i]; type_t cx1 = onemeps/onepeps; // // //type_t zci_im = 1; type_t zci_im = -0.5*(1. - eps*eps)/sqe; type_t inv_onepeps = 1./(onepeps*onepeps); type_t inv_onemeps = 1./(onemeps*onemeps); #endif // { //KERNEL_8; type_t grad_x = grad_y = 0.; type_t true_coord_y = image_point_y - position_y[i]; type_t true_coord_x = image_point_x - position_x[i]; KERNEL_8_reg(0); grid_grad_x[iglob + 0] += grad_x; grid_grad_y[iglob + 0] += grad_y; } /* { //KERNEL_8; type_t grad_x = grad_y = 0.; type_t true_coord_y = image_point_y - position_y[i]; type_t true_coord_x = image_point_x - position_x[i] + BLOCK_SIZE_X/2*dx; KERNEL_8_reg(0); grid_grad_x[iglob + BLOCK_SIZE_X/2] += grad_x; grid_grad_y[iglob + BLOCK_SIZE_X/2] += grad_y; } */ // } } } // // // __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_SM4(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos/*, type_t* dtimer*/) { // //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // type_t grad_x, grad_y; type_t clumpgrad_x, clumpgrad_y; type_t image_point_x, image_point_y; // // type_t* cosi = &shared[0*nhalos]; type_t* sinu = &shared[1*nhalos]; type_t* rc = &shared[2*nhalos]; type_t* b0 = &shared[3*nhalos]; type_t* epsi = &shared[4*nhalos]; type_t* position_x = &shared[5*nhalos]; type_t* position_y = &shared[6*nhalos]; type_t* rsqe = &shared[7*nhalos]; //SHARED type_t sonepeps [200]; //SHARED type_t sonemeps [200]; // grad_x = 0; grad_y = 0; // int grid_size = nbgridcells/gridDim.y; int row = blockIdx.x*blockDim.x + threadIdx.x; int col = (blockIdx.y*blockDim.y + threadIdx.y)/**grid_size*/; // // #if 1 //SHARED type_t sgrad_x [ if (/*(threadIdx.x == 0) &&*/ (blockIdx.x == 0)) { if (threadIdx.x == 0) printf("blockDim.x = %d, blockIdx.x = %d grimdDim.x = %d threadIdx.x = %d\n", blockDim.x, blockIdx.x, gridDim.x, threadIdx.x); if (threadIdx.x == 0) printf("blockDim.y = %d, blockIdx.y = %d grimdDim.y = %d threadIdx.y = %d\n", blockDim.y, blockIdx.y, gridDim.y, threadIdx.y); if (threadIdx.x == 0) printf("row = %d, col = %d, grid_size = %d\n", row, col, grid_size); } __syncthreads(); #endif type_t* sgrad_x = &shared[8*nhalos]; type_t* sgrad_y = &shared[8*nhalos + (grid_size + 1)*blockDim.x]; // // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // //if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy); // image_point_x = frame->xmin + col*dx; image_point_y = frame->ymin + row*dy; return; // int i = 0; #if 0 for (; i < nhalos; i = i + blockDim.x) { int pos = threadIdx.x + i; /*if ((threadIdx.x == 0) && (blockIdx.x == 0))*/ printf("pos = %d\n"); __syncthreads(); // cosi[pos] = __ldg(&lens->anglecos [shalos + pos]); sinu[pos] = __ldg(&lens->anglesin [shalos + pos]); position_x[pos] = __ldg(&lens->position_x [shalos + pos]); position_y[pos] = __ldg(&lens->position_y [shalos + pos]); rc[pos] = __ldg(&lens->rcore [shalos + pos]); b0[pos] = __ldg(&lens->b0 [shalos + pos]); epsi[pos] = __ldg(&lens->ellipticity_potential[shalos + pos]); rsqe[pos] = sqrt(epsi[i]); // } #endif #if 0 if (threadIdx.x == 0) for (; i < nhalos; i += 1) { cosi[i] = __ldg(&lens->anglecos [shalos + i]); sinu[i] = __ldg(&lens->anglesin [shalos + i]); position_x[i] = __ldg(&lens->position_x [shalos + i]); position_y[i] = __ldg(&lens->position_y [shalos + i]); rc[i] = __ldg(&lens->rcore [shalos + i]); b0[i] = __ldg(&lens->b0 [shalos + i]); epsi[i] = __ldg(&lens->ellipticity_potential[shalos + i]); rsqe[i] = sqrt(epsi[i]); } #endif __syncthreads(); //if ((row == col == 0)) printf("shared mem done...\n"); // if (row < nbgridcells) { //for(int icol = 0; icol < grid_size; ++icol){ // if (col + icol < nbgridcells){ //grad_x = grad_y = 0.; int index = row*nbgridcells + col; // for(int i = 0; i < nhalos; i++) { int sindex = threadIdx.x*grid_size; #if 0 type_t eps = epsi[i]; type_t onemeps = 1 - eps; type_t onepeps = 1 + eps; // type_t sqe = rsqe[i]; type_t cx1 = onemeps/onepeps; // // //type_t x = true_coord_y*sinu[i]; //type_t y = true_coord_y*cosi[i]; type_t true_coord_y = image_point_y - position_y[i]; type_t true_coord_x = image_point_x - position_x[i] /*+ icol*dx*/; // complex zci; zci.im = -0.5*(1. - eps*eps)/sqe; type_t inv_onepeps = 1./(onepeps*onepeps); type_t inv_onemeps = 1./(onemeps*onemeps); #endif // for(int icol = 0; icol < grid_size; ++icol) { if (col + icol < nbgridcells) { #if 0 if ((row == 1) && (col == 1)) printf("%d %d: %f %f\n", row, col, true_coord_x, true_coord_y); true_coord_x = image_point_x - position_x[i] + icol*dx; // //x += true_coord_x*cosi[i]; //y -= true_coord_x*sinu[i]; type_t x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; type_t y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; // //if ((row == 1) && (col == 0)) printf("i = %d, eps = %f\n", i, eps); // //double eps = epsi[i]; //double onemeps = sonemeps[i]; //double onepeps = sonepeps[i]; // //double sqe = sqrt(eps); //double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps); type_t rem2 = x*x*inv_onepeps + y*y*inv_onemeps; // // //double znum_re, znum_im; //double zres_re, zres_im; //double zden_re, zden_im; //double zis_re, zis_im; type_t norm; // complex zis; complex znum; complex zden; complex zres; // //double cx1 = onemeps/onepeps; // znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc[i]*sqe - y; // norm = (x*x + zden.im*zden.im); // zis = znum/zden zis.re = (znum.re*x + znum.im*zden.im)/norm; zis.im = (znum.im*x - znum.re*zden.im)/norm; // norm = zis.re; // zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // zres.re = - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re; // Im( zci*ln(zis) ) // grid_grad_x[index] += b0[i]*(zres.re*cosi[i] - zres.im*sinu[i]); grid_grad_y[index] += b0[i]*(zres.im*cosi[i] + zres.re*sinu[i]); #endif //sgrad_x[sindex] += (float) sindex; sgrad_y[sindex] += (float) -sindex; //sindex++; // //grid_grad_x[index] += grad_x; //grid_grad_y[index] += grad_y; } // } } __syncthreads(); return; // #if 0 int sindex = threadIdx.x*grid_size; for(int icol = 0; icol < grid_size; ++icol) { if (col + icol < nbgridcells) { grid_grad_x[index + col] = sgrad_x[sindex]; grid_grad_y[index + col] = sgrad_y[sindex]; sindex++; } } __syncthreads(); #endif } } __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_v2(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int i, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad, image_point; grad.x = 0; grad.y = 0; // int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; // if ((row < nbgridcells) && (col < nbgridcells)) { // int index = col*nbgridcells + row; // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // #if 0 /*SHARED*/ type_t img_pt[2]; if ((row == 0) && (col == 0)) { img_pt[0] = frame->xmin + col*dx; img_pt[1] = frame->ymin + row*dy; } __syncthreads(); #else image_point.x = frame->xmin + col*dx; image_point.y = frame->ymin + row*dy; #endif // // //for(int i = shalos; i < shalos + nhalos; i++) //{ //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // #if 0 true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]); true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]); #else true_coord.x = image_point.x - __ldg(&lens->position_x[i]); true_coord.y = image_point.y - __ldg(&lens->position_y[i]); #endif type_t cosi = __ldg(&lens->anglecos[i]); type_t sinu = __ldg(&lens->anglesin[i]); // positionning at the potential center // Change the origin of the coordinate system to the center of the clump type_t x = true_coord.x*cosi + true_coord.y*sinu; type_t y = true_coord.y*cosi - true_coord.x*sinu; // type_t eps = __ldg(&lens->ellipticity_potential[i]); // type_t sqe = sqrt(eps); // type_t rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); // complex zci; complex znum, zden, zres; type_t norm; // zci.im = -0.5*(1. - eps*eps)/sqe; // type_t rc = __ldg(&lens->rcore[i]); type_t cx1 = (1. - eps)/(1. + eps); znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; // type_t b0 = __ldg(&lens->b0[i]); grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); // grid_grad_x[index] += grad.x; grid_grad_y[index] += grad.y; } } #if 1 typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); __constant__ halo_func_GPU_t halo_func_GPU[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; #endif __device__ point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos){ //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point grad, result; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; // type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t ell_pot = lens->ellipticity_potential[i]; // type_t b0_inv_R = lens->b0[i]/sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot)); // result.x = (1 - ell_pot)*x*b0_inv_R; result.y = (1 + ell_pot)*y*b0_inv_R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; } return grad; } __device__ point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *image_point, const struct Potential_SOA *lens, int shalos, int nhalos){ struct point grad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { complex zis; // positioning at the potential center // Change the origin of the coordinate system to the center of the clump //@@if ((row == Ny) && (col == Nx)) printf("image_x = %f, %f image_y = %f, %f\n", image_point.x, frame->xmin, image_point.y,frame->ymin); + type_t true_coord_x = image_point->x - __ldg(&lens->position_x[i]); type_t true_coord_y = image_point->y - __ldg(&lens->position_y[i]); //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n", true_coord_x, true_coord_y); // type_t cosi = __ldg(&lens->anglecos[i]); type_t sinu = __ldg(&lens->anglesin[i]); // type_t x = true_coord_x*cosi + true_coord_y*sinu; type_t y = true_coord_y*cosi - true_coord_x*sinu; // //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n", x, y); // type_t eps = __ldg(&lens->ellipticity_potential[i]); // type_t sqe = sqrt(eps); // type_t rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); // complex zci; complex znum, zden, zres; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zci.re, zci.im); // type_t rc = __ldg(&lens->rcore[i]); type_t cx1 = (1. - eps)/(1. + eps); + znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden //@@if ((col == Nx) && (row == Ny)) printf("norm = %f\n", norm); // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im); // norm = zis.re; // zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im); // zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zres: %f %f\n", row, col, zres.re, zres.im); // type_t b0 = __ldg(&lens->b0[i]); + grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); //@@if ((col == Nx) && (row == Ny)) printf("grad: %f %f\n", grad.x, grad.y); } //IACA_END; // return(grad); } __device__ point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos){ //asm volatile("# module_potentialDerivatives_totalGradient_81_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_81_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //type_t R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump type_t cose = lens->anglecos[i]; type_t sine = lens->anglesin[i]; type_t x = true_coord.x*cose + true_coord.y*sine; type_t y = true_coord.y*cose - true_coord.x*sine; // type_t eps = lens->ellipticity_potential[i]; type_t rc = lens->rcore[i]; type_t rcut = lens->rcut[i]; type_t b0 = lens->b0[i]; type_t t05 = b0*rcut/(rcut - rc); // type_t sqe = sqrt(eps); // type_t cx1 = (1. - eps)/(1. + eps); type_t cxro = (1. + eps)*(1. + eps); type_t cyro = (1. - eps)*(1. - eps); // type_t rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; type_t norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // // step 1 { KERNEL(rc, zres_rc) } // step 2 { KERNEL(rcut, zres_rcut) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); // rotation grad.x += (zis.re*cose - zis.im*sine); grad.y += (zis.im*cose + zis.re*sine); // } // return(grad); } __global__ void module_potentialDerivatives_totalGradient_SOA_GPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int nhalos) { struct point grad, clumpgrad, image_point; // int col = blockIdx.x*blockDim.x + threadIdx.x; int row = blockIdx.y*blockDim.y + threadIdx.y; // if ((row < nbgridcells) && (col < nbgridcells)) { // type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // int index = row*nbgridcells + col; // Create temp grad variable to minimise writing to global memory grid_grad grad.x = 0; grad.y = 0; // image_point.x = frame->xmin + col*dx; image_point.y = frame->ymin + row*dy; // int shalos = 0; while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; // //if(row == 0 && col == 0) printf("type = %d, count %d , shalos %d \n", lens_type,count,shalos ); // clumpgrad = (*halo_func_GPU[lens_type])(&image_point, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } // Write to global memory grid_grad_x[index] = grad.x; grid_grad_y[index] = grad.y; //if ((row == 0) && (col == 9)) //printf("%f %f: %f %f\n", image_point.x, image_point.y, grid_grad_x[index], grid_grad_y[index]); } }