diff --git a/src/gradient.cpp b/src/gradient.cpp index 4847991..cbc899b 100644 --- a/src/gradient.cpp +++ b/src/gradient.cpp @@ -1,503 +1,503 @@ #include <iostream> #include <string.h> #include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <immintrin.h> #include <map> /* #ifdef __AVX__ #include "simd_math_avx.h" #endif #ifdef __AVX512F__ #include "simd_math_avx512f.h" #endif */ #include "structure_hpc.h" #include "gradient.hpp" #include "utils.hpp" //#include "iacaMarks.h" // // // /**@brief Return the gradient of the projected lens potential for one clump * !!! You have to multiply by dlsds to obtain the true gradient * for the expressions, see the papers : JP Kneib & P Natarajan, Cluster Lenses, The Astronomy and Astrophysics Review (2011) for 1 and 2 * and JP Kneib PhD (1993) for 3 * * @param pImage point where the result is computed in the lens plane * @param lens mass distribution */ // /// Useful functions // complex piemd_1derivatives_ci05(double x, double y, double eps, double rc) { double sqe, cx1, cxro, cyro, rem2; complex zci, znum, zden, zis, zres; double norm; // //std::cout << "piemd_lderivatives" << std::endl; sqe = sqrt(eps); cx1 = (1. - eps) / (1. + eps); cxro = (1. + eps) * (1. + eps); cyro = (1. - eps) * (1. - eps); // rem2 = x * x / cxro + y * y / cyro; /*zci=cpx(0.,-0.5*(1.-eps*eps)/sqe); znum=cpx(cx1*x,(2.*sqe*sqrt(rc*rc+rem2)-y/cx1)); zden=cpx(x,(2.*rc*sqe-y)); zis=pcpx(zci,lncpx(dcpx(znum,zden))); zres=pcpxflt(zis,b0);*/ // --> optimized code zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; znum.re = cx1 * x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; zden.re = x; zden.im = 2.*rc * sqe - y; norm = zden.re * zden.re + zden.im * zden.im; // zis = znum/zden zis.re = (znum.re * zden.re + znum.im * zden.im) / norm; zis.im = (znum.im * zden.re - znum.re * zden.im) / norm; norm = zis.re; zis.re = log(sqrt(norm * norm + zis.im * zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres.re = zci.re * zis.re - zci.im * zis.im; // Re( zci*ln(zis) ) zres.im = zci.im * zis.re + zis.im * zci.re; // Im( zci*ln(zis) ) // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // return(zres); } // //// changes the coordinates of point P into a new basis (rotation of angle theta) //// y' y x' //// * | / //// * | / theta //// * | / //// *|--------->x /* inline struct point rotateCoordinateSystem(struct point P, double theta) { struct point Q; Q.x = P.x*cos(theta) + P.y*sin(theta); Q.y = P.y*cos(theta) - P.x*sin(theta); return(Q); } */ // // struct point grad_halo(const struct point *pImage, const struct Potential *lens) { struct point true_coord, true_coord_rot, result; double X, Y, R, angular_deviation, u; complex zis; // result.x = result.y = 0.; // /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord.x = pImage->x - lens->position.x; true_coord.y = pImage->y - lens->position.y; //printf("x, y = %f, %f\n", lens->position.x, lens->position.y); // true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle); // switch (lens->type) { case(5): /*Elliptical Isothermal Sphere*/ /*rotation of the coordiante axes to match the potential axes*/ //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle); // R=sqrt(true_coord_rot.x*true_coord_rot.x*(1 - lens->ellipticity/3.) + true_coord_rot.y*true_coord_rot.y*(1 + lens->ellipticity/3.)); //ellippot = ellipmass/3 result.x = (1 - lens->ellipticity/3.)*lens->b0*true_coord_rot.x/(R); result.y = (1 + lens->ellipticity/3.)*lens->b0*true_coord_rot.y/(R); break; case(8): /* PIEMD */ /*rotation of the coordiante axes to match the potential axes*/ /*Doing something....*/ complex zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore); // result.x = lens->b0*zis.re; result.y = lens->b0*zis.im; break; case(81): //PIEMD Kassiola & Kovner,1993 I0.5c-I0.5cut double t05; if ( lens->ellipticity_potential > 2E-4 ) { //printf("1 "); t05 = lens->b0*lens->rcut/(lens->rcut - lens->rcore); complex zis = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcore); complex zis_cut = piemd_1derivatives_ci05(true_coord_rot.x, true_coord_rot.y, lens->ellipticity_potential, lens->rcut); result.x = t05 * (zis.re - zis_cut.re); result.y = t05 * (zis.im - zis_cut.im); //printf(" g = %f %f\n", result.x, result.y); } else if (( u = true_coord_rot.x*true_coord_rot.x + true_coord_rot.y*true_coord_rot.y) > 0. ) { //printf("2 "); // Circular dPIE Elliasdottir 2007 Eq A23 slighly modified for t05 X = lens->rcore; Y = lens->rcut; t05 = sqrt(u + X * X) - X - sqrt(u + Y * Y) + Y; // Faster and equiv to Elliasdottir (see Golse PhD) t05 *= lens->b0 * Y / (Y - X) / u; // 1/u because t05/sqrt(u) and normalised Q/sqrt(u) result.x = t05*true_coord_rot.x; result.y = t05*true_coord_rot.y; } else { //printf("3 "); result.x = 0.; result.y = 0.; } break; default: std::cout << "ERROR: Grad 1 profil type of clump "<< lens->name << " unknown : "<< lens->type << std::endl; break; }; result = rotateCoordinateSystem(result, -lens->ellipticity_angle); //printf(" rot grad = %f %f\n", result.x, result.y); return result; } // // // struct point module_potentialDerivatives_totalGradient(const int nhalos, const struct point* pImage, const struct Potential* lens) { struct point grad, clumpgrad; // grad.x = 0; grad.y = 0; // for(int i = 0; i < nhalos; i++) { clumpgrad = grad_halo(pImage, &lens[i]); //compute gradient for each clump separately //std::cout << clumpgrad.x << " " << clumpgrad.y << std::endl; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) { // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; } } // return(grad); } // // SOA versions, vectorizable // struct point module_potentialDerivatives_totalGradient_5_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - lens->ellipticity_potential[i])+true_coord_rotation.y*true_coord_rotation.y*(1 + lens->ellipticity_potential[i])); // grad.x += (1 - lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.x/R; grad.y += (1 + lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.y/R; } return grad; } // // // struct point module_potentialDerivatives_totalGradient_8_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // //result.x = result.y = 0.; // //@@printf("image_x = %f image_y = %f\n", pImage->x, pImage->y); true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //printf("x = %f y = %f\n", true_coord.x, true_coord.y); /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // double x = true_coord_rot.x; double y = true_coord_rot.y; //@@printf("x = %f y = %f\n", x, y); double eps = lens->ellipticity_potential[i]; double rc = lens->rcore[i]; // //std::cout << "piemd_lderivatives" << std::endl; // double sqe = sqrt(eps); // double cx1 = (1. - eps)/(1. + eps); double cxro = (1. + eps)*(1. + eps); double cyro = (1. - eps)*(1. - eps); // double rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; double norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; //@@printf("zci = %f %f\n", zci.re, zci.im); // znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden //@@printf("norm = %f\n", norm); // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; //@@printf("zis = %f %f\n", zis.re, zis.im); norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); - //@@printf("zis = %f %f\n", zis.re, zis.im); + //@@printf("y,x = %f %f\n", zis.im, norm); // norm = zis.re; zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // //@@printf("zres: %f %f\n", zres.re, zres.im); // zis.re = zres.re; zis.im = zres.im; // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // clumpgrad.x = lens->b0[i]*clumpgrad.x; clumpgrad.y = lens->b0[i]*clumpgrad.y; // //clumpgrad.x = lens->b0[i]*zis.re; //clumpgrad.y = lens->b0[i]*zis.im; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) //{ // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; //@@printf("grad: %f %f\n", grad.x, grad.y); //@@std::cout << "grad.x = " << grad.x << " grad.y = " << grad.y << std::endl; //} } //IACA_END; // return(grad); } struct point module_potentialDerivatives_totalGradient_81_SOA(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; /*positionning at the potential center*/ // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // double x = true_coord_rot.x; double y = true_coord_rot.y; double eps = lens->ellipticity_potential[i]; double rc = lens->rcore[i]; double rcut = lens->rcut[i]; double b0 = lens->b0[i]; double t05 = b0*rcut/(rcut - rc); //printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05); // //std::cout << "piemd_lderivatives" << std::endl; // double sqe = sqrt(eps); // double cx1 = (1. - eps)/(1. + eps); double cxro = (1. + eps)*(1. + eps); double cyro = (1. - eps)*(1. - eps); // double rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; double norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // // step 1 // { znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres_rc.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres_rc.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) } // // step 2 // { znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rcut*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres_rcut.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres_rcut.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); //printf("%f %f\n", zis.re, zis.im); // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem(clumpgrad, -lens->ellipticity_angle[i]); // //clumpgrad.x = lens->b0[i]*clumpgrad.x; //clumpgrad.y = lens->b0[i]*clumpgrad.y; // //clumpgrad.x = lens->b0[i]*zis.re; //clumpgrad.y = lens->b0[i]*zis.im; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) //{ // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; //printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y); //} } //IACA_END; // return(grad); } // // // typedef struct point (*halo_func_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); halo_func_t halo_func[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA, 0, 0, module_potentialDerivatives_totalGradient_8_SOA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // struct point module_potentialDerivatives_totalGradient_SOA(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; // //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); //return; /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); */ //quicksort(lens_type, nhalos); // while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; // clumpgrad = (*halo_func[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); }