diff --git a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu index 92809bb..187e9c1 100644 --- a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu +++ b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu @@ -1,353 +1,418 @@ #include "gradient_GPU.cuh" #include <iostream> #include <string.h> #include <cuda_runtime.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <immintrin.h> #include <map> /* #ifdef __AVX__ #include "simd_math_avx.h" #endif #ifdef __AVX512F__ #include "simd_math_avx512f.h" #endif */ #include "structure_hpc.h" #include "utils.hpp" // // SOA versions, vectorizable // -__device__ struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) +__device__ +inline struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); double R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - lens->ellipticity_potential[i])+true_coord_rotation.y*true_coord_rotation.y*(1 + lens->ellipticity_potential[i])); // grad.x += (1 - lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.x/R; grad.y += (1 + lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.y/R; } return grad; } // // // -__device__ struct point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) + __device__ +inline struct point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { - //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); - // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 - // - struct point grad, clumpgrad; - grad.x = 0; - grad.y = 0; - // - for(int i = shalos; i < shalos + nhalos; i++) - { - //IACA_START; - // - struct point true_coord, true_coord_rot; //, result; - //double R, angular_deviation; - complex zis; - // - //result.x = result.y = 0.; - // - true_coord.x = pImage->x - lens->position_x[i]; - true_coord.y = pImage->y - lens->position_y[i]; - double cosi,sinu; - //cosi = cos(lens->ellipticity_angle[i]); - //sinu = sin(lens->ellipticity_angle[i]); - cosi = lens->anglecos[i]; - sinu = lens->anglesin[i]; - //positionning at the potential center - // Change the origin of the coordinate system to the center of the clump - //true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); - true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi,sinu); - // - double x = true_coord_rot.x; - double y = true_coord_rot.y; - double eps = lens->ellipticity_potential[i]; - double rc = lens->rcore[i]; - // - //std::cout << "piemd_lderivatives" << std::endl; - // - double sqe = sqrt(eps); - // - double cx1 = (1. - eps)/(1. + eps); - double cxro = (1. + eps)*(1. + eps); - double cyro = (1. - eps)*(1. - eps); - // - double rem2 = x*x/cxro + y*y/cyro; - // - complex zci, znum, zden, zres; - double norm; - // - zci.re = 0; - zci.im = -0.5*(1. - eps*eps)/sqe; - // - znum.re = cx1*x; - znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; - // - zden.re = x; - zden.im = 2.*rc*sqe - y; - norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden - // - zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; - zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; - norm = zis.re; - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // norm = zis.re; - zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) - zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) - // - zis.re = zres.re; - zis.im = zres.im; - // - //zres.re = zis.re*b0; - //zres.im = zis.im*b0; - // rotation - clumpgrad.x = zis.re; - clumpgrad.y = zis.im; - //clumpgrad = rotateCoordinateSystem_GPU(true_coord, -lens->ellipticity_angle[i]); - clumpgrad = rotateCoordinateSystem_GPU_2(clumpgrad, cosi,-sinu); - // - clumpgrad.x = lens->b0[i]*clumpgrad.x; - clumpgrad.y = lens->b0[i]*clumpgrad.y; - // - //clumpgrad.x = lens->b0[i]*zis.re; - //clumpgrad.y = lens->b0[i]*zis.im; - //nan check - //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) - //{ - // add the gradients - grad.x += clumpgrad.x; - grad.y += clumpgrad.y; - //} - } - //IACA_END; - // - return(grad); + //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); + // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 + // + struct point grad, clumpgrad; + grad.x = 0; + grad.y = 0; + // + for(int i = shalos; i < shalos + nhalos; i++) + { + //IACA_START; + // + struct point true_coord, true_coord_rot; //, result; + //double R, angular_deviation; + complex zis; + // + //result.x = result.y = 0.; + // + true_coord.x = pImage->x - lens->position_x[i]; + true_coord.y = pImage->y - lens->position_y[i]; + //double cosi,sinu; + //cosi = cos(lens->ellipticity_angle[i]); + //sinu = sin(lens->ellipticity_angle[i]); + double cosi = lens->anglecos[i]; + double sinu = lens->anglesin[i]; + //positionning at the potential center + // Change the origin of the coordinate system to the center of the clump + //true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); + //true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi, sinu); + true_coord_rot.x = true_coord.x*cosi + true_coord.y*sinu; + true_coord_rot.y = true_coord.y*cosi - true_coord.x*sinu; + // + double x = true_coord_rot.x; + double y = true_coord_rot.y; + double eps = lens->ellipticity_potential[i]; + double rc = lens->rcore[i]; + double b0 = lens->b0[i]; + // + //std::cout << "piemd_lderivatives" << std::endl; + // + double sqe = sqrt(eps); + // + double cx1 = (1. - eps)/(1. + eps); + double cxro = (1. + eps)*(1. + eps); + double cyro = (1. - eps)*(1. - eps); + // + double rem2 = x*x/cxro + y*y/cyro; + // + complex zci, znum, zden, zres; + double norm; + // + zci.re = 0; + zci.im = -0.5*(1. - eps*eps)/sqe; + // + znum.re = cx1*x; + znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; + // + zden.re = x; + zden.im = 2.*rc*sqe - y; + norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden + // + zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; + zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; + norm = zis.re; + zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) + zis.im = atan2(zis.im, norm); + // norm = zis.re; + zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) + zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) + // + zis.re = zres.re; + zis.im = zres.im; + // + grad.x += b0*(zres.re*cosi - zres.im*sinu); + grad.y += b0*(zres.im*cosi + zres.re*sinu); + // + //zres.re = zis.re*b0; + //zres.im = zis.im*b0; + // rotation + //clumpgrad.x = zis.re; + //clumpgrad.y = zis.im; + //clumpgrad = rotateCoordinateSystem_GPU(true_coord, -lens->ellipticity_angle[i]); + //clumpgrad = rotateCoordinateSystem_GPU_2(clumpgrad, cosi, -sinu); + //grad.x += b0*(zis.re*cosi - zis.im*sinu); + //grad.y += b0*(zis.im*cosi + zis.re*sinu); + //clumpgrad.x = true_coord.x*cosi + true_coord.y*sinu; + //clumpgrad.y = true_coord.y*cosi - true_coord.x*sinu; + // + //clumpgrad.x = lens->b0[i]*clumpgrad.x; + //clumpgrad.y = lens->b0[i]*clumpgrad.y; + // + //clumpgrad.x = lens->b0[i]*zis.re; + //clumpgrad.y = lens->b0[i]*zis.im; + //nan check + //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) + //{ + // add the gradients + //grad.x += clumpgrad.x; + //grad.y += clumpgrad.y; + //} + } + //IACA_END; + // + return(grad); } -__device__ struct point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) +__device__ inline struct point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { - //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); - //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl; - // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 - // - struct point grad, clumpgrad; - grad.x = 0; - grad.y = 0; - // - for(int i = shalos; i < shalos + nhalos; i++) - { - //IACA_START; - // - struct point true_coord, true_coord_rot; //, result; - //double R, angular_deviation; - complex zis; - // - //result.x = result.y = 0.; - // - true_coord.x = pImage->x - lens->position_x[i]; - true_coord.y = pImage->y - lens->position_y[i]; - //positionning at the potential center - // Change the origin of the coordinate system to the center of the clump - true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); - // - double x = true_coord_rot.x; - double y = true_coord_rot.y; - double eps = lens->ellipticity_potential[i]; - double rc = lens->rcore[i]; - double rcut = lens->rcut[i]; - double b0 = lens->b0[i]; - double t05 = b0*rcut/(rcut - rc); - //printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05); - // - //std::cout << "piemd_lderivatives" << std::endl; - // - double sqe = sqrt(eps); - // - double cx1 = (1. - eps)/(1. + eps); - double cxro = (1. + eps)*(1. + eps); - double cyro = (1. - eps)*(1. - eps); - // - double rem2 = x*x/cxro + y*y/cyro; - // - complex zci, znum, zden, zres_rc, zres_rcut; - double norm; - // - zci.re = 0; - zci.im = -0.5*(1. - eps*eps)/sqe; - // - // step 1 - // - { - znum.re = cx1*x; - znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; - // - zden.re = x; - zden.im = 2.*rc*sqe - y; - norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden - // - zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; - zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; - norm = zis.re; - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // norm = zis.re; - zres_rc.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) - zres_rc.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) - } - // - // step 2 - // - { - znum.re = cx1*x; - znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1; - // - zden.re = x; - zden.im = 2.*rcut*sqe - y; - norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden - // - zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; - zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; - norm = zis.re; - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // norm = zis.re; - zres_rcut.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) - zres_rcut.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) - } - zis.re = t05*(zres_rc.re - zres_rcut.re); - zis.im = t05*(zres_rc.im - zres_rcut.im); - //printf("%f %f\n", zis.re, zis.im); - // - //zres.re = zis.re*b0; - //zres.im = zis.im*b0; - // rotation - clumpgrad.x = zis.re; - clumpgrad.y = zis.im; - clumpgrad = rotateCoordinateSystem_GPU(clumpgrad, -lens->ellipticity_angle[i]); - // - //clumpgrad.x = lens->b0[i]*clumpgrad.x; - //clumpgrad.y = lens->b0[i]*clumpgrad.y; - // - //clumpgrad.x = lens->b0[i]*zis.re; - //clumpgrad.y = lens->b0[i]*zis.im; - //nan check - //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) - //{ - // add the gradients - grad.x += clumpgrad.x; - grad.y += clumpgrad.y; - //printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y); - //} - } - //IACA_END; - // - return(grad); + //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); + //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl; + // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 + // + struct point grad, clumpgrad; + grad.x = 0; + grad.y = 0; + // + for(int i = shalos; i < shalos + nhalos; i++) + { + //IACA_START; + // + struct point true_coord, true_coord_rot; //, result; + //double R, angular_deviation; + complex zis; + // + //result.x = result.y = 0.; + // + true_coord.x = pImage->x - lens->position_x[i]; + true_coord.y = pImage->y - lens->position_y[i]; + //positionning at the potential center + // Change the origin of the coordinate system to the center of the clump + true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); + // + double x = true_coord_rot.x; + double y = true_coord_rot.y; + double eps = lens->ellipticity_potential[i]; + double rc = lens->rcore[i]; + double rcut = lens->rcut[i]; + double b0 = lens->b0[i]; + double t05 = b0*rcut/(rcut - rc); + //printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05); + // + //std::cout << "piemd_lderivatives" << std::endl; + // + double sqe = sqrt(eps); + // + double cx1 = (1. - eps)/(1. + eps); + double cxro = (1. + eps)*(1. + eps); + double cyro = (1. - eps)*(1. - eps); + // + double rem2 = x*x/cxro + y*y/cyro; + // + complex zci, znum, zden, zres_rc, zres_rcut; + double norm; + // + zci.re = 0; + zci.im = -0.5*(1. - eps*eps)/sqe; + // + // step 1 + // + { + znum.re = cx1*x; + znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; + // + zden.re = x; + zden.im = 2.*rc*sqe - y; + norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden + // + zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; + zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; + norm = zis.re; + zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) + zis.im = atan2(zis.im, norm); + // norm = zis.re; + zres_rc.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) + zres_rc.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) + } + // + // step 2 + // + { + znum.re = cx1*x; + znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1; + // + zden.re = x; + zden.im = 2.*rcut*sqe - y; + norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden + // + zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; + zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; + norm = zis.re; + zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) + zis.im = atan2(zis.im, norm); + // norm = zis.re; + zres_rcut.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) + zres_rcut.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) + } + zis.re = t05*(zres_rc.re - zres_rcut.re); + zis.im = t05*(zres_rc.im - zres_rcut.im); + //printf("%f %f\n", zis.re, zis.im); + // + //zres.re = zis.re*b0; + //zres.im = zis.im*b0; + // rotation + clumpgrad.x = zis.re; + clumpgrad.y = zis.im; + clumpgrad = rotateCoordinateSystem_GPU(clumpgrad, -lens->ellipticity_angle[i]); + // + //clumpgrad.x = lens->b0[i]*clumpgrad.x; + //clumpgrad.y = lens->b0[i]*clumpgrad.y; + // + //clumpgrad.x = lens->b0[i]*zis.re; + //clumpgrad.y = lens->b0[i]*zis.im; + //nan check + //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) + //{ + // add the gradients + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + //printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y); + //} + } + //IACA_END; + // + return(grad); } // // // typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); __constant__ halo_func_GPU_t halo_func_GPU[100] = { -0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0, module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // __device__ struct point module_potentialDerivatives_totalGradient_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) +{ + struct point grad, clumpgrad; + // + grad.x = clumpgrad.x = 0; + grad.y = clumpgrad.y = 0; + // + int shalos = 0; + clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos); + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + return(grad); + // + //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); + //return; + /* + int* p_type = &(lens->type)[0]; + int* lens_type = (int*) malloc(nhalos*sizeof(int)); + memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); + + //quicksort(lens_type, nhalos); + //*/ + + //printf ("%f \n",lens->position_x[shalos]); + + + while (shalos < nhalos) + { + + int lens_type = lens->type[shalos]; + int count = 1; + while (lens->type[shalos + count] == lens_type) count++; + //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; + //printf ("%d %d %d \n",lens_type,count,shalos); + // + clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count); + // + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + shalos += count; + } + + return(grad); +} + + +#if 0 +__device__ struct point module_potentialDerivatives_totalGradient_SOA_CPU_GPU(const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, double xmin, double ymin, int nbgridcells, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; - // - int shalos = 0; - // - //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); - //return; - /* - int* p_type = &(lens->type)[0]; - int* lens_type = (int*) malloc(nhalos*sizeof(int)); - memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); + // + int shalos = 0; + //clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos); + //grad.x += clumpgrad.x; + //grad.y += clumpgrad.y; + //return(grad); + // + //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); + //return; + /* + int* p_type = &(lens->type)[0]; + int* lens_type = (int*) malloc(nhalos*sizeof(int)); + memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); - //quicksort(lens_type, nhalos); - //*/ + //quicksort(lens_type, nhalos); + //*/ - //printf ("%f \n",lens->position_x[shalos]); + //printf ("%f \n",lens->position_x[shalos]); - while (shalos < nhalos) - { + while (shalos < nhalos) + { - int lens_type = lens->type[shalos]; - int count = 1; - while (lens->type[shalos + count] == lens_type) count++; - //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; - //printf ("%d %d %d \n",lens_type,count,shalos); - // - clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count); - // - grad.x += clumpgrad.x; - grad.y += clumpgrad.y; - shalos += count; - } + int lens_type = lens->type[shalos]; + int count = 1; + while (lens->type[shalos + count] == lens_type) count++; + //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; + //printf ("%d %d %d \n",lens_type,count,shalos); + // + clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count); + // + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + shalos += count; + } return(grad); } +#endif + __device__ inline struct point rotateCoordinateSystem_GPU(struct point P, double theta) { - struct point Q; + struct point Q; - Q.x = P.x*cos(theta) + P.y*sin(theta); - Q.y = P.y*cos(theta) - P.x*sin(theta); + Q.x = P.x*cos(theta) + P.y*sin(theta); + Q.y = P.y*cos(theta) - P.x*sin(theta); - return(Q); + return(Q); } __device__ inline struct point rotateCoordinateSystem_GPU_2(struct point P, double cosi, double sinu) { - struct point Q; + struct point Q; - Q.x = P.x*cosi + P.y*sinu; - Q.y = P.y*cosi - P.x*sinu; + Q.x = P.x*cosi + P.y*sinu; + Q.y = P.y*cosi - P.x*sinu; - return(Q); + return(Q); } diff --git a/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh b/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh index 4cb3d20..65b9bdf 100644 --- a/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh +++ b/Benchmarks/GridGradientBenchmark/gradient_GPU.cuh @@ -1,20 +1,25 @@ /* * gradient_GPU.cuh * * Created on: Feb 1, 2017 * Author: cerschae */ #ifndef GRADIENT_GPU_CUH_ #define GRADIENT_GPU_CUH_ #include "cudafunctions.cuh" #include <fstream> #include <structure_hpc.h> __device__ struct point module_potentialDerivatives_totalGradient_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int nhalos); __device__ inline struct point rotateCoordinateSystem_GPU(struct point P, double theta); __device__ inline struct point rotateCoordinateSystem_GPU_2(struct point P, double cosi, double sinu); +// +//__device__ +//inline struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) + + #endif /* GRADIENT_GPU_CUH_ */ diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu index 4de3c88..6412cfd 100644 --- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu +++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu @@ -1,792 +1,841 @@ #include <fstream> #include "grid_gradient_GPU.cuh" +#define BLOCK_SIZE 16 + +void +module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos); + +void +module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos); + void calculate_cossin_values(double *theta_cos, double *theta_sin, double *angles, int nhalos ){ for(int i = 0 ; i < nhalos; i++){ theta_cos[i]=cos(angles[i]); theta_sin[i]=sin(angles[i]); } } void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos ,int nbgridcells){ int nBlocks_gpu = 0; // Define the number of threads per block the GPU will use cudaDeviceProp properties_gpu; cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use if (properties_gpu.maxThreadsDim[0]<threadsPerBlock) { fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock); exit(-1); } else { nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock; // Get the maximum number of blocks with the chosen number of threads // per Block that the GPU supports } grid_param *frame_gpu; Potential_SOA *lens_gpu,*lens_kernel ; int *type_gpu; double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu; double *grid_grad_x_gpu, *grid_grad_y_gpu ; lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); lens_gpu->type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); // Copy values to the GPU cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); //printf("%p \n", lens_gpu); //printf("%p \n", type_gpu); //printf("%p \n", lens_gpu->type); //fflush(stdout); lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){ gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); } else{ gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); } cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " ); cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " ); //printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]); // Free GPU memory cudaFree(lens_gpu); cudaFree(type_gpu); cudaFree(lens_x_gpu); cudaFree(lens_y_gpu); cudaFree(b0_gpu); cudaFree(angle_gpu); cudaFree(epot_gpu); cudaFree(rcore_gpu); cudaFree(rcut_gpu); cudaFree(anglecos_gpu); cudaFree(anglesin_gpu); cudaFree(grid_grad_x_gpu); cudaFree(grid_grad_y_gpu); /* for (int i = 0; i < nbgridcells; i++){ for(int j = 0; j < nbgridcells; j++){ printf(" %f",grid_grad_x[i*nbgridcells + j]); } printf("\n"); }*/ } void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells){ //get number of GPU devices int nDevices; cudaGetDeviceCount(&nDevices); // Initialise kernel variables, table for multiple devices grid_param *frame_gpu[nDevices]; Potential_SOA *lens_gpu[nDevices],*lens_kernel[nDevices] ; int *type_gpu[nDevices]; double *lens_x_gpu[nDevices], *lens_y_gpu[nDevices], *b0_gpu[nDevices], *angle_gpu[nDevices], *epot_gpu[nDevices], *rcore_gpu[nDevices], *rcut_gpu[nDevices],*anglecos_gpu[nDevices], *anglesin_gpu[nDevices]; double *grid_grad_x_gpu[nDevices], *grid_grad_y_gpu[nDevices] ; // Initialise multiple device variables int Ndevice[nDevices], indexactual[nDevices]; cudaStream_t stream[nDevices]; indexactual[0] = 0 ; Ndevice[0] = (nbgridcells) * (nbgridcells)/nDevices; printf("Using %d Gpu's \n",nDevices ); //printf("%d %d \n",indexactual[0], Ndevice[0]); for (int dev = 1; dev < nDevices; dev++) { Ndevice[dev] = (nbgridcells) * (nbgridcells)/nDevices; if(indexactual[dev]+Ndevice[dev] > (nbgridcells) * (nbgridcells)){ Ndevice[dev] = (nbgridcells) * (nbgridcells) - indexactual[dev-1]; } indexactual[dev] = indexactual[dev-1] + Ndevice[dev]; //printf("%d %d \n",indexactual[dev], Ndevice[dev]); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); lens_gpu[dev] = (Potential_SOA *) malloc(sizeof(Potential_SOA)); lens_gpu[dev]->type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafe(cudaMalloc( (void**)&(lens_kernel[dev]), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafe(cudaMalloc( (void**)&(type_gpu[dev]), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_x_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_y_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafe(cudaMalloc( (void**)&(b0_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafe(cudaMalloc( (void**)&(angle_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafe(cudaMalloc( (void**)&(epot_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu[dev]), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); cudaStreamCreate(&stream[dev]); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); // Copy values to the GPU cudasafe(cudaMemcpyAsync(type_gpu[dev],lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafe(cudaMemcpyAsync(lens_x_gpu[dev],lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafe(cudaMemcpyAsync(lens_y_gpu[dev],lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy y_gpu: " ); cudasafe(cudaMemcpyAsync(b0_gpu[dev],lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafe(cudaMemcpyAsync(angle_gpu[dev],lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafe(cudaMemcpyAsync(epot_gpu[dev], lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafe(cudaMemcpyAsync(rcore_gpu[dev], lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafe(cudaMemcpyAsync(rcut_gpu[dev], lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafe(cudaMemcpyAsync(anglecos_gpu[dev], lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy anglecos: " ); cudasafe(cudaMemcpyAsync(anglesin_gpu[dev], lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy anglesin: " ); cudasafe(cudaMemcpyAsync(frame_gpu[dev], frame, sizeof(grid_param), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy fame_gpu: " ); lens_gpu[dev]->type = type_gpu[dev]; lens_gpu[dev]->position_x = lens_x_gpu[dev]; lens_gpu[dev]->position_y = lens_y_gpu[dev]; lens_gpu[dev]->b0 = b0_gpu[dev]; lens_gpu[dev]->ellipticity_angle = angle_gpu[dev]; lens_gpu[dev]->ellipticity_potential = epot_gpu[dev]; lens_gpu[dev]->rcore = rcore_gpu[dev]; lens_gpu[dev]->rcut = rcut_gpu[dev]; lens_gpu[dev]->anglecos = anglecos_gpu[dev]; lens_gpu[dev]->anglesin = anglesin_gpu[dev]; cudaMemcpyAsync(lens_kernel[dev], lens_gpu[dev], sizeof(Potential_SOA), cudaMemcpyHostToDevice,stream[dev]); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); int nBlocks_gpu = 0; cudaDeviceProp properties_gpu; cudaGetDeviceProperties(&properties_gpu, dev); // Get properties of 0th GPU in use if (properties_gpu.maxThreadsDim[0]<threadsPerBlock) { fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock); exit(-1); } if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){ gradient_grid_kernel_multiple<<<1,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]); } else{ gradient_grid_kernel_multiple<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]); } } for (int dev = 0; dev < nDevices; dev++) { cudasafe(cudaMemcpyAsync( grid_grad_x + indexactual[dev], grid_grad_x_gpu[dev], Ndevice[dev] *sizeof(double),cudaMemcpyDeviceToHost ,stream[dev]),"Gradientgpu.cu : Copy source_x_gpu: " ); cudasafe(cudaMemcpyAsync( grid_grad_y + indexactual[dev], grid_grad_y_gpu[dev], Ndevice[dev] *sizeof(double), cudaMemcpyDeviceToHost,stream[dev]),"Gradientgpu.cu : Copy source_y_gpu: " ); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); // Free GPU memory cudaFree(type_gpu[dev]); cudaFree(lens_x_gpu[dev]); cudaFree(lens_y_gpu[dev]); cudaFree(b0_gpu[dev]); cudaFree(angle_gpu[dev]); cudaFree(epot_gpu[dev]); cudaFree(rcore_gpu[dev]); cudaFree(rcut_gpu[dev]); cudaFree(anglecos_gpu[dev]); cudaFree(anglesin_gpu[dev]); cudaFree(grid_grad_x_gpu[dev]); cudaFree(grid_grad_y_gpu[dev]); cudaStreamDestroy(stream[dev]); } } #if 1 void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells, int indexactual, int Ncells ){ // GPU Property query int nBlocks_gpu = 0; + // Define the number of threads per block the GPU will use cudaDeviceProp properties_gpu; + cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use + if (properties_gpu.maxThreadsDim[0]<threadsPerBlock) { - fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock); - exit(-1); + fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock); + exit(-1); } else { - nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock; // Get the maximum number of blocks with the chosen number of threads - // per Block that the GPU supports + nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock; // Get the maximum number of blocks with the chosen number of threads + // per Block that the GPU supports } - //Number of subdivision and Dimension variables for subdivision - //int nSubd = 2; - - //cudaStream_t stream[nDevices]; - - //Kernel Variables grid_param *frame_gpu; Potential_SOA *lens_gpu,*lens_kernel ; int *type_gpu; double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu; double *grid_grad_x_gpu, *grid_grad_y_gpu ; - lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); lens_gpu->type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); - cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), Ncells *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); - cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), Ncells *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); - + cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); + cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); // Copy values to the GPU - cudasafe(cudaMemcpyAsync(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); - cudasafe(cudaMemcpyAsync(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); - cudasafe(cudaMemcpyAsync(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); - cudasafe(cudaMemcpyAsync(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); - cudasafe(cudaMemcpyAsync(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); - cudasafe(cudaMemcpyAsync(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); - cudasafe(cudaMemcpyAsync(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); - cudasafe(cudaMemcpyAsync(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); - cudasafe(cudaMemcpyAsync(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); - cudasafe(cudaMemcpyAsync(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); - cudasafe(cudaMemcpyAsync(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); - - + cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); + cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); + cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); + cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); + cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); + cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); + cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); + cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); + cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); + cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); + cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); + + + //printf("%p \n", lens_gpu); + //printf("%p \n", type_gpu); + //printf("%p \n", lens_gpu->type); + //fflush(stdout); lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); +#if 0 + int BLOCK_SIZE = 16; // number of threads + int GRID_SIZE = (nbgridcells + BLOCK_SIZE - 1)/BLOCK_SIZE; // number of blocks + // + dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); + dim3 dimGrid(GRID_SIZE, GRID_SIZE); + // + if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0) + { + gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); + } + else + { + //gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); + //gradient_grid_kernel_v2<<<dimGrid, dimBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel); + //gradient_grid_kernel_v2<<<dimGrid, dimBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel); if (int((Ncells)/threadsPerBlock) == 0){ gradient_grid_kernel_multiple<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells); } else{ gradient_grid_kernel_multiple<<<(Ncells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells); } - cudasafe(cudaMemcpy( grid_grad_x + indexactual, grid_grad_x_gpu, Ncells *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " ); - cudasafe(cudaMemcpy( grid_grad_y + indexactual, grid_grad_y_gpu, Ncells *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_y_gpu: " ); +#endif + module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens, lens_kernel, nbgridcells, nhalos); + cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " ); + cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " ); //printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]); // Free GPU memory cudaFree(lens_gpu); cudaFree(type_gpu); cudaFree(lens_x_gpu); cudaFree(lens_y_gpu); cudaFree(b0_gpu); cudaFree(angle_gpu); cudaFree(epot_gpu); cudaFree(rcore_gpu); cudaFree(rcut_gpu); cudaFree(anglecos_gpu); cudaFree(anglesin_gpu); cudaFree(grid_grad_x_gpu); cudaFree(grid_grad_y_gpu); + /* + for (int i = 0; i < nbgridcells; i++){ + for(int j = 0; j < nbgridcells; j++){ + printf(" %f",grid_grad_x[i*nbgridcells + j]); + } + printf("\n"); + }*/ } -#endif __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) { - //*grad_x = *grad_y = 0.; + //*grad_x = *grad_y = 0.; - int bid=blockIdx.x; // index of the block (and of the set of images) - int tid=threadIdx.x; // index of the thread within the block + int bid=blockIdx.x; // index of the block (and of the set of images) + int tid=threadIdx.x; // index of the thread within the block - double dx,dy; //pixelsize - int grid_dim, index; - struct point image_point, Grad; - dx = (frame->xmax - frame->xmin)/(nbgridcells-1); - dy = (frame->ymax - frame->ymin)/(nbgridcells-1); - grid_dim = (nbgridcells); + double dx,dy; //pixelsize + int grid_dim, index; + struct point image_point, Grad; + dx = (frame->xmax - frame->xmin)/(nbgridcells-1); + dy = (frame->ymax - frame->ymin)/(nbgridcells-1); + grid_dim = (nbgridcells); - index = bid * threadsPerBlock + tid ; - - while(index < grid_dim*grid_dim){ - - grid_grad_x[index] = 0.; - grid_grad_y[index] = 0.; + index = bid * threadsPerBlock + tid ; - image_point.x = frame->xmin + (index/grid_dim)*dx; - image_point.y = frame->ymin + (index % grid_dim)*dy; + while(index < grid_dim*grid_dim){ - Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens); + grid_grad_x[index] = 0.; + grid_grad_y[index] = 0.; - grid_grad_x[index] = Grad.x; - grid_grad_y[index] = Grad.y; + image_point.x = frame->xmin + (index/grid_dim)*dx; + image_point.y = frame->ymin + (index % grid_dim)*dy; - bid += gridDim.x; - index = bid * threadsPerBlock + tid; - } + Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens); + grid_grad_x[index] = Grad.x; + grid_grad_y[index] = Grad.y; + bid += gridDim.x; + index = bid * threadsPerBlock + tid; + } } -__global__ void gradient_grid_kernel_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int indexactual, int ncells) { +__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) { - //*grad_x = *grad_y = 0.; + //*grad_x = *grad_y = 0.; - int bid=blockIdx.x; // index of the block (and of the set of images) - int tid=threadIdx.x; // index of the thread within the block + int bid = blockIdx.x; // index of the block (and of the set of images) + int tid = threadIdx.x; // index of the thread within the block - double dx,dy; //pixelsize - int grid_dim, index; - struct point image_point, Grad; - dx = (frame->xmax - frame->xmin)/(nbgridcells-1); - dy = (frame->ymax - frame->ymin)/(nbgridcells-1); - grid_dim = (nbgridcells); - - index = bid * threadsPerBlock + tid ; - int major_index = indexactual + index ; - - while(index < ncells){ - - grid_grad_x[index] = 0.; - grid_grad_y[index] = 0.; + double dx,dy; //pixelsize + int grid_dim, index; + struct point image_point, Grad; + // + dx = (frame->xmax - frame->xmin)/(nbgridcells-1); + dy = (frame->ymax - frame->ymin)/(nbgridcells-1); + // + grid_dim = (nbgridcells); + // + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + // + index = col*nbgridcells + row ; + // + //while(index < grid_dim*grid_dim){ - image_point.x = frame->xmin + (major_index/grid_dim)*dx; - image_point.y = frame->ymin + (major_index % grid_dim)*dy; + //grid_grad_x[index] = 0.; + //grid_grad_y[index] = 0.; - Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens); + image_point.x = frame->xmin + col*dx; + image_point.y = frame->ymin + row*dy; - grid_grad_x[index] = Grad.x; - grid_grad_y[index] = Grad.y; + Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens); - bid += gridDim.x; - index = bid * threadsPerBlock + tid; - major_index = indexactual + index ; - } + grid_grad_x[index] = Grad.x; + grid_grad_y[index] = Grad.y; + bid += gridDim.x; + index = bid * threadsPerBlock + tid; + //} } - -__global__ void gradient_grid_sis_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) { - - //*grad_x = *grad_y = 0.; - - int bid=blockIdx.x; // index of the block (and of the set of images) - int tid=threadIdx.x; // index of the thread within the block +/* +void gradient_grid_general_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens){ + int bid=0; // index of the block (and of the set of images) + int tid=0; // index of the thread within the block double dx,dy,x_pos,y_pos; //pixelsize int grid_dim, index; - struct point true_coord, true_coord_rotation; + point Grad, image_point, true_coord_rotation; double R; dx = (frame->xmax - frame->xmin)/(nbgridcells-1); dy = (frame->ymax - frame->ymin)/(nbgridcells-1); grid_dim = (nbgridcells); - index = bid * threadsPerBlock + tid ; - - + index = bid ; while(index < grid_dim*grid_dim){ grid_grad_x[index] = 0.; grid_grad_y[index] = 0.; - x_pos= frame->xmin + (index/grid_dim)*dx; - y_pos= frame->ymin + (index % grid_dim)*dy; - //printf("%f %f \n", x_pos, y_pos); - - for(int iterator=0; iterator < Nlens; iterator++){ - - true_coord.x = x_pos - lens_x[iterator]; - true_coord.y = y_pos - lens_y[iterator]; - - // Change the origin of the coordinate system to the center of the clump - true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]); - - R=sqrt(true_coord_rotation.x*true_coord_rotation.x*(1-lens_epot[iterator])+true_coord_rotation.y*true_coord_rotation.y*(1+lens_epot[iterator])); //ellippot = ellipmass/3 - grid_grad_x[index] += (1-lens_epot[iterator])*lens_b0[iterator]*true_coord_rotation.x/(R); - grid_grad_y[index] += (1+lens_epot[iterator])*lens_b0[iterator]*true_coord_rotation.y/(R); - - - - } - + image_point.x = frame->xmin + (index/grid_dim)*dx; + image_point.y = frame->ymin + (index % grid_dim)*dy; + Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens); + grid_grad_x[index] = Grad.x; + grid_grad_y[index] = Grad.y; - bid += gridDim.x; - index = bid * threadsPerBlock + tid; + bid += 1; + index = bid * 1 + tid; } - - } -__global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) { - - //*grad_x = *grad_y = 0.; - - int bid=blockIdx.x; // index of the block (and of the set of images) - int tid=threadIdx.x; // index of the thread within the block - - double dx,dy,x_pos,y_pos; //pixelsize - int grid_dim, index; - struct point true_coord, true_coord_rotation; - complex zis; - dx = (frame->xmax - frame->xmin)/(nbgridcells-1); - dy = (frame->ymax - frame->ymin)/(nbgridcells-1); - grid_dim = (nbgridcells); - - index = bid * threadsPerBlock + tid ; - - - - while(index < grid_dim*grid_dim){ - - //grid_grad_x[index] = 0.; - //grid_grad_y[index] = 0.; - - x_pos= frame->xmin + (index/grid_dim)*dx; - y_pos= frame->ymin + (index % grid_dim)*dy; - //printf("%f %f \n", x_pos, y_pos); - - for(int iterator=0; iterator < Nlens; iterator++){ - - true_coord.x = x_pos - lens_x[iterator]; - true_coord.y = y_pos - lens_y[iterator]; - - // Change the origin of the coordinate system to the center of the clump - true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]); - - double x = true_coord_rotation.x; - double y = true_coord_rotation.y; - double eps = lens_epot[iterator]; - double rc = lens_rcore[iterator]; - - double sqe = sqrt(eps); - // - double cx1 = (1. - eps)/(1. + eps); - double cxro = (1. + eps)*(1. + eps); - double cyro = (1. - eps)*(1. - eps); - // - double rem2 = x*x/cxro + y*y/cyro; - - complex zci, znum, zden, zres; - double norm; - // - zci.re = 0; - zci.im = -0.5*(1. - eps*eps)/sqe; - // - znum.re = cx1*x; - znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; - // - zden.re = x; - zden.im = 2.*rc*sqe - y; - norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden - // - zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; - zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; - norm = zis.re; - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // norm = zis.re; - zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) - zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) - // - zis.re = zres.re; - zis.im = zres.im; - - grid_grad_x[index] += lens_b0[iterator]*zis.re; - grid_grad_y[index] += lens_b0[iterator]*zis.im; - - - - } - - - - - bid += gridDim.x; - index = bid * threadsPerBlock + tid; - } - +*/ +__global__ +void +inline +module_potentialDerivatives_totalGradient_8_SOA_GPU(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) +{ + //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); + // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 + // + struct point grad, clumpgrad, image_point; + grad.x = 0; + grad.y = 0; + // + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + // + if ((row < nbgridcells) && (col < nbgridcells)) + { + // + int index = col*nbgridcells + row; + // + //grid_grad_x[index] = 0.; + //grid_grad_y[index] = 0.; + // + double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); + double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); + // +#if 0 + __shared__ double img_pt[2]; + if ((row == 0) && (col == 0)) + { + img_pt[0] = frame->xmin + col*dx; + img_pt[1] = frame->ymin + row*dy; + } + __syncthreads(); +#else + image_point.x = frame->xmin + col*dx; + image_point.y = frame->ymin + row*dy; +#endif + // + // + for(int i = shalos; i < shalos + nhalos; i++) + { + //IACA_START; + // + struct point true_coord, true_coord_rot; //, result; + //double R, angular_deviation; + complex zis; + // + //result.x = result.y = 0.; + // +#if 0 + true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]); + true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]); +#else + true_coord.x = image_point.x - __ldg(&lens->position_x[i]); + true_coord.y = image_point.y - __ldg(&lens->position_y[i]); +#endif + double cosi = __ldg(&lens->anglecos[i]); + double sinu = __ldg(&lens->anglesin[i]); + // positionning at the potential center + // Change the origin of the coordinate system to the center of the clump + double x = true_coord.x*cosi + true_coord.y*sinu; + double y = true_coord.y*cosi - true_coord.x*sinu; + // + double eps = __ldg(&lens->ellipticity_potential[i]); + // + double sqe = sqrt(eps); + // + double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); + // + complex zci; + complex znum, zden, zres; + double norm; + // + zci.im = -0.5*(1. - eps*eps)/sqe; + // + double rc = __ldg(&lens->rcore[i]); + double cx1 = (1. - eps)/(1. + eps); + znum.re = cx1*x; + znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; + // + zden.re = x; + zden.im = 2.*rc*sqe - y; + norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden + // + zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; + zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; + // + norm = zis.re; + // + zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) + zis.im = atan2(zis.im, norm); + // + zres.re = zci.im*zis.im; // Re( zci*ln(zis) ) + zres.im = zci.im*zis.re; // Im( zci*ln(zis) ) + // + double b0 = __ldg(&lens->b0[i]); + grad.x += b0*(zres.re*cosi - zres.im*sinu); + grad.y += b0*(zres.im*cosi + zres.re*sinu); + } + //IACA_END; + // + grid_grad_x[index] += grad.x; + grid_grad_y[index] += grad.y; + } } -__global__ void gradient_grid_piemd_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, int Ndevice, int indexactual, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) { - - //*grad_x = *grad_y = 0.; - - int bid=blockIdx.x; // index of the block (and of the set of images) - int tid=threadIdx.x; // index of the thread within the block - - double dx,dy,x_pos,y_pos; //pixelsize - int grid_dim, index; - struct point true_coord, true_coord_rotation; - complex zis; - dx = (frame->xmax - frame->xmin)/(nbgridcells-1); - dy = (frame->ymax - frame->ymin)/(nbgridcells-1); - grid_dim = (nbgridcells); - - index = bid * threadsPerBlock + tid ; - - - - while(index < Ndevice){ - //printf("%d \n", index); - - grid_grad_x[index] = 0.; - grid_grad_y[index] = 0.; - - x_pos= frame->xmin + ((indexactual +index)/grid_dim)*dx; - y_pos= frame->ymin + ((indexactual +index) % grid_dim)*dy; - //printf("%f %f \n", x_pos, y_pos); - - for(int iterator=0; iterator < Nlens; iterator++){ - - true_coord.x = x_pos - lens_x[iterator]; - true_coord.y = y_pos - lens_y[iterator]; - - // Change the origin of the coordinate system to the center of the clump - true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]); - - double x = true_coord_rotation.x; - double y = true_coord_rotation.y; - double eps = lens_epot[iterator]; - double rc = lens_rcore[iterator]; - - double sqe = sqrt(eps); - // - double cx1 = (1. - eps)/(1. + eps); - double cxro = (1. + eps)*(1. + eps); - double cyro = (1. - eps)*(1. - eps); - // - double rem2 = x*x/cxro + y*y/cyro; - - complex zci, znum, zden, zres; - double norm; - // - zci.re = 0; - zci.im = -0.5*(1. - eps*eps)/sqe; - // - znum.re = cx1*x; - znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; - // - zden.re = x; - zden.im = 2.*rc*sqe - y; - norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden - // - zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; - zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; - norm = zis.re; - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // norm = zis.re; - zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) - zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) - // - zis.re = zres.re; - zis.im = zres.im; - - grid_grad_x[index] += lens_b0[iterator]*zis.re; - grid_grad_y[index] += lens_b0[iterator]*zis.im; - - - - } - - - - - bid += gridDim.x; - - index = bid * threadsPerBlock + tid; - } - - +__global__ +void +module_potentialDerivatives_totalGradient_8_SOA_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, + int nbgridcells, int i, int nhalos) +{ + //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); + // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 + // + struct point grad, clumpgrad, image_point; + grad.x = 0; + grad.y = 0; + // + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + // + if ((row < nbgridcells) && (col < nbgridcells)) + { + // + int index = col*nbgridcells + row; + // + //grid_grad_x[index] = 0.; + //grid_grad_y[index] = 0.; + // + double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); + double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); + // +#if 0 + __shared__ double img_pt[2]; + if ((row == 0) && (col == 0)) + { + img_pt[0] = frame->xmin + col*dx; + img_pt[1] = frame->ymin + row*dy; + } + __syncthreads(); +#else + image_point.x = frame->xmin + col*dx; + image_point.y = frame->ymin + row*dy; +#endif + // + // + //for(int i = shalos; i < shalos + nhalos; i++) + //{ + //IACA_START; + // + struct point true_coord, true_coord_rot; //, result; + //double R, angular_deviation; + complex zis; + // + //result.x = result.y = 0.; + // +#if 0 + true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]); + true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]); +#else + true_coord.x = image_point.x - __ldg(&lens->position_x[i]); + true_coord.y = image_point.y - __ldg(&lens->position_y[i]); +#endif + double cosi = __ldg(&lens->anglecos[i]); + double sinu = __ldg(&lens->anglesin[i]); + // positionning at the potential center + // Change the origin of the coordinate system to the center of the clump + double x = true_coord.x*cosi + true_coord.y*sinu; + double y = true_coord.y*cosi - true_coord.x*sinu; + // + double eps = __ldg(&lens->ellipticity_potential[i]); + // + double sqe = sqrt(eps); + // + double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); + // + complex zci; + complex znum, zden, zres; + double norm; + // + zci.im = -0.5*(1. - eps*eps)/sqe; + // + double rc = __ldg(&lens->rcore[i]); + double cx1 = (1. - eps)/(1. + eps); + znum.re = cx1*x; + znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; + // + zden.re = x; + zden.im = 2.*rc*sqe - y; + norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden + // + zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; + zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; + // + // + double b0 = __ldg(&lens->b0[i]); + grad.x += b0*(zres.re*cosi - zres.im*sinu); + grad.y += b0*(zres.im*cosi + zres.re*sinu); + //} + //IACA_END; + // + grid_grad_x[index] += grad.x; + grid_grad_y[index] += grad.y; + } } -__global__ void piemd_GPU(double *grad_x, double *grad_y, point *pImage, int Nlens, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) { - - //*grad_x = *grad_y = 0.; - - int bid=blockIdx.x; // index of the block (and of the set of images) - int tid=threadIdx.x; // index of the thread within the block - - struct point true_coord, true_coord_rotation; - complex zis; - //gridDim.x , threadsPerBlock; - - int iterator = bid * threadsPerBlock + tid ; - //for(int iterator = 0; iterator < Nlens; ++iterator){ - while(iterator < Nlens){ - //printf(" %d %d %d %d \n",iterator , bid , threadsPerBlock , tid ); - true_coord.x = pImage->x - lens_x[iterator]; - true_coord.y = pImage->y - lens_y[iterator]; - - // Change the origin of the coordinate system to the center of the clump - true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]); - - double x = true_coord_rotation.x; - double y = true_coord_rotation.y; - double eps = lens_epot[iterator]; - double rc = lens_rcore[iterator]; - - double sqe = sqrt(eps); - // - double cx1 = (1. - eps)/(1. + eps); - double cxro = (1. + eps)*(1. + eps); - double cyro = (1. - eps)*(1. - eps); - // - double rem2 = x*x/cxro + y*y/cyro; - - complex zci, znum, zden, zres; - double norm; - // - zci.re = 0; - zci.im = -0.5*(1. - eps*eps)/sqe; - // - znum.re = cx1*x; - znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; - // - zden.re = x; - zden.im = 2.*rc*sqe - y; - norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden - // - zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; - zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; - norm = zis.re; - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // norm = zis.re; - zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) - zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) - // - zis.re = zres.re; - zis.im = zres.im; - // - //zres.re = zis.re*b0; - //zres.im = zis.im*b0; - // - // - - //grad->x += lens_b0[iterator]*zis.re; - //grad->y += lens_b0[iterator]*zis.im; - atomicAdd_double(grad_x,lens_b0[iterator]*zis.re); - atomicAdd_double(grad_y,lens_b0[iterator]*zis.im); - - //printf("%f %f \n ", lens_b0[iterator]*zis.re, lens_b0[iterator]*zis.im); - //printf("%f %f \n ", *grad_x, *grad_y); - - bid += gridDim.x; - iterator = bid * threadsPerBlock + tid ; - } +/* + typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); + + __constant__ halo_func_GPU_t halo_func_GPU[100] = + { + 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + */ + + void +module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos) +{ + struct point grad, clumpgrad; + // + grad.x = clumpgrad.x = 0; + grad.y = clumpgrad.y = 0; + int shalos = 0; + int GRID_SIZE = (nbgridcells + BLOCK_SIZE - 1)/BLOCK_SIZE; // number of blocks + // + dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); + dim3 dimGrid(GRID_SIZE, GRID_SIZE); + int count = nhalos; + cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double)); + cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double)); + module_potentialDerivatives_totalGradient_8_SOA_GPU<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); + //grid.x += clumpgrad.x; + //grad.y += clumpgrad.y; + + // + // + /* + while (shalos < nhalos) + { + + int lens_type = lens_cpu->type[shalos]; + int count = 1; + while (lens_cpu->type[shalos + count] == lens_type) count++; + //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; + //printf ("%d %d %d \n",lens_type,count,shalos); + // + clumpgrad = (*halo_func_GPU[lens_type]<<<dimGrid, dimBlock>>> )(lens_gpu, shalos, count); + // + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + shalos += count; + } + return(grad); + */ } -__device__ static double atomicAdd_double(double* address, double val) + + void +module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos) { - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); + struct point grad, clumpgrad; + // + grad.x = clumpgrad.x = 0; + grad.y = clumpgrad.y = 0; + int shalos = 0; + int GRID_SIZE = (nbgridcells + BLOCK_SIZE - 1)/BLOCK_SIZE; // number of blocks + // + dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); + dim3 dimGrid(GRID_SIZE, GRID_SIZE); + int count = nhalos; + // + cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double)); + cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double)); + // + for (int ii = 0; ii < nhalos; ++ii) + module_potentialDerivatives_totalGradient_8_SOA_GPU<<<dimGrid, dimBlock>>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, ii, 1); + + //grid.x += clumpgrad.x; + //grad.y += clumpgrad.y; + + // + // + /* + while (shalos < nhalos) + { + + int lens_type = lens_cpu->type[shalos]; + int count = 1; + while (lens_cpu->type[shalos + count] == lens_type) count++; + //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; + //printf ("%d %d %d \n",lens_type,count,shalos); + // + clumpgrad = (*halo_func_GPU[lens_type]<<<dimGrid, dimBlock>>> )(lens_gpu, shalos, count); + // + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + shalos += count; + } + + return(grad); + */ } diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh index 5153dff..575df16 100644 --- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh +++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh @@ -1,29 +1,30 @@ /* * gradientgpu.cuh * * Created on: Nov 29, 2016 * Author: cerschae */ #ifndef GRID_GRADIENT_GPU_CUH_ #define GRID_GRADIENT_GPU_CUH_ #include "cudafunctions.cuh" #include "gradient_GPU.cuh" #include <structure_hpc.h> void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens,int nbgridcells); void gradient_grid_pinned(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell); void gradient_grid_pinned_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell); void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos,int nbgridcells, int indexactual, int Ncells ); void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells); __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens); +__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens); __global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut); __global__ void gradient_grid_sis_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut); __global__ void gradient_grid_piemd_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, int Ndevice, int indexactual, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut); __global__ void gradient_grid_kernel_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int indexactual, int ncells); __device__ static double atomicAdd_double(double* address, double val); #endif /* GRADIENTGPU_CUH_ */ diff --git a/Benchmarks/GridGradientBenchmark/main.cpp b/Benchmarks/GridGradientBenchmark/main.cpp index 629299c..f09f058 100644 --- a/Benchmarks/GridGradientBenchmark/main.cpp +++ b/Benchmarks/GridGradientBenchmark/main.cpp @@ -1,314 +1,314 @@ /** * @file main.cpp * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch) * @date October 2016 * @brief Benchmark for gradhalo function */ #include <iostream> #include <iomanip> #include <string.h> #include <math.h> #include <sys/time.h> #include <fstream> #include <sys/stat.h> // #include <mm_malloc.h> // #include <cuda_runtime.h> #include <structure_hpc.h> #include "timer.h" #include "gradient.hpp" #include "chi_CPU.hpp" #include "module_cosmodistances.h" #include "module_readParameters.hpp" #include "grid_gradient_GPU.cuh" //#include<omp.h> int module_readCheckInput_readInput(int argc, char *argv[]) { /// check if there is a correct number of arguments, and store the name of the input file in infile char* infile; struct stat file_stat; // If we do not have 3 arguments, stop if ( argc != 3 ) { fprintf(stderr, "\nUnexpected number of arguments\n"); fprintf(stderr, "\nUSAGE:\n"); fprintf(stderr, "lenstool input_file output_directorypath [-n]\n\n"); exit(-1); } else if ( argc == 3 ) infile=argv[1]; std::ifstream ifile(infile,std::ifstream::in); // Open the file int ts = (int) time (NULL); char buffer[10]; std::stringstream ss; ss << ts; std::string trimstamp = ss.str(); // std::string outdir = argv[2]; outdir += "-"; outdir += trimstamp; std::cout << outdir << std::endl; // check whether the output directory already exists if (stat(outdir.c_str(), &file_stat) < 0){ mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH ); } else { printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]); exit(-1); } // check whether the input file exists. If it could not be opened (ifile = 0), it does not exist if(ifile){ ifile.close(); } else{ printf("The file %s does not exist, please specify a valid file name\n",infile); exit(-1); } return 0; } int main(int argc, char *argv[]) { // Setting Up the problem //=========================================================================================================== // This module function reads the terminal input when calling LENSTOOL and checks that it is correct // Otherwise it exits LENSTOOL module_readCheckInput_readInput(argc, argv); // This module function reads the cosmology parameters from the parameter file // Input: struct cosmologicalparameters cosmology, parameter file // Output: Initialized cosmology struct cosmo_param cosmology; // Cosmology struct to store the cosmology data from the file std::string inputFile = argv[1]; // Input file module_readParameters_readCosmology(inputFile, cosmology); // This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file. // The runmode_param stores the information of what exactly the user wants to do with lenstool. struct runmode_param runmode; module_readParameters_readRunmode(inputFile, &runmode); module_readParameters_debug_cosmology(runmode.debug, cosmology); module_readParameters_debug_runmode(runmode.debug, runmode); //=== Declaring variables struct grid_param frame; struct galaxy images[runmode.nimagestot]; struct galaxy sources[runmode.nsets]; struct Potential lenses[runmode.nhalos+runmode.npotfile-1]; struct Potential_SOA lenses_SOA_table[NTYPES]; struct Potential_SOA lenses_SOA; struct cline_param cline; struct potfile_param potfile; struct Potential potfilepotentials[runmode.npotfile]; struct potentialoptimization host_potentialoptimization[runmode.nhalos]; int nImagesSet[runmode.nsets]; // Contains the number of images in each set of images // This module function reads in the potential form and its parameters (e.g. NFW) // Input: input file // Output: Potentials and its parameters module_readParameters_Potential(inputFile, lenses, runmode.nhalos); //Converts to SOA //module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.Nlens); module_readParameters_PotentialSOA(inputFile, lenses, &lenses_SOA, runmode.nhalos); //module_readParameters_PotentialSOA_nonsorted(inputFile, lenses, &lenses_SOA_nonsorted, runmode.nhalos); module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos); //std::cerr << lenses_SOA[1].b0[0] << " " << lenses[0].b0 << std::endl; // This module function reads in the potfiles parameters // Input: input file // Output: Potentials from potfiles and its parameters if (runmode.potfile == 1 ){ module_readParameters_readpotfiles_param(inputFile, &potfile); module_readParameters_debug_potfileparam(runmode.debug, &potfile); module_readParameters_readpotfiles(&runmode,&potfile,lenses); module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos+runmode.npotfile); } // This module function reads in the grid form and its parameters // Input: input file // Output: grid and its parameters module_readParameters_Grid(inputFile, &frame); if (runmode.image == 1 or runmode.inverse == 1 or runmode.time > 0){ // This module function reads in the strong lensing images module_readParameters_readImages(&runmode, images, nImagesSet); //runmode.nsets = runmode.nimagestot; for(int i = 0; i < runmode.nimagestot; ++i){ images[i].dls = module_cosmodistances_objectObject(lenses[0].z, images[i].redshift, cosmology); images[i].dos = module_cosmodistances_observerObject(images[i].redshift, cosmology); images[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, images[i].redshift, cosmology); } module_readParameters_debug_image(runmode.debug, images, nImagesSet,runmode.nsets); } if (runmode.inverse == 1){ // This module function reads in the potential optimisation limits module_readParameters_limit(inputFile,host_potentialoptimization,runmode.nhalos); module_readParameters_debug_limit(runmode.debug, host_potentialoptimization[0]); } if (runmode.source == 1){ //Initialisation to default values.(Setting sources to z = 1.5 default value) for(int i = 0; i < runmode.nsets; ++i){ sources[i].redshift = 1.5; } // This module function reads in the strong lensing sources module_readParameters_readSources(&runmode, sources); //Calculating cosmoratios for(int i = 0; i < runmode.nsets; ++i){ sources[i].dls = module_cosmodistances_objectObject(lenses[0].z, sources[i].redshift, cosmology); sources[i].dos = module_cosmodistances_observerObject(sources[i].redshift, cosmology); sources[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, sources[i].redshift, cosmology); } module_readParameters_debug_source(runmode.debug, sources, runmode.nsets); } std::cout << "--------------------------" << std::endl << std::endl; double t_1(0),t_2(0),t_3(0),t_4(0); // Lenstool-CPU Grid-Gradient //=========================================================================================================== //Setting Test: double dx, dy; dx = (frame.xmax - frame.xmin)/(runmode.nbgridcells-1); dy = (frame.ymax - frame.ymin)/(runmode.nbgridcells-1); point test_point1_1,test_point2_2, test_result1_1,test_result2_2,test_pointN_N, test_resultN_N; double dlsds = images[0].dr; test_point1_1.x = frame.xmin; test_point1_1.y = frame.ymin; test_point2_2.x = frame.xmin+dx; test_point2_2.y = frame.ymin+dy; test_pointN_N.x = frame.xmin + ((runmode.nbgridcells*runmode.nbgridcells-1)/runmode.nbgridcells)*dx; test_pointN_N.y = frame.ymin + ((runmode.nbgridcells*runmode.nbgridcells-1) % runmode.nbgridcells)*dy; test_result1_1 = module_potentialDerivatives_totalGradient_SOA(&test_point1_1, &lenses_SOA, runmode.nhalos); test_result2_2 = module_potentialDerivatives_totalGradient_SOA(&test_point2_2, &lenses_SOA, runmode.nhalos); test_resultN_N = module_potentialDerivatives_totalGradient_SOA(&test_pointN_N, &lenses_SOA, runmode.nhalos); std::cout << " Initial Test " << std::endl; std::cout << " Test 1: " << std::endl; -std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; -std::cout << " Gradient " << std::setprecision(5) << test_result1_1.x << " "<< test_result1_1.y << std::endl; +std::cout << " Point 1 : " << std::setprecision(15) << test_point1_1.x << " "<< test_point1_1.y << std::endl; +std::cout << " Gradient " << std::setprecision(15) << test_result1_1.x << " "<< test_result1_1.y << std::endl; std::cout << " Test 2: " << std::endl; -std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; -std::cout << " Gradient " << std::setprecision(5) << test_result2_2.x << " "<< test_result2_2.y << std::endl; +std::cout << " Point 2 : " << std::setprecision(15) << test_point2_2.x << " "<< test_point2_2.y << std::endl; +std::cout << " Gradient " << std::setprecision(15) << test_result2_2.x << " "<< test_result2_2.y << std::endl; std::cout << " Test 3: " << std::endl; -std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y << std::endl; -std::cout << " Gradient " << std::setprecision(5) << test_resultN_N.x << " "<< test_resultN_N.y << std::endl; +std::cout << " Point 3 : " << std::setprecision(15) << test_pointN_N.x << " "<< test_pointN_N.y << std::endl; +std::cout << " Gradient " << std::setprecision(15) << test_resultN_N.x << " "<< test_resultN_N.y << std::endl; double *grid_gradient_x, *grid_gradient_y; grid_gradient_x = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); grid_gradient_y = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); int grid_dim = runmode.nbgridcells; #if 0 t_1 = -myseconds(); //Packaging the image to sourceplane conversion gradient_grid_CPU(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim); t_1 += myseconds(); std::cout << " gradient_grid_CPU Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_1 << std::endl; #endif #if 1 t_2 = -myseconds(); -gradient_grid_GPU_sorted(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim); +gradient_grid_GPU_sorted(grid_gradient_x,grid_gradient_y, &frame, &lenses_SOA, runmode.nhalos, grid_dim); t_2 += myseconds(); std::cout << " gradient_grid_GPU_sorted Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; -std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; -std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; +std::cout << " Point 1 : " << std::setprecision(15) << test_point1_1.x << " "<< test_point1_1.y << std::endl; +std::cout << " Gradient " << std::setprecision(15) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; -std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; -std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; +std::cout << " Point 2 : " << std::setprecision(15) << test_point2_2.x << " "<< test_point2_2.y << std::endl; +std::cout << " Gradient " << std::setprecision(15) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_2 << std::endl; #endif #if 1 t_3 = -myseconds(); gradient_grid_GPU_multiple(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim); t_3 += myseconds(); std::cout << " gradient_grid_GPU_multiple Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Test 3: " << std::endl; std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_3 << std::endl; #endif #if 1 t_4 = -myseconds(); gradient_grid_GPU_sub(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim,0,grid_dim*grid_dim); t_4 += myseconds(); std::cout << " gradient_grid_GPU_sub Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Test 3: " << std::endl; std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_4 << std::endl; #endif }