diff --git a/Benchmarks/GridGradientBenchmark/Makefile b/Benchmarks/GridGradientBenchmark/Makefile index c21ed04..61051ca 100644 --- a/Benchmarks/GridGradientBenchmark/Makefile +++ b/Benchmarks/GridGradientBenchmark/Makefile @@ -1,81 +1,81 @@ PROGRAM_NAME := ChiBenchmark #CXX=g++ -lm -ffast-math -ftree-loop-vectorize CXX=icpc program_CXX_SRCS := $(wildcard *.cpp) program_CXX_OBJS := ${program_CXX_SRCS:.cpp=.o} #program_CXX_SRCS += $(wildcard ../../*.c) #Find C source files from additonal directories program_C_OBJS := ${program_C_SRCS:.c=.o} # program_CU_SRCS := $(wildcard *.cu) #program_CU_SRCS += $(wildcard ../../*.cu) #Find CUDA source files from additional directories #program_CU_HEADERS := $(wildcard *.cuh) #Optional: Include .cuh files as dependencies #program_CU_HEADERS += $(wildcard ../../*.cuh) #Find .cuh files from additional directories program_CU_OBJS := ${program_CU_SRCS:.cu=.cuo} # program_INCLUDE_DIRS := . /usr/local/cuda/include/ #C++ Include directories program_INCLUDE_DIRS += $(CFITSIO_ROOT)/include #program_INCLUDE_DIRS += /users/fgilles/bin/iaca-lin32/include program_INCLUDE_DIRS += $(LENSTOOL_ROOT)/include program_INCLUDE_DIRS += $(GSL_ROOT) program_INCLUDE_DIRS += $(LENSTOOLHPC_ROOT)/src # #program_CU_INCLUDE_DIRS := /home/users/amclaugh/CUB/cub-1.3.2/ #CUDA Include directories # #program_INCLUDE_LIBS := ./ #Include libraries program_INCLUDE_LIBS += $(CFITSIO_ROOT)/lib #Include libraries program_INCLUDE_LIBS += $(LENSTOOL_ROOT)/src program_INCLUDE_LIBS += $(LENSTOOL_ROOT)/liblt program_INCLUDE_LIBS += $(LENSTOOLHPC_ROOT)/src/ program_INCLUDE_LIBS += $(GSL_ROOT)/lib program_INCLUDE_LIBS += $(WCSTOOL_ROOT) # # # Compiler flags CPPFLAGS += $(foreach includedir,$(program_INCLUDE_DIRS),-I$(includedir)) CPPFLAGS += $(foreach includelib,$(program_INCLUDE_LIBS),-L$(includelib)) CPPFLAGS += -D__WITH_LENSTOOL CPPFLAGS += -qopenmp -xHost -g -O3 -std=c++0x -Wall -pedantic #CPPFLAGS += -qopenmp -march=core-avx2 -g -O3 -std=c++0x -Wall -pedantic #CPPFLAGS += -llenstoolhpc -qopenmp -xHost -g -O3 -std=c++0x -Wall -pedantic #CPPFLAGS += -llenstoolhpc -qopenmp -axMIC-AVX512,CORE-AVX2 -g -O3 -std=c++0x -Wall -pedantic #CPPFLAGS += -qopt-prefetch-distance=64,8 -qopt-streaming-cache-evict=0 -llenstoolhpc -qopenmp -xMIC-AVX512 -g -O3 -std=c++0x -Wall -pedantic LDFLAGS := -llenstoolhpc -lcfitsio -llenstool -llt -lgsl -lgslcblas -lm -lwcs NVFLAGS := -O3 -g -G -arch=sm_35 -rdc=true -ccbin icpc -Xcompiler -qopenmp -D__WITH_LENSTOOL #rdc=true needed for separable compilation #NVFLAGS += --maxrregcount=40 NVFLAGS += $(foreach includedir,$(program_INCLUDE_DIRS),-I$(includedir)) NVFLAGS += $(foreach includelib,$(program_INCLUDE_LIBS),-L$(includelib)) CUO_O_OBJECTS := ${program_CU_OBJS:.cuo=.cuo.o} OBJECTS = $(program_CXX_OBJS) $(program_C_OBJS) $(program_CU_OBJS) .PHONY: all clean distclean all: $(PROGRAM_NAME) # debug: CXXFLAGS = -g -O0 -std=c++0x -Wall -pedantic -DDEBUG $(EXTRA_FLAGS) debug: $(PROGRAM_NAME) %.cuo: %.cu %.cuh - nvcc $(NVFLAGS) --ptxas-options=-v -o $@ -dc $< -#nvcc $(NVFLAGS) -Xptxas -v,-abi=no -ptx -src-in-ptx -dc $< + nvcc $(NVFLAGS) -Xptxas -dlcm=cg --ptxas-options=-v -o $@ -dc $< + #nvcc $(NVFLAGS) -maxrregcount=100 -Xptxas -dlcm=cg --ptxas-options=-v -o $@ -dc $< $(PROGRAM_NAME): $(OBJECTS) @ for cu_obj in $(program_CU_OBJS); \ do \ mv $$cu_obj $$cu_obj.o; \ done #append a .o suffix for nvcc nvcc $(NVFLAGS) -o $@ $(program_CXX_OBJS) $(program_C_OBJS) $(CUO_O_OBJECTS) $(LDFLAGS) @ for cu_obj in $(CUO_O_OBJECTS); \ do \ mv $$cu_obj $${cu_obj%.*} ; \ done #remove the .o for make clean: @- $(RM) $(PROGRAM_NAME) $(OBJECTS) *~ *.o *.optrpt distclean: clean diff --git a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu index b115234..52ce9c6 100644 --- a/Benchmarks/GridGradientBenchmark/gradient_GPU.cu +++ b/Benchmarks/GridGradientBenchmark/gradient_GPU.cu @@ -1,516 +1,515 @@ #include "gradient_GPU.cuh" #include #include #include #include #include #include #include #include /* #ifdef __AVX__ #include "simd_math_avx.h" #endif #ifdef __AVX512F__ #include "simd_math_avx512f.h" #endif */ #include "structure_hpc.h" #include "utils.hpp" // // SOA versions, vectorizable // __device__ inline struct point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA begins"); // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); double R = sqrt(true_coord_rotation.x*true_coord_rotation.x*(1 - lens->ellipticity_potential[i])+true_coord_rotation.y*true_coord_rotation.y*(1 + lens->ellipticity_potential[i])); // grad.x += (1 - lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.x/R; grad.y += (1 + lens->ellipticity[i]/3.)*lens->b0[i]*true_coord_rotation.y/R; } return grad; } - // // // __device__ inline struct point module_potentialDerivatives_totalGradient_8_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //double cosi,sinu; //cosi = cos(lens->ellipticity_angle[i]); //sinu = sin(lens->ellipticity_angle[i]); double cosi = lens->anglecos[i]; double sinu = lens->anglesin[i]; //positionning at the potential center // Change the origin of the coordinate system to the center of the clump //true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); //true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi, sinu); true_coord_rot.x = true_coord.x*cosi + true_coord.y*sinu; true_coord_rot.y = true_coord.y*cosi - true_coord.x*sinu; // double x = true_coord_rot.x; double y = true_coord_rot.y; double eps = lens->ellipticity_potential[i]; double rc = lens->rcore[i]; double b0 = lens->b0[i]; // //std::cout << "piemd_lderivatives" << std::endl; // double sqe = sqrt(eps); // double cx1 = (1. - eps)/(1. + eps); double cxro = (1. + eps)*(1. + eps); double cyro = (1. - eps)*(1. - eps); // double rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; double norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // zis.re = zres.re; zis.im = zres.im; // grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation //clumpgrad.x = zis.re; //clumpgrad.y = zis.im; //clumpgrad = rotateCoordinateSystem_GPU(true_coord, -lens->ellipticity_angle[i]); //clumpgrad = rotateCoordinateSystem_GPU_2(clumpgrad, cosi, -sinu); //grad.x += b0*(zis.re*cosi - zis.im*sinu); //grad.y += b0*(zis.im*cosi + zis.re*sinu); //clumpgrad.x = true_coord.x*cosi + true_coord.y*sinu; //clumpgrad.y = true_coord.y*cosi - true_coord.x*sinu; // //clumpgrad.x = lens->b0[i]*clumpgrad.x; //clumpgrad.y = lens->b0[i]*clumpgrad.y; // //clumpgrad.x = lens->b0[i]*zis.re; //clumpgrad.y = lens->b0[i]*zis.im; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) //{ // add the gradients //grad.x += clumpgrad.x; //grad.y += clumpgrad.y; //} } //IACA_END; // return(grad); } // // // __device__ inline struct point module_potentialDerivatives_totalGradient_8_SOA_GPU_SM(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // // struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //double cosi,sinu; //cosi = cos(lens->ellipticity_angle[i]); //sinu = sin(lens->ellipticity_angle[i]); double cosi = lens->anglecos[i]; double sinu = lens->anglesin[i]; //positionning at the potential center // Change the origin of the coordinate system to the center of the clump //true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); //true_coord_rot = rotateCoordinateSystem_GPU_2(true_coord, cosi, sinu); true_coord_rot.x = true_coord.x*cosi + true_coord.y*sinu; true_coord_rot.y = true_coord.y*cosi - true_coord.x*sinu; // double x = true_coord_rot.x; double y = true_coord_rot.y; double eps = lens->ellipticity_potential[i]; double rc = lens->rcore[i]; double b0 = lens->b0[i]; // //std::cout << "piemd_lderivatives" << std::endl; // double sqe = sqrt(eps); // double cx1 = (1. - eps)/(1. + eps); double cxro = (1. + eps)*(1. + eps); double cyro = (1. - eps)*(1. - eps); // double rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres; double norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // zis.re = zres.re; zis.im = zres.im; // grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation //clumpgrad.x = zis.re; //clumpgrad.y = zis.im; } //IACA_END; // return(grad); } __device__ inline struct point module_potentialDerivatives_totalGradient_81_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); //std::cout << "# module_potentialDerivatives_totalGradient_SOA begins" << std::endl; // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { //IACA_START; // struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // //result.x = result.y = 0.; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; //positionning at the potential center // Change the origin of the coordinate system to the center of the clump true_coord_rot = rotateCoordinateSystem_GPU(true_coord, lens->ellipticity_angle[i]); // double x = true_coord_rot.x; double y = true_coord_rot.y; double eps = lens->ellipticity_potential[i]; double rc = lens->rcore[i]; double rcut = lens->rcut[i]; double b0 = lens->b0[i]; double t05 = b0*rcut/(rcut - rc); //printf("b0 = %f, rcut = %f, rc = %f, t05 = %f\n", b0, rcut, rc, t05); // //std::cout << "piemd_lderivatives" << std::endl; // double sqe = sqrt(eps); // double cx1 = (1. - eps)/(1. + eps); double cxro = (1. + eps)*(1. + eps); double cyro = (1. - eps)*(1. - eps); // double rem2 = x*x/cxro + y*y/cyro; // complex zci, znum, zden, zres_rc, zres_rcut; double norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; // // step 1 // { znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres_rc.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres_rc.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) } // // step 2 // { znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rcut*rcut + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rcut*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; norm = zis.re; zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // norm = zis.re; zres_rcut.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres_rcut.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) } zis.re = t05*(zres_rc.re - zres_rcut.re); zis.im = t05*(zres_rc.im - zres_rcut.im); //printf("%f %f\n", zis.re, zis.im); // //zres.re = zis.re*b0; //zres.im = zis.im*b0; // rotation clumpgrad.x = zis.re; clumpgrad.y = zis.im; clumpgrad = rotateCoordinateSystem_GPU(clumpgrad, -lens->ellipticity_angle[i]); // //clumpgrad.x = lens->b0[i]*clumpgrad.x; //clumpgrad.y = lens->b0[i]*clumpgrad.y; // //clumpgrad.x = lens->b0[i]*zis.re; //clumpgrad.y = lens->b0[i]*zis.im; //nan check //if(clumpgrad.x == clumpgrad.x or clumpgrad.y == clumpgrad.y) //{ // add the gradients grad.x += clumpgrad.x; grad.y += clumpgrad.y; //printf("grad = %f %f\n", clumpgrad.x, clumpgrad.y); //} } //IACA_END; // return(grad); } // // // typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); __constant__ halo_func_GPU_t halo_func_GPU[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // // // __device__ struct point module_potentialDerivatives_totalGradient_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos); grad.x += clumpgrad.x; grad.y += clumpgrad.y; return(grad); // //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); //return; /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); //quicksort(lens_type, nhalos); //*/ //printf ("%f \n",lens->position_x[shalos]); while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; //printf ("%d %d %d \n",lens_type,count,shalos); // clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } #if 0 __device__ struct point module_potentialDerivatives_totalGradient_SOA_CPU_GPU(const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, double xmin, double ymin, int nbgridcells, int nhalos) { struct point grad, clumpgrad; // grad.x = clumpgrad.x = 0; grad.y = clumpgrad.y = 0; // int shalos = 0; //clumpgrad = module_potentialDerivatives_totalGradient_5_SOA_GPU(pImage, lens, 0, nhalos); //grad.x += clumpgrad.x; //grad.y += clumpgrad.y; //return(grad); // //module_potentialDerivatives_totalGradient_81_SOA(pImage, lens, 0, nhalos); //return; /* int* p_type = &(lens->type)[0]; int* lens_type = (int*) malloc(nhalos*sizeof(int)); memcpy(lens_type, &(lens->type)[0], nhalos*sizeof(int)); //quicksort(lens_type, nhalos); //*/ //printf ("%f \n",lens->position_x[shalos]); while (shalos < nhalos) { int lens_type = lens->type[shalos]; int count = 1; while (lens->type[shalos + count] == lens_type) count++; //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; //printf ("%d %d %d \n",lens_type,count,shalos); // clumpgrad = (*halo_func_GPU[lens_type])(pImage, lens, shalos, count); // grad.x += clumpgrad.x; grad.y += clumpgrad.y; shalos += count; } return(grad); } #endif __device__ inline struct point rotateCoordinateSystem_GPU(struct point P, double theta) { struct point Q; Q.x = P.x*cos(theta) + P.y*sin(theta); Q.y = P.y*cos(theta) - P.x*sin(theta); return(Q); } __device__ inline struct point rotateCoordinateSystem_GPU_2(struct point P, double cosi, double sinu) { struct point Q; Q.x = P.x*cosi + P.y*sinu; Q.y = P.y*cosi - P.x*sinu; return(Q); } diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu index e8b1d16..5872098 100644 --- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu +++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cu @@ -1,1052 +1,1260 @@ #include #include "grid_gradient_GPU.cuh" #define BLOCK_SIZE_X 16 #define BLOCK_SIZE_Y 8 //#define ROT #define _SHARED_MEM #ifdef _SHARED_MEM #define SHARED __shared__ #warning "shared memory" +extern __shared__ double shared[]; #else #define SHARED #endif #define Nx 1 #define Ny 0 extern "C" { double myseconds(); } void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos); void module_potentialDerivatives_totalGradient_SOA_CPU_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos); void calculate_cossin_values(double *theta_cos, double *theta_sin, double *angles, int nhalos ){ - for(int i = 0 ; i < nhalos; i++){ - theta_cos[i]=cos(angles[i]); - theta_sin[i]=sin(angles[i]); + for(int i = 0 ; i < nhalos; i++) + { + theta_cos[i] = cos(angles[i]); + theta_sin[i] = sin(angles[i]); } } void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos ,int nbgridcells){ int nBlocks_gpu = 0; // Define the number of threads per block the GPU will use cudaDeviceProp properties_gpu; cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use if (properties_gpu.maxThreadsDim[0]type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); // Copy values to the GPU cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); //printf("%p \n", lens_gpu); //printf("%p \n", type_gpu); //printf("%p \n", lens_gpu->type); //fflush(stdout); lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0){ gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); } else{ gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); } cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " ); cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " ); //printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]); // Free GPU memory cudaFree(lens_gpu); cudaFree(type_gpu); cudaFree(lens_x_gpu); cudaFree(lens_y_gpu); cudaFree(b0_gpu); cudaFree(angle_gpu); cudaFree(epot_gpu); cudaFree(rcore_gpu); cudaFree(rcut_gpu); cudaFree(anglecos_gpu); cudaFree(anglesin_gpu); cudaFree(grid_grad_x_gpu); cudaFree(grid_grad_y_gpu); /* for (int i = 0; i < nbgridcells; i++){ for(int j = 0; j < nbgridcells; j++){ printf(" %f",grid_grad_x[i*nbgridcells + j]); } printf("\n"); }*/ } void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells){ //get number of GPU devices int nDevices; cudaGetDeviceCount(&nDevices); // Initialise kernel variables, table for multiple devices grid_param *frame_gpu[nDevices]; Potential_SOA *lens_gpu[nDevices],*lens_kernel[nDevices] ; int *type_gpu[nDevices]; double *lens_x_gpu[nDevices], *lens_y_gpu[nDevices], *b0_gpu[nDevices], *angle_gpu[nDevices], *epot_gpu[nDevices], *rcore_gpu[nDevices], *rcut_gpu[nDevices],*anglecos_gpu[nDevices], *anglesin_gpu[nDevices]; double *grid_grad_x_gpu[nDevices], *grid_grad_y_gpu[nDevices] ; // Initialise multiple device variables int Ndevice[nDevices], indexactual[nDevices]; cudaStream_t stream[nDevices]; indexactual[0] = 0 ; Ndevice[0] = (nbgridcells) * (nbgridcells)/nDevices; printf("Using %d Gpu's \n",nDevices ); //printf("%d %d \n",indexactual[0], Ndevice[0]); for (int dev = 1; dev < nDevices; dev++) { Ndevice[dev] = (nbgridcells) * (nbgridcells)/nDevices; if(indexactual[dev]+Ndevice[dev] > (nbgridcells) * (nbgridcells)){ Ndevice[dev] = (nbgridcells) * (nbgridcells) - indexactual[dev-1]; } indexactual[dev] = indexactual[dev-1] + Ndevice[dev]; //printf("%d %d \n",indexactual[dev], Ndevice[dev]); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); lens_gpu[dev] = (Potential_SOA *) malloc(sizeof(Potential_SOA)); lens_gpu[dev]->type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafe(cudaMalloc( (void**)&(lens_kernel[dev]), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafe(cudaMalloc( (void**)&(type_gpu[dev]), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_x_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_y_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafe(cudaMalloc( (void**)&(b0_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafe(cudaMalloc( (void**)&(angle_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafe(cudaMalloc( (void**)&(epot_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu[dev]), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu[dev]), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu[dev]), Ndevice[dev] *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); cudaStreamCreate(&stream[dev]); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); // Copy values to the GPU cudasafe(cudaMemcpyAsync(type_gpu[dev],lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafe(cudaMemcpyAsync(lens_x_gpu[dev],lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafe(cudaMemcpyAsync(lens_y_gpu[dev],lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy y_gpu: " ); cudasafe(cudaMemcpyAsync(b0_gpu[dev],lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafe(cudaMemcpyAsync(angle_gpu[dev],lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafe(cudaMemcpyAsync(epot_gpu[dev], lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafe(cudaMemcpyAsync(rcore_gpu[dev], lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ,stream[dev]),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafe(cudaMemcpyAsync(rcut_gpu[dev], lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafe(cudaMemcpyAsync(anglecos_gpu[dev], lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice,stream[dev] ),"Gradientgpu.cu : Copy anglecos: " ); cudasafe(cudaMemcpyAsync(anglesin_gpu[dev], lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy anglesin: " ); cudasafe(cudaMemcpyAsync(frame_gpu[dev], frame, sizeof(grid_param), cudaMemcpyHostToDevice,stream[dev]),"Gradientgpu.cu : Copy fame_gpu: " ); lens_gpu[dev]->type = type_gpu[dev]; lens_gpu[dev]->position_x = lens_x_gpu[dev]; lens_gpu[dev]->position_y = lens_y_gpu[dev]; lens_gpu[dev]->b0 = b0_gpu[dev]; lens_gpu[dev]->ellipticity_angle = angle_gpu[dev]; lens_gpu[dev]->ellipticity_potential = epot_gpu[dev]; lens_gpu[dev]->rcore = rcore_gpu[dev]; lens_gpu[dev]->rcut = rcut_gpu[dev]; lens_gpu[dev]->anglecos = anglecos_gpu[dev]; lens_gpu[dev]->anglesin = anglesin_gpu[dev]; cudaMemcpyAsync(lens_kernel[dev], lens_gpu[dev], sizeof(Potential_SOA), cudaMemcpyHostToDevice,stream[dev]); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); int nBlocks_gpu = 0; cudaDeviceProp properties_gpu; cudaGetDeviceProperties(&properties_gpu, dev); // Get properties of 0th GPU in use if (properties_gpu.maxThreadsDim[0]>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]); } else{ gradient_grid_kernel_multiple<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock,0,stream[dev]>>>(grid_grad_x_gpu[dev], grid_grad_y_gpu[dev],frame_gpu[dev],nhalos, nbgridcells, lens_kernel[dev], indexactual[dev],Ndevice[dev]); } } for (int dev = 0; dev < nDevices; dev++) { cudasafe(cudaMemcpyAsync( grid_grad_x + indexactual[dev], grid_grad_x_gpu[dev], Ndevice[dev] *sizeof(double),cudaMemcpyDeviceToHost ,stream[dev]),"Gradientgpu.cu : Copy source_x_gpu: " ); cudasafe(cudaMemcpyAsync( grid_grad_y + indexactual[dev], grid_grad_y_gpu[dev], Ndevice[dev] *sizeof(double), cudaMemcpyDeviceToHost,stream[dev]),"Gradientgpu.cu : Copy source_y_gpu: " ); } for (int dev = 0; dev < nDevices; dev++) { cudaSetDevice(dev); // Free GPU memory cudaFree(type_gpu[dev]); cudaFree(lens_x_gpu[dev]); cudaFree(lens_y_gpu[dev]); cudaFree(b0_gpu[dev]); cudaFree(angle_gpu[dev]); cudaFree(epot_gpu[dev]); cudaFree(rcore_gpu[dev]); cudaFree(rcut_gpu[dev]); cudaFree(anglecos_gpu[dev]); cudaFree(anglesin_gpu[dev]); cudaFree(grid_grad_x_gpu[dev]); cudaFree(grid_grad_y_gpu[dev]); cudaStreamDestroy(stream[dev]); } } #if 1 void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells, int indexactual, int Ncells ){ // GPU Property query int nBlocks_gpu = 0; // Define the number of threads per block the GPU will use cudaDeviceProp properties_gpu; cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use if (properties_gpu.maxThreadsDim[0]type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafe(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafe(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafe(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafe(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafe(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafe(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafe(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafe(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (nbgridcells) * (nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " ); // Copy values to the GPU cudasafe(cudaMemcpy(type_gpu,lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafe(cudaMemcpy(lens_x_gpu,lens->position_x , nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafe(cudaMemcpy(lens_y_gpu,lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); cudasafe(cudaMemcpy(b0_gpu,lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafe(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); cudasafe(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); // lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; // cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); // double time = -myseconds(); module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens, lens_kernel, nbgridcells, nhalos); // if (int((nbgridcells) * (nbgridcells)/threadsPerBlock) == 0) { gradient_grid_kernel<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); } else { //gradient_grid_kernel<<<(nbgridcells) * (nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel); //gradient_grid_kernel_v2<<>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel); //gradient_grid_kernel_v2<<>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, nhalos, nbgridcells, lens_kernel); if (int((Ncells)/threadsPerBlock) == 0){ gradient_grid_kernel_multiple<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells); } else{ gradient_grid_kernel_multiple<<<(Ncells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,nhalos, nbgridcells, lens_kernel, indexactual, Ncells); } #endif module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, lens, lens_kernel, nbgridcells, nhalos); cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (nbgridcells) * (nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " ); cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (nbgridcells) * (nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " ); //printf("%f %f \n",grid_grad_x[0],grid_grad_y[0]); // Free GPU memory cudaFree(lens_gpu); cudaFree(type_gpu); cudaFree(lens_x_gpu); cudaFree(lens_y_gpu); cudaFree(b0_gpu); cudaFree(angle_gpu); cudaFree(epot_gpu); cudaFree(rcore_gpu); cudaFree(rcut_gpu); cudaFree(anglecos_gpu); cudaFree(anglesin_gpu); cudaFree(grid_grad_x_gpu); cudaFree(grid_grad_y_gpu); /* for (int i = 0; i < nbgridcells; i++){ for(int j = 0; j < nbgridcells; j++){ printf(" %f",grid_grad_x[i*nbgridcells + j]); } printf("\n"); }*/ } __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) { //*grad_x = *grad_y = 0.; int bid=blockIdx.x; // index of the block (and of the set of images) int tid=threadIdx.x; // index of the thread within the block double dx,dy; //pixelsize int grid_dim, index; struct point image_point, Grad; dx = (frame->xmax - frame->xmin)/(nbgridcells-1); dy = (frame->ymax - frame->ymin)/(nbgridcells-1); grid_dim = (nbgridcells); index = bid * threadsPerBlock + tid ; while(index < grid_dim*grid_dim){ grid_grad_x[index] = 0.; grid_grad_y[index] = 0.; image_point.x = frame->xmin + (index/grid_dim)*dx; image_point.y = frame->ymin + (index % grid_dim)*dy; Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens); grid_grad_x[index] = Grad.x; grid_grad_y[index] = Grad.y; bid += gridDim.x; index = bid * threadsPerBlock + tid; } } -__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) { - +__global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens) +{ //*grad_x = *grad_y = 0.; - int bid = blockIdx.x; // index of the block (and of the set of images) + int bid = blockIdx.x; // index of the block (and of the set of images) int tid = threadIdx.x; // index of the thread within the block double dx,dy; //pixelsize int grid_dim, index; struct point image_point, Grad; // dx = (frame->xmax - frame->xmin)/(nbgridcells-1); dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // grid_dim = (nbgridcells); // int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; // index = col*nbgridcells + row ; // //while(index < grid_dim*grid_dim){ //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; image_point.x = frame->xmin + col*dx; image_point.y = frame->ymin + row*dy; Grad = module_potentialDerivatives_totalGradient_SOA_GPU(&image_point, lens, Nlens); grid_grad_x[index] = Grad.x; grid_grad_y[index] = Grad.y; bid += gridDim.x; index = bid * threadsPerBlock + tid; //} } __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_cur(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad, image_point; // grad.x = 0; grad.y = 0; // int col = blockIdx.x*blockDim.x + threadIdx.x; int row = blockIdx.y*blockDim.y + threadIdx.y; // if ((row < nbgridcells) && (col < nbgridcells)) { // int index = row*nbgridcells + col; // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // image_point.x = frame->xmin + col*dx; image_point.y = frame->ymin + row*dy; // for(int i = shalos; i < shalos + nhalos; i++) { struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // // positionning at the potential center // Change the origin of the coordinate system to the center of the clump // //@@if ((row == Ny) && (col == Nx)) printf("image_x = %f, %f image_y = %f, %f\n", image_point.x, frame->xmin, image_point.y,frame->ymin); double true_coord_x = image_point.x - __ldg(&lens->position_x[i]); double true_coord_y = image_point.y - __ldg(&lens->position_y[i]); //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n", true_coord_x, true_coord_y); // double cosi = __ldg(&lens->anglecos[i]); double sinu = __ldg(&lens->anglesin[i]); // double x = true_coord_x*cosi + true_coord_y*sinu; double y = true_coord_y*cosi - true_coord_x*sinu; // //if ((row == Ny) && (col == Nx)) printf("x = %f y = %f\n", x, y); // double eps = __ldg(&lens->ellipticity_potential[i]); // double sqe = sqrt(eps); // double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); // complex zci; complex znum, zden, zres; double norm; // zci.re = 0; zci.im = -0.5*(1. - eps*eps)/sqe; //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zci.re, zci.im); // double rc = __ldg(&lens->rcore[i]); double cx1 = (1. - eps)/(1. + eps); znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden //@@if ((col == Nx) && (row == Ny)) printf("norm = %f\n", norm); // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im); // norm = zis.re; // zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zis: %f %f\n", row, col, zis.re, zis.im); // zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) ) zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) ) // //@@if ((col == Nx) && (row == Ny)) printf("%d %d, zres: %f %f\n", row, col, zres.re, zres.im); // double b0 = __ldg(&lens->b0[i]); grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); //@@if ((col == Nx) && (row == Ny)) printf("grad: %f %f\n", grad.x, grad.y); } //IACA_END; // grid_grad_x[index] = grad.x; grid_grad_y[index] = grad.y; //if ((row == 0) && (col == 9)) //printf("%f %f: %f %f\n", image_point.x, image_point.y, grid_grad_x[index], grid_grad_y[index]); } } __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_SM2(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { // //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // double grad_x, grad_y; double clumpgrad_x, clumpgrad_y; double image_point_x, image_point_y; // SHARED double cosi [200]; SHARED double sinu [200]; SHARED double rc [200]; SHARED double b0 [200]; SHARED double epsi [200]; SHARED double position_x[200]; SHARED double position_y[200]; SHARED double rsqe [200]; SHARED double sonepeps [200]; SHARED double sonemeps [200]; // grad_x = 0; grad_y = 0; // int col = blockIdx.x*blockDim.x + threadIdx.x; int row = blockIdx.y*blockDim.y + threadIdx.y; int ithread = threadIdx.y*blockDim.x + threadIdx.x; // int index = row*nbgridcells + col; // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // image_point_x = frame->xmin + col*dx; image_point_y = frame->ymin + row*dy; // int i = ithread; if (i < nhalos) { cosi[i] = __ldg(&lens->anglecos [shalos + i]); sinu[i] = __ldg(&lens->anglesin [shalos + i]); position_x[i] = __ldg(&lens->position_x [shalos + i]); position_y[i] = __ldg(&lens->position_y [shalos + i]); rc[i] = __ldg(&lens->rcore [shalos + i]); b0[i] = __ldg(&lens->b0 [shalos + i]); epsi[i] = __ldg(&lens->ellipticity_potential[shalos + i]); - sonemeps[i] = 1 - epsi[i]; - sonepeps[i] = 1 + epsi[i]; + //sonemeps[i] = 1 - epsi[i]; + //sonepeps[i] = 1 + epsi[i]; rsqe[i] = sqrt(epsi[i]); } __syncthreads(); // if ((row < nbgridcells) && (col < nbgridcells)) { for(int i = 0; i < nhalos; i++) { // double true_coord_x = image_point_x - position_x[i]; double true_coord_y = image_point_y - position_y[i]; // double x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; double y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; // double eps = epsi[i]; //double onemeps = 1 - eps; //double onepeps = 1 + eps; // //double eps = epsi[i]; double onemeps = sonemeps[i]; double onepeps = sonepeps[i]; // //double sqe = sqrt(eps); double sqe = rsqe[i]; double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps); // complex zis; // double znum_re, znum_im; double zres_re, zres_im; double norm; double zden_re, zden_im; double zis_re, zis_im; // double zci_im = -0.5*(1. - eps*eps)/sqe; // double cx1 = onemeps/onepeps; // znum_re = cx1*x; znum_im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1; // zden_re = x; zden_im = 2.*rc[i]*sqe - y; // norm = (x*x + zden_im*zden_im); // zis = znum/zden zis.re = (znum_re*x + znum_im*zden_im)/norm; zis.im = (znum_im*x - znum_re*zden_im)/norm; // norm = zis.re; // zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) zis.im = atan2(zis.im, norm); // zres_re = - zci_im*zis.im; // Re( zci*ln(zis) ) zres_im = zci_im*zis.re; // Im( zci*ln(zis) ) // grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]); grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]); } // grid_grad_x[index] = grad_x; grid_grad_y[index] = grad_y; //__syncthreads(); } } // // // __global__ void module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos) { // //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // double grad_x, grad_y; double clumpgrad_x, clumpgrad_y; double image_point_x, image_point_y; // SHARED double cosi [200]; SHARED double sinu [200]; - SHARED double rc [200]; + SHARED double rci [200]; SHARED double b0 [200]; SHARED double epsi [200]; SHARED double position_x[200]; SHARED double position_y[200]; SHARED double rsqe [200]; + //SHARED double sgrad_x [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; + //SHARED double sgrad_y [(BLOCK_SIZE_X + 1)*BLOCK_SIZE_Y]; + //SHARED double sonepeps [200]; //SHARED double sonemeps [200]; // grad_x = 0; grad_y = 0; // int row = blockIdx.x*blockDim.x + threadIdx.x; - int grid_size = nbgridcells/blockDim.y; - int col =( blockIdx.y*blockDim.y + threadIdx.y)*grid_size; + int col = blockIdx.y*blockDim.y + threadIdx.y; + // + //int loc_row = threadIdx.x; + //int loc_col = threadIdx.y*blockDim.x + threadIdx.x; + // + //int grid_size = nbgridcells/blockDim.y; + // //if (threadIdx.x == 0) printf("%d %d %d: row = %d, col = %d, grid_size = %d\n", blockIdx.y, gridDim.y, threadIdx.y, row, col, grid_size); // - // - //grid_grad_x[index] = 0.; - //grid_grad_y[index] = 0.; - // - double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); - double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); + double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); + double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); //if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy); - // - image_point_x = frame->xmin + col*dx; - image_point_y = frame->ymin + row*dy; - // - int i = row; - if (threadIdx.x == 0) - for (int i = 0; i < nhalos; i += 1) - { - cosi[i] = __ldg(&lens->anglecos [shalos + i]); - sinu[i] = __ldg(&lens->anglesin [shalos + i]); - position_x[i] = __ldg(&lens->position_x [shalos + i]); - position_y[i] = __ldg(&lens->position_y [shalos + i]); - rc[i] = __ldg(&lens->rcore [shalos + i]); - b0[i] = __ldg(&lens->b0 [shalos + i]); - epsi[i] = __ldg(&lens->ellipticity_potential[shalos + i]); - rsqe[i] = sqrt(epsi[i]); - } - //if (threadIdx.x == 0) printf("%d %d\n", blockDim.y, gridDim.y); - __syncthreads(); - // + // + image_point_x = frame->xmin + col*dx; + image_point_y = frame->ymin + row*dy; + // + //int iloc = threadIdx.x*blockDim.y + threadIdx.y; + int iglob = row*nbgridcells + col; + int numThreads = blockDim.x*blockDim.y; + // + for (int i = 0; i < (nhalos + numThreads - 1)/numThreads; ++i) + { + int iloc = threadIdx.x*blockDim.y + threadIdx.y + i*numThreads; + if (iloc < nhalos) + { + cosi[iloc] = __ldg(&lens->anglecos [shalos + iloc]); + sinu[iloc] = __ldg(&lens->anglesin [shalos + iloc]); + position_x[iloc] = __ldg(&lens->position_x [shalos + iloc]); + position_y[iloc] = __ldg(&lens->position_y [shalos + iloc]); + rci[iloc] = __ldg(&lens->rcore [shalos + iloc]); + b0[iloc] = __ldg(&lens->b0 [shalos + iloc]); + epsi[iloc] = __ldg(&lens->ellipticity_potential[shalos + iloc]); + rsqe[iloc] = sqrt(epsi[iloc]); + } + } + __syncthreads(); + // if ((row < nbgridcells) && (col < nbgridcells)) { // - for(int icol = 0; icol < grid_size; ++icol) + for(int i = 0; i < nhalos; i++) { - int index = row*nbgridcells + col + icol; - grad_x = grad_y = 0.; + //int index = iloc; +#if 1 + double rc = rci[i]; + double eps = epsi[i]; + double onemeps = 1 - eps; + double onepeps = 1 + eps; + // + double sqe = rsqe[i]; + double cx1 = onemeps/onepeps; + // + // + //double x = true_coord_y*sinu[i]; + //double y = true_coord_y*cosi[i]; + double true_coord_y = image_point_y - position_y[i]; + double true_coord_x = image_point_x - position_x[i] /*+ icol*dx*/; + // + double zci_im; + zci_im = -0.5*(1. - eps*eps)/sqe; + double inv_onepeps = 1./(onepeps*onepeps); + double inv_onemeps = 1./(onemeps*onemeps); +#endif // - for(int i = 0; i < nhalos; i++) { - // - double true_coord_x = image_point_x + icol*dx - position_x[i]; - double true_coord_y = image_point_y - position_y[i]; - //if ((row == 1) && (col == 0)) printf("%d %d: %f %f\n", row, col, true_coord_x, true_coord_y); - // - double x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; - double y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; - // - double eps = epsi[i]; - double onemeps = 1 - eps; - double onepeps = 1 + eps; - //if ((row == 1) && (col == 0)) printf("i = %d, eps = %f\n", i, eps); - // - //double eps = epsi[i]; - //double onemeps = sonemeps[i]; - //double onepeps = sonepeps[i]; - // - //double sqe = sqrt(eps); - double sqe = rsqe[i]; - double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps); - // - complex zis; - // - double znum_re, znum_im; - double zres_re, zres_im; - double norm; - double zden_re, zden_im; - double zis_re, zis_im; - // - double zci_im = -0.5*(1. - eps*eps)/sqe; - // - double cx1 = onemeps/onepeps; - // - znum_re = cx1*x; - znum_im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1; - // - zden_re = x; - zden_im = 2.*rc[i]*sqe - y; - // - norm = (x*x + zden_im*zden_im); // zis = znum/zden - zis.re = (znum_re*x + znum_im*zden_im)/norm; - zis.im = (znum_im*x - znum_re*zden_im)/norm; - // - norm = zis.re; - // - zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) - zis.im = atan2(zis.im, norm); - // - zres_re = - zci_im*zis.im; // Re( zci*ln(zis) ) - zres_im = zci_im*zis.re; // Im( zci*ln(zis) ) - // - grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]); - grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]); + KERNEL_8_reg(0); + //KERNEL_8; } - // - grid_grad_x[index] += grad_x; - grid_grad_y[index] += grad_y; - } + /*{ + KERNEL_8_reg(BLOCK_SIZE_X/2); + grid_grad_x[iglob + BLOCK_SIZE_X/2] += grad_x; + grid_grad_y[iglob + BLOCK_SIZE_X/2] += grad_y; + }*/ // - //grid_grad_x[index] = grad_x; - //grid_grad_y[index] = grad_y; - //__syncthreads(); + } + grid_grad_x[iglob + 0] = grad_x; + grid_grad_y[iglob + 0] = grad_y; } } // // // __global__ - void +void +module_potentialDerivatives_totalGradient_8_SOA_GPU_SM4(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA + *lens, const struct grid_param *frame, int nbgridcells, int shalos, int nhalos/*, double* dtimer*/) +{ + // + //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); + // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 + // + double grad_x, grad_y; + double clumpgrad_x, clumpgrad_y; + double image_point_x, image_point_y; + // + // + double* cosi = &shared[0*nhalos]; + double* sinu = &shared[1*nhalos]; + double* rc = &shared[2*nhalos]; + double* b0 = &shared[3*nhalos]; + double* epsi = &shared[4*nhalos]; + double* position_x = &shared[5*nhalos]; + double* position_y = &shared[6*nhalos]; + double* rsqe = &shared[7*nhalos]; + + //SHARED double sonepeps [200]; + //SHARED double sonemeps [200]; + // + grad_x = 0; + grad_y = 0; + // + int grid_size = nbgridcells/gridDim.y; + int row = blockIdx.x*blockDim.x + threadIdx.x; + int col = (blockIdx.y*blockDim.y + threadIdx.y)/**grid_size*/; + // + // +#if 1 + //SHARED double sgrad_x [ + if (/*(threadIdx.x == 0) &&*/ (blockIdx.x == 0)) + { + if (threadIdx.x == 0) printf("blockDim.x = %d, blockIdx.x = %d grimdDim.x = %d threadIdx.x = %d\n", blockDim.x, blockIdx.x, gridDim.x, threadIdx.x); + if (threadIdx.x == 0) printf("blockDim.y = %d, blockIdx.y = %d grimdDim.y = %d threadIdx.y = %d\n", blockDim.y, blockIdx.y, gridDim.y, threadIdx.y); + if (threadIdx.x == 0) printf("row = %d, col = %d, grid_size = %d\n", row, col, grid_size); + } + __syncthreads(); +#endif + double* sgrad_x = &shared[8*nhalos]; + double* sgrad_y = &shared[8*nhalos + (grid_size + 1)*blockDim.x]; + // + // + //grid_grad_x[index] = 0.; + //grid_grad_y[index] = 0.; + // + double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); + double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); + // + //if (threadIdx.x == 0) printf("dx = %f, dy = %f\n", dx, dy); + // + image_point_x = frame->xmin + col*dx; + image_point_y = frame->ymin + row*dy; + return; + // + int i = 0; +#if 0 + for (; i < nhalos; i = i + blockDim.x) + { + int pos = threadIdx.x + i; + /*if ((threadIdx.x == 0) && (blockIdx.x == 0))*/ printf("pos = %d\n"); + __syncthreads(); + // + cosi[pos] = __ldg(&lens->anglecos [shalos + pos]); + sinu[pos] = __ldg(&lens->anglesin [shalos + pos]); + position_x[pos] = __ldg(&lens->position_x [shalos + pos]); + position_y[pos] = __ldg(&lens->position_y [shalos + pos]); + rc[pos] = __ldg(&lens->rcore [shalos + pos]); + b0[pos] = __ldg(&lens->b0 [shalos + pos]); + epsi[pos] = __ldg(&lens->ellipticity_potential[shalos + pos]); + rsqe[pos] = sqrt(epsi[i]); + // + } +#endif +#if 0 + if (threadIdx.x == 0) + for (; i < nhalos; i += 1) + { + cosi[i] = __ldg(&lens->anglecos [shalos + i]); + sinu[i] = __ldg(&lens->anglesin [shalos + i]); + position_x[i] = __ldg(&lens->position_x [shalos + i]); + position_y[i] = __ldg(&lens->position_y [shalos + i]); + rc[i] = __ldg(&lens->rcore [shalos + i]); + b0[i] = __ldg(&lens->b0 [shalos + i]); + epsi[i] = __ldg(&lens->ellipticity_potential[shalos + i]); + rsqe[i] = sqrt(epsi[i]); + } +#endif + __syncthreads(); + //if ((row == col == 0)) printf("shared mem done...\n"); + // + if (row < nbgridcells) + { + //for(int icol = 0; icol < grid_size; ++icol){ + // if (col + icol < nbgridcells){ + //grad_x = grad_y = 0.; + int index = row*nbgridcells + col; + // + for(int i = 0; i < nhalos; i++) + { + int sindex = threadIdx.x*grid_size; +#if 0 + double eps = epsi[i]; + double onemeps = 1 - eps; + double onepeps = 1 + eps; + // + double sqe = rsqe[i]; + double cx1 = onemeps/onepeps; + // + // + //double x = true_coord_y*sinu[i]; + //double y = true_coord_y*cosi[i]; + double true_coord_y = image_point_y - position_y[i]; + double true_coord_x = image_point_x - position_x[i] /*+ icol*dx*/; + // + complex zci; + zci.im = -0.5*(1. - eps*eps)/sqe; + double inv_onepeps = 1./(onepeps*onepeps); + double inv_onemeps = 1./(onemeps*onemeps); +#endif + // + for(int icol = 0; icol < grid_size; ++icol) + { + if (col + icol < nbgridcells) + { +#if 0 + if ((row == 1) && (col == 1)) printf("%d %d: %f %f\n", row, col, true_coord_x, true_coord_y); + + true_coord_x = image_point_x - position_x[i] + icol*dx; + // + //x += true_coord_x*cosi[i]; + //y -= true_coord_x*sinu[i]; + double x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; + double y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; + // + //if ((row == 1) && (col == 0)) printf("i = %d, eps = %f\n", i, eps); + // + //double eps = epsi[i]; + //double onemeps = sonemeps[i]; + //double onepeps = sonepeps[i]; + // + //double sqe = sqrt(eps); + //double rem2 = x*x/(onepeps*onepeps) + y*y/(onemeps*onemeps); + double rem2 = x*x*inv_onepeps + y*y*inv_onemeps; + // + // + //double znum_re, znum_im; + //double zres_re, zres_im; + //double zden_re, zden_im; + //double zis_re, zis_im; + double norm; + // + complex zis; + complex znum; + complex zden; + complex zres; + // + //double cx1 = onemeps/onepeps; + // + znum.re = cx1*x; + znum.im = 2.*sqe*sqrt(rc[i]*rc[i] + rem2) - y/cx1; + // + zden.re = x; + zden.im = 2.*rc[i]*sqe - y; + // + norm = (x*x + zden.im*zden.im); // zis = znum/zden + zis.re = (znum.re*x + znum.im*zden.im)/norm; + zis.im = (znum.im*x - znum.re*zden.im)/norm; + // + norm = zis.re; + // + zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis) + zis.im = atan2(zis.im, norm); + // + zres.re = - zci.im*zis.im; // Re( zci*ln(zis) ) + zres.im = zci.im*zis.re; // Im( zci*ln(zis) ) + // + grid_grad_x[index] += b0[i]*(zres.re*cosi[i] - zres.im*sinu[i]); + grid_grad_y[index] += b0[i]*(zres.im*cosi[i] + zres.re*sinu[i]); +#endif + //sgrad_x[sindex] += (float) sindex; + sgrad_y[sindex] += (float) -sindex; + //sindex++; + // + //grid_grad_x[index] += grad_x; + //grid_grad_y[index] += grad_y; + } + // + } + } + __syncthreads(); + return; + // +#if 0 + int sindex = threadIdx.x*grid_size; + for(int icol = 0; icol < grid_size; ++icol) + { + if (col + icol < nbgridcells) + { + grid_grad_x[index + col] = sgrad_x[sindex]; + grid_grad_y[index + col] = sgrad_y[sindex]; + sindex++; + } + } + __syncthreads(); +#endif + } +} + + + +__global__ +void module_potentialDerivatives_totalGradient_8_SOA_GPU_v2(double *grid_grad_x, double *grid_grad_y, const struct Potential_SOA *lens, const struct grid_param *frame, int nbgridcells, int i, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SOA begins"); // 6 DP loads, i.e. 48 Bytes: position_x, position_y, ellipticity_angle, ellipticity_potential, rcore, b0 // struct point grad, clumpgrad, image_point; grad.x = 0; grad.y = 0; // int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; // if ((row < nbgridcells) && (col < nbgridcells)) { // int index = col*nbgridcells + row; // //grid_grad_x[index] = 0.; //grid_grad_y[index] = 0.; // double dx = (frame->xmax - frame->xmin)/(nbgridcells-1); double dy = (frame->ymax - frame->ymin)/(nbgridcells-1); // #if 0 /*SHARED*/ double img_pt[2]; if ((row == 0) && (col == 0)) { img_pt[0] = frame->xmin + col*dx; img_pt[1] = frame->ymin + row*dy; } __syncthreads(); #else image_point.x = frame->xmin + col*dx; image_point.y = frame->ymin + row*dy; #endif // // //for(int i = shalos; i < shalos + nhalos; i++) //{ //IACA_START; // struct point true_coord, true_coord_rot; //, result; //double R, angular_deviation; complex zis; // //result.x = result.y = 0.; // #if 0 true_coord.x = img_pt[0] - __ldg(&lens->position_x[i]); true_coord.y = img_pt[1] - __ldg(&lens->position_y[i]); #else true_coord.x = image_point.x - __ldg(&lens->position_x[i]); true_coord.y = image_point.y - __ldg(&lens->position_y[i]); #endif double cosi = __ldg(&lens->anglecos[i]); double sinu = __ldg(&lens->anglesin[i]); // positionning at the potential center // Change the origin of the coordinate system to the center of the clump double x = true_coord.x*cosi + true_coord.y*sinu; double y = true_coord.y*cosi - true_coord.x*sinu; // double eps = __ldg(&lens->ellipticity_potential[i]); // double sqe = sqrt(eps); // double rem2 = x*x/((1. + eps)*(1. + eps)) + y*y/((1. - eps)*(1. - eps)); // complex zci; complex znum, zden, zres; double norm; // zci.im = -0.5*(1. - eps*eps)/sqe; // double rc = __ldg(&lens->rcore[i]); double cx1 = (1. - eps)/(1. + eps); znum.re = cx1*x; znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; // zden.re = x; zden.im = 2.*rc*sqe - y; norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden // zis.re = (znum.re*zden.re + znum.im*zden.im)/norm; zis.im = (znum.im*zden.re - znum.re*zden.im)/norm; // double b0 = __ldg(&lens->b0[i]); grad.x += b0*(zres.re*cosi - zres.im*sinu); grad.y += b0*(zres.im*cosi + zres.re*sinu); // grid_grad_x[index] += grad.x; grid_grad_y[index] += grad.y; } - } +} /* typedef struct point (*halo_func_GPU_t) (const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); __constant__ halo_func_GPU_t halo_func_GPU[100] = { 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_5_SOA_GPU, 0, 0, module_potentialDerivatives_totalGradient_8_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, module_potentialDerivatives_totalGradient_81_SOA_GPU, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; */ void module_potentialDerivatives_totalGradient_SOA_CPU_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens_cpu, const struct Potential_SOA *lens_gpu, int nbgridcells, int nhalos) - { - struct point grad, clumpgrad; - // - grad.x = clumpgrad.x = 0.; - grad.y = clumpgrad.y = 0.; - // - int shalos = 0; - // - int GRID_SIZE_X = (nbgridcells + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks - int GRID_SIZE_Y = 1; - // - { - //dim3 dimBlock(BLOCK_SIZE/1, BLOCK_SIZE); - //dim3 dimGrid (GRID_SIZE , GRID_SIZE ); - dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y); - dim3 dimGrid (GRID_SIZE_X , GRID_SIZE_Y); - // - int count = nhalos; - printf("nhalos = %d\n", nhalos); - // - cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double)); - cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double)); - // - module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); - } - // - /* - { - dim3 dimBlock(BLOCK_SIZE/1, BLOCK_SIZE/2); - dim3 dimGrid(GRID_SIZE, GRID_SIZE); - // - int count = nhalos; - cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double)); - cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double)); - // - module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); - } - */ - // - cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU"); - // - //cudaDeviceSynchronize(); - //module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); - //grad.x += clumpgrad.x; - //grad.y += clumpgrad.y; +{ + struct point grad, clumpgrad; + // + grad.x = clumpgrad.x = 0.; + grad.y = clumpgrad.y = 0.; + // + int shalos = 0; + // + int GRID_SIZE_X = (nbgridcells + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks + int GRID_SIZE_Y = (nbgridcells + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; + printf("grid_size_x = %d, grid_size_y = %d\n", GRID_SIZE_X, GRID_SIZE_Y); + // + double* timer = (double *) malloc((int) nbgridcells*nbgridcells*sizeof(double)); + double* dtimer; + //cudasafe(cudaMalloc( (void**)&(dtimer), (int) nbgridcells*nbgridcells*sizeof(double)),"Gradientgpu.cu : totalGradient_SOA_CPU_GPU: " ); + // + { + //dim3 dimBlock(BLOCK_SIZE_X/1, BLOCK_SIZE_Y); + //dim3 dimGrid (GRID_SIZE_X , GRID_SIZE_Y ); + dim3 threads(BLOCK_SIZE_Y, BLOCK_SIZE_X); + dim3 grid (GRID_SIZE_Y , GRID_SIZE_X); + // + int count = nhalos; + //printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (double) (8*nhalos + BLOCK_SIZE_X*nbgridcells/BLOCK_SIZE_Y)*sizeof(double)); + printf("nhalos = %d, size of shared memory = %lf\n", nhalos, (double) (8*nhalos + BLOCK_SIZE_X*BLOCK_SIZE_Y)*sizeof(double)); + // + cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double)); + cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double)); + // + module_potentialDerivatives_totalGradient_8_SOA_GPU_SM3<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); + //module_potentialDerivatives_totalGradient_8_SOA_GPU_SM4<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count/*, dtimer*/); + cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU_8_SOA_GPU_SM4"); + + } + cudaDeviceSynchronize(); + printf("GPU kernel done... ");fflush(stdout); + //cudasafe(cudaMemcpy( timer, dtimer, nbgridcells*nbgridcells*sizeof(double), cudaMemcpyDeviceToHost ),"Gradientgpu.cu : dtimer memcpy " ); + + // + /* + { + dim3 dimBlock(BLOCK_SIZE/1, BLOCK_SIZE/2); + dim3 dimGrid(GRID_SIZE, GRID_SIZE); + // + int count = nhalos; + cudaMemset(grid_grad_x, 0, nbgridcells*nbgridcells*sizeof(double)); + cudaMemset(grid_grad_y, 0, nbgridcells*nbgridcells*sizeof(double)); + // + module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); + } + */ + // + cudasafe(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_CPU_GPU"); + // + printf("\n"); + //module_potentialDerivatives_totalGradient_8_SOA_GPU_SM<<>> (grid_grad_x, grid_grad_y, lens_gpu, frame, nbgridcells, shalos, count); + //grad.x += clumpgrad.x; + //grad.y += clumpgrad.y; - // - // - /* - while (shalos < nhalos) - { - - int lens_type = lens_cpu->type[shalos]; - int count = 1; - while (lens_cpu->type[shalos + count] == lens_type) count++; - //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; - //printf ("%d %d %d \n",lens_type,count,shalos); - // - clumpgrad = (*halo_func_GPU[lens_type]<<>> )(lens_gpu, shalos, count); - // - grad.x += clumpgrad.x; - grad.y += clumpgrad.y; - shalos += count; - } + // + // + /* + while (shalos < nhalos) + { - return(grad); - */ - } + int lens_type = lens_cpu->type[shalos]; + int count = 1; + while (lens_cpu->type[shalos + count] == lens_type) count++; + //std::cerr << "type = " << lens_type << " " << count << " " << shalos << std::endl; + //printf ("%d %d %d \n",lens_type,count,shalos); + // + clumpgrad = (*halo_func_GPU[lens_type]<<>> )(lens_gpu, shalos, count); + // + grad.x += clumpgrad.x; + grad.y += clumpgrad.y; + shalos += count; + } + + return(grad); + */ +} diff --git a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh index 575df16..cae430e 100644 --- a/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh +++ b/Benchmarks/GridGradientBenchmark/grid_gradient_GPU.cuh @@ -1,30 +1,81 @@ /* * gradientgpu.cuh * * Created on: Nov 29, 2016 * Author: cerschae */ #ifndef GRID_GRADIENT_GPU_CUH_ #define GRID_GRADIENT_GPU_CUH_ #include "cudafunctions.cuh" #include "gradient_GPU.cuh" #include void gradient_grid_GPU_sorted(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens,int nbgridcells); void gradient_grid_pinned(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell); void gradient_grid_pinned_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens,int nbgridcell); void gradient_grid_GPU_sub(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos,int nbgridcells, int indexactual, int Ncells ); void gradient_grid_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells); __global__ void gradient_grid_kernel(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens); __global__ void gradient_grid_kernel_v2(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcells, const struct Potential_SOA *lens); __global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut); __global__ void gradient_grid_sis_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut); __global__ void gradient_grid_piemd_GPU_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens,int nbgridcell, int Ndevice, int indexactual, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut); __global__ void gradient_grid_kernel_multiple(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int indexactual, int ncells); __device__ static double atomicAdd_double(double* address, double val); #endif /* GRADIENTGPU_CUH_ */ + + +#define KERNEL_8 \ +double x = true_coord_x*cosi[i] + true_coord_y*sinu[i]; \ +double y = true_coord_y*cosi[i] - true_coord_x*sinu[i]; \ +double rem2 = x*x*inv_onepeps + y*y*inv_onemeps; \ +double norm; \ +complex zis; \ +complex znum; \ +complex zden; \ +complex zres; \ +znum.re = cx1*x; \ +znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; \ +zden.re = x; \ +zden.im = 2.*rc*sqe - y; \ +norm = (x*x + zden.im*zden.im); \ +zis.re = (znum.re*x + znum.im*zden.im)/norm; \ +zis.im = (znum.im*x - znum.re*zden.im)/norm; \ +norm = zis.re; \ +zis.re = log(sqrt(norm*norm + zis.im*zis.im)); \ +zis.im = atan2(zis.im, norm); \ +zres.re = - zci_im*zis.im; \ +zres.im = zci_im*zis.re; \ +grad_x += b0[i]*(zres.re*cosi[i] - zres.im*sinu[i]); \ +grad_y += b0[i]*(zres.im*cosi[i] + zres.re*sinu[i]); + + +#define KERNEL_8_reg(X) \ +double x = (true_coord_x + X*dx)*cosi[i] + true_coord_y*sinu[i]; \ +double y = true_coord_y*cosi[i] - (true_coord_x + X*dx)*sinu[i]; \ +double rem2 = x*x*inv_onepeps + y*y*inv_onemeps; \ +double norm; \ +double zis_re, zis_im; \ +double znum_re, znum_im; \ +double /*zden_re,*/ zden_im; \ +double zres_re, zres_im; \ +znum_re = cx1*x; \ +znum_im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1; \ +/*zden_re = x;*/ \ +zden_im = 2.*rc*sqe - y; \ +norm = (x*x + zden_im*zden_im); \ +zis_re = (znum_re*x + znum_im*zden_im)/norm; \ +zis_im = (znum_im*x - znum_re*zden_im)/norm; \ +norm = zis_re; \ +zis_re = log(sqrt(norm*norm + zis_im*zis_im)); \ +zis_im = atan2(zis_im, norm); \ +zres_re = - zci_im*zis_im; \ +zres_im = zci_im*zis_re; \ +grad_x += b0[i]*(zres_re*cosi[i] - zres_im*sinu[i]); \ +grad_y += b0[i]*(zres_im*cosi[i] + zres_re*sinu[i]); \ +\ diff --git a/Benchmarks/GridGradientBenchmark/main.cpp b/Benchmarks/GridGradientBenchmark/main.cpp index 4910e34..6879e75 100644 --- a/Benchmarks/GridGradientBenchmark/main.cpp +++ b/Benchmarks/GridGradientBenchmark/main.cpp @@ -1,331 +1,331 @@ /** * @file main.cpp * @Author Christoph Schaaefer, EPFL (christophernstrerne.schaefer@epfl.ch) * @date October 2016 * @brief Benchmark for gradhalo function */ #include #include #include #include #include #include #include // #include // #include #include #include "timer.h" #include "gradient.hpp" #include "chi_CPU.hpp" #include "module_cosmodistances.h" #include "module_readParameters.hpp" #include "grid_gradient_GPU.cuh" //#include int module_readCheckInput_readInput(int argc, char *argv[]) { /// check if there is a correct number of arguments, and store the name of the input file in infile char* infile; struct stat file_stat; // If we do not have 3 arguments, stop if ( argc != 3 ) { fprintf(stderr, "\nUnexpected number of arguments\n"); fprintf(stderr, "\nUSAGE:\n"); fprintf(stderr, "lenstool input_file output_directorypath [-n]\n\n"); exit(-1); } else if ( argc == 3 ) infile=argv[1]; std::ifstream ifile(infile,std::ifstream::in); // Open the file int ts = (int) time (NULL); char buffer[10]; std::stringstream ss; ss << ts; std::string trimstamp = ss.str(); // std::string outdir = argv[2]; outdir += "-"; outdir += trimstamp; std::cout << outdir << std::endl; // check whether the output directory already exists if (stat(outdir.c_str(), &file_stat) < 0){ mkdir(outdir.c_str(), S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH ); } else { printf("Error : Directory %s already exists. Specify a non existing directory.\n",argv[2]); exit(-1); } // check whether the input file exists. If it could not be opened (ifile = 0), it does not exist if(ifile){ ifile.close(); } else{ printf("The file %s does not exist, please specify a valid file name\n",infile); exit(-1); } return 0; } int main(int argc, char *argv[]) { // Setting Up the problem //=========================================================================================================== // This module function reads the terminal input when calling LENSTOOL and checks that it is correct // Otherwise it exits LENSTOOL module_readCheckInput_readInput(argc, argv); // This module function reads the cosmology parameters from the parameter file // Input: struct cosmologicalparameters cosmology, parameter file // Output: Initialized cosmology struct cosmo_param cosmology; // Cosmology struct to store the cosmology data from the file std::string inputFile = argv[1]; // Input file module_readParameters_readCosmology(inputFile, cosmology); // This module function reads the runmode paragraph and the number of sources, arclets, etc. in the parameter file. // The runmode_param stores the information of what exactly the user wants to do with lenstool. struct runmode_param runmode; module_readParameters_readRunmode(inputFile, &runmode); module_readParameters_debug_cosmology(runmode.debug, cosmology); module_readParameters_debug_runmode(runmode.debug, runmode); //=== Declaring variables struct grid_param frame; struct galaxy images[runmode.nimagestot]; struct galaxy sources[runmode.nsets]; struct Potential lenses[runmode.nhalos+runmode.npotfile-1]; struct Potential_SOA lenses_SOA_table[NTYPES]; struct Potential_SOA lenses_SOA; struct cline_param cline; struct potfile_param potfile; struct Potential potfilepotentials[runmode.npotfile]; struct potentialoptimization host_potentialoptimization[runmode.nhalos]; int nImagesSet[runmode.nsets]; // Contains the number of images in each set of images // This module function reads in the potential form and its parameters (e.g. NFW) // Input: input file // Output: Potentials and its parameters module_readParameters_Potential(inputFile, lenses, runmode.nhalos); //Converts to SOA //module_readParameters_PotentialSOA(inputFile, lenses, lenses_SOA, runmode.Nlens); module_readParameters_PotentialSOA(inputFile, lenses, &lenses_SOA, runmode.nhalos); //module_readParameters_PotentialSOA_nonsorted(inputFile, lenses, &lenses_SOA_nonsorted, runmode.nhalos); module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos); //std::cerr << lenses_SOA[1].b0[0] << " " << lenses[0].b0 << std::endl; // This module function reads in the potfiles parameters // Input: input file // Output: Potentials from potfiles and its parameters if (runmode.potfile == 1 ) { module_readParameters_readpotfiles_param(inputFile, &potfile); module_readParameters_debug_potfileparam(runmode.debug, &potfile); module_readParameters_readpotfiles(&runmode,&potfile,lenses); module_readParameters_debug_potential(runmode.debug, lenses, runmode.nhalos+runmode.npotfile); } // This module function reads in the grid form and its parameters // Input: input file // Output: grid and its parameters module_readParameters_Grid(inputFile, &frame); if (runmode.image == 1 or runmode.inverse == 1 or runmode.time > 0){ // This module function reads in the strong lensing images module_readParameters_readImages(&runmode, images, nImagesSet); //runmode.nsets = runmode.nimagestot; for(int i = 0; i < runmode.nimagestot; ++i){ images[i].dls = module_cosmodistances_objectObject(lenses[0].z, images[i].redshift, cosmology); images[i].dos = module_cosmodistances_observerObject(images[i].redshift, cosmology); images[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, images[i].redshift, cosmology); } module_readParameters_debug_image(runmode.debug, images, nImagesSet,runmode.nsets); } if (runmode.inverse == 1){ // This module function reads in the potential optimisation limits module_readParameters_limit(inputFile,host_potentialoptimization,runmode.nhalos); module_readParameters_debug_limit(runmode.debug, host_potentialoptimization[0]); } if (runmode.source == 1){ //Initialisation to default values.(Setting sources to z = 1.5 default value) for(int i = 0; i < runmode.nsets; ++i){ sources[i].redshift = 1.5; } // This module function reads in the strong lensing sources module_readParameters_readSources(&runmode, sources); //Calculating cosmoratios for(int i = 0; i < runmode.nsets; ++i){ sources[i].dls = module_cosmodistances_objectObject(lenses[0].z, sources[i].redshift, cosmology); sources[i].dos = module_cosmodistances_observerObject(sources[i].redshift, cosmology); sources[i].dr = module_cosmodistances_lensSourceToObserverSource(lenses[0].z, sources[i].redshift, cosmology); } module_readParameters_debug_source(runmode.debug, sources, runmode.nsets); } std::cout << "--------------------------" << std::endl << std::endl; double t_1,t_2,t_3,t_4; // Lenstool-CPU Grid-Gradient //=========================================================================================================== //Setting Test: double dx, dy; int grid_dim = runmode.nbgridcells; dx = (frame.xmax - frame.xmin)/(runmode.nbgridcells-1); dy = (frame.ymax - frame.ymin)/(runmode.nbgridcells-1); point test_point1_1,test_point2_2, test_result1_1,test_result2_2,test_pointN_N, test_resultN_N; double dlsds = images[0].dr; test_point1_1.x = frame.xmin; test_point1_1.y = frame.ymin; test_point2_2.x = frame.xmin + dx; test_point2_2.y = frame.ymin + dy; test_pointN_N.x = frame.xmin + ((runmode.nbgridcells*runmode.nbgridcells-1)/runmode.nbgridcells)*dx; test_pointN_N.y = frame.ymin + ((runmode.nbgridcells*runmode.nbgridcells-1) % runmode.nbgridcells)*dy; double* grid_gradient_x_cpu = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); double* grid_gradient_y_cpu = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); memset(grid_gradient_x_cpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); memset(grid_gradient_y_cpu, 0, (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); std::cout << " CPU Test... "; // t_1 = -myseconds(); //test_result1_1 = module_potentialDerivatives_totalGradient_SOA(&test_point1_1, &lenses_SOA, runmode.nhalos); //test_result2_2 = module_potentialDerivatives_totalGradient_SOA(&test_point2_2, &lenses_SOA, runmode.nhalos); //test_resultN_N = module_potentialDerivatives_totalGradient_SOA(&test_pointN_N, &lenses_SOA, runmode.nhalos); gradient_grid_CPU(grid_gradient_x_cpu, grid_gradient_y_cpu, &frame, &lenses_SOA, runmode.nhalos, grid_dim); t_1 += myseconds(); // std::cout << " Time " << std::setprecision(15) << t_1 << std::endl; // GPU test std::cout << " GPU Test... "; double *grid_gradient_x, *grid_gradient_y; - grid_gradient_x = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); - grid_gradient_y = (double *)malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); + grid_gradient_x = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); + grid_gradient_y = (double *) malloc((int) (runmode.nbgridcells) * (runmode.nbgridcells) * sizeof(double)); #if 0 t_1 = -myseconds(); //Packaging the image to sourceplane conversion gradient_grid_CPU(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim); t_1 += myseconds(); std::cout << " gradient_grid_CPU Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_1 << std::endl; #endif -#if 1 t_2 = -myseconds(); gradient_grid_GPU_sorted(grid_gradient_x, grid_gradient_y, &frame, &lenses_SOA, runmode.nhalos, grid_dim); t_2 += myseconds(); std::cout << " Time " << std::setprecision(15) << t_2 << std::endl; -#endif +#if 1 double norm_x = 0.; double norm_y = 0.; for (int ii = 0; ii < grid_dim*grid_dim; ++ii) { double g_x = grid_gradient_x[ii]; double g_y = grid_gradient_y[ii]; // double c_x = grid_gradient_x_cpu[ii]; double c_y = grid_gradient_y_cpu[ii]; // //if (!(ii%grid_dim)) std::cout << std::endl; - ///std::cout << ii << " " << g_x << " " << c_x << " " << g_y << " " << c_y << std::endl; + // std::cout << ii << " " << g_x << " " << c_x << " " << g_y << " " << c_y << std::endl; // norm_x += (grid_gradient_x[ii] - grid_gradient_x_cpu[ii])*(grid_gradient_x[ii] - grid_gradient_x_cpu[ii]); norm_y += (grid_gradient_y[ii] - grid_gradient_y_cpu[ii])*(grid_gradient_y[ii] - grid_gradient_y_cpu[ii]); } std::cout << " l2 difference norm = " << std::setprecision(15) << norm_x << " " << std::setprecision(15) << norm_y << std::endl; +#endif #if 1 t_3 = -myseconds(); gradient_grid_GPU_multiple(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim); t_3 += myseconds(); std::cout << " gradient_grid_GPU_multiple Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Test 3: " << std::endl; std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_3 << std::endl; #endif #if 1 t_4 = -myseconds(); gradient_grid_GPU_sub(grid_gradient_x,grid_gradient_y,&frame,&lenses_SOA,runmode.nhalos,grid_dim,0,grid_dim*grid_dim); t_4 += myseconds(); std::cout << " gradient_grid_GPU_sub Brute Force Benchmark " << std::endl; std::cout << " Test 1: " << std::endl; std::cout << " Point 1 : " << std::setprecision(5) << test_point1_1.x << " "<< test_point1_1.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[0] << " "<< grid_gradient_y[0] << std::endl; std::cout << " Test 2: " << std::endl; std::cout << " Point 2 : " << std::setprecision(5) << test_point2_2.x << " "<< test_point2_2.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells+1] << " "<< grid_gradient_y[runmode.nbgridcells+1] << std::endl; std::cout << " Test 3: " << std::endl; std::cout << " Point 3 : " << std::setprecision(5) << test_pointN_N.x << " "<< test_pointN_N.y << std::endl; std::cout << " Gradient " << std::setprecision(5) << grid_gradient_x[runmode.nbgridcells*runmode.nbgridcells-1] << " "<< grid_gradient_y[runmode.nbgridcells*runmode.nbgridcells-1] << std::endl; std::cout << " Time " << std::setprecision(15) << t_4 << std::endl; #endif }