diff --git a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp index e7e633a..a922194 100644 --- a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp +++ b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.cpp @@ -1,420 +1,428 @@ #include <stdlib.h> #include <iomanip> #include "grid_gradient_mixed_CPU.hpp" #include "structure_mixed.hpp" //#include <iacaMarks.h> extern "C"{ double mysecond() { struct timeval tp; struct timezone tzp; // int i = gettimeofday(&tp,&tzp); // return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } } // // // //#define SIS_THRESHOLD 0.0092 // // // //struct point_double module_potentialDerivatives_totalGradient_5_SOA_DP_v2(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo = false) //inline struct point_double module_potentialDerivatives_totalGradient_5_SOA_DP_v2(const struct point_double *pImage, const struct Potential_Mixed<double>* lens, int shalos, int nhalos, int echo = false) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // double one = 1.; double threehalf = 1.5; double half = 0.5; // struct point_double grad, result; // grad.x = (double) 0.; grad.y = (double) 0.; //int i = 0*shalos; // //IACA_START; for(int i = shalos; i < shalos + nhalos; i++) { // double b0 = lens->b0[i]; struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double cose = lens->anglecos[i]; double sine = lens->anglesin[i]; // double x = true_coord.x*cose + true_coord.y*sine; double y = true_coord.y*cose - true_coord.x*sine; // double ell_pot = lens->ellipticity_potential[i]; // double val = x*x*(one - ell_pot) + y*y*(one + ell_pot); // double R = 1.f/sqrtf(val); R = R*(threehalf - half*val*R*R); // result.x = (one - ell_pot)*b0*x*R; result.y = (one + ell_pot)*b0*y*R; //double R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot)); // //result.x = (1 - ell_pot)*lens->b0[i]*x/R; //result.y = (1 + ell_pot)*lens->b0[i]*y/R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; //if (echo) printf("coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y); } //IACA_END; return grad; } // // // struct point_double module_potentialDerivatives_totalGradient_5_SOA_DP_v2(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo = false) { asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // double one = 1.; double threehalf = 1.5; double half = 0.5; // struct point_double grad, result; // grad.x = (double) 0.; grad.y = (double) 0.; // for(int i = shalos; i < shalos + nhalos; i++) { // double b0 = lens->b0[i]; struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double cose = lens->anglecos[i]; double sine = lens->anglesin[i]; // double x = true_coord.x*cose + true_coord.y*sine; double y = true_coord.y*cose - true_coord.x*sine; // double ell_pot = lens->ellipticity_potential[i]; // double val = x*x*(one - ell_pot) + y*y*(one + ell_pot); // double R = 1.f/sqrtf(val); R = R*(threehalf - half*val*R*R); // result.x = (one - ell_pot)*b0*x*R; result.y = (one + ell_pot)*b0*y*R; //double R = sqrt(x*x*(1 - ell_pot) + y*y*(1 + ell_pot)); // //result.x = (1 - ell_pot)*lens->b0[i]*x/R; //result.y = (1 + ell_pot)*lens->b0[i]*y/R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; //if (echo) printf("coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y); } return grad; } // // // //void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart) void gradient_grid_general_double_CPU(double* __restrict__ grid_grad_x, double* __restrict__ grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_Mixed<double> *lens, int istart, int jstart) { //printf("gradient_grid_general_double_CPU Potential_Mixed<double> %d %d\n", istart, jstart); fflush(stdout); const int grid_dim = nbgridcells; // //const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); //const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1); // #pragma omp parallel #pragma omp for for (int jj = jstart; jj < nbgridcells; ++jj) for (int ii = istart; ii < nbgridcells; ++ii) { int index = jj*nbgridcells + ii; // point_double image_point; image_point.x = frame->xmin + ii*dx; image_point.y = frame->ymin + jj*dy; //Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, 0, Nlens); //printf("%d %d: %d ", ii, jj, index); point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&image_point, lens, 0, Nlens, (ii == 0) && (jj == 0)); //if (index == 0) std::cout << "** " << index << " " << ii << " " << jj << " " << std::setprecision(15) << image_point.x << " " << image_point.y << ": " << Grad_DP.x << " " << Grad_DP.y << std::endl; // grid_grad_x[index] = (double) Grad_DP.x; grid_grad_y[index] = (double) Grad_DP.y; // } } // // // void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart) { const int grid_dim = nbgridcells; // //const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells-1); //const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells-1); const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1); // #pragma omp parallel #pragma omp for for (int jj = jstart; jj < nbgridcells; ++jj) for (int ii = istart; ii < nbgridcells; ++ii) { int index = jj*nbgridcells + ii; // point_double image_point; image_point.x = frame->xmin + ii*dx; image_point.y = frame->ymin + jj*dy; //Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, 0, Nlens); //printf("%d %d: %d ", ii, jj, index); point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&image_point, lens, 0, Nlens, 0); //std::cout << "** " << index << " " << ii << " " << jj << " " << std::setprecision(15) << image_point.x << " " << image_point.y<< ": " << Grad_DP.x << " " << Grad_DP.y << std::endl; // grid_grad_x[index] = (double) Grad_DP.x; grid_grad_y[index] = (double) Grad_DP.y; // } } // // // void gradient_grid_euler(double* grid_grad_x, double* grid_grad_y, const double* grid_grad_temp_x, const double* grid_grad_temp_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count) //gradient_grid_euler(double* grid_grad_temp_x, double* grid_grad_temp_y, double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count) { int loc_count = 0; double time = -mysecond(); // { const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); #pragma omp parallel #pragma omp for reduction (+: loc_count) //#pragma omp single for (int jj = jstart + 1; jj < nbgridcells_y; ++jj) { grid_grad_x[jj*nbgridcells_y] = grid_grad_temp_x[jj*nbgridcells_y]; grid_grad_y[jj*nbgridcells_y] = grid_grad_temp_y[jj*nbgridcells_y]; //#pragma omp task for (int ii = istart + 1; ii < nbgridcells_x; ++ii) { { int index = (jj - 0)*nbgridcells_x + (ii - 0); int index_north = (jj - 1)*nbgridcells_x + (ii - 0); int index_west = (jj - 0)*nbgridcells_x + (ii - 1); // type_t grad_north_x = grid_grad_temp_x[index] - grid_grad_temp_x[index_north]; type_t grad_north_y = grid_grad_temp_y[index] - grid_grad_temp_y[index_north]; // type_t grad_west_x = grid_grad_temp_x[index] - grid_grad_temp_x[index_west]; type_t grad_west_y = grid_grad_temp_y[index] - grid_grad_temp_y[index_west]; // int c1 = fabs(dx - grad_north_x) < SIS_THRESHOLD; int c2 = fabs(dx - grad_west_x) < SIS_THRESHOLD; int c3 = fabs(dy - grad_north_y) < SIS_THRESHOLD; int c4 = fabs(dy - grad_west_y) < SIS_THRESHOLD; // - if (c1 || c2 || c3 || c4) + //if (c1 || c2 || c3 || c4) + if (c2 || c3) { struct point_double pImage; // pImage.x = (double) frame->xmin + ii*dx; pImage.y = (double) frame->ymin + jj*dy; // //point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&pImage, lens, 0, Nlens, ((ii == 1) && (jj == 175))); point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&pImage, lens, 0, Nlens, true); // grid_grad_x[index] = Grad_DP.x; grid_grad_y[index] = Grad_DP.y; loc_count++; //if (ii == 1) printf("%d %d: %f %f -> %f %f\n", ii, jj, pImage.x, pImage.y, Grad_DP.x, Grad_DP.y); } else { grid_grad_x[index] = grid_grad_temp_x[index]; grid_grad_y[index] = grid_grad_temp_y[index]; } } } } } *count = loc_count; //#pragma omp taskwait } // // // -void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count) +void gradient_grid_SIS_patch(double* __restrict__ grid_grad_sis_x, double* __restrict__ grid_grad_sis_y, double* __restrict__ grid_grad_x, double* __restrict__ grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count) //void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count) { // // SIS Potentials in double precision // printf(" ** CPU SIS patch\n"); fflush(stdout); *count = 0; for (int pot = 0; pot < Nlens; ++pot) { int patch_size = 20; if (lens->vdisp[pot] > 900.) { patch_size = 200; } const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // const int px = (int) ((lens->position_x[pot] - frame->xmin)/dx); const int py = (int) ((lens->position_y[pot] - frame->ymin)/dy); (*count)++; // int istart = px - patch_size/2; int jstart = py - patch_size/2; { const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); #pragma omp parallel #pragma omp for collapse(2) for (int jj = 0; jj < patch_size; ++jj) for (int ii = 0; ii < patch_size; ++ii) { // int index = (jj + jstart)*nbgridcells_x + (ii + istart); // struct point_double image_point_DP; image_point_DP.x = (double) frame->xmin + (ii + istart)*dx; image_point_DP.y = (double) frame->ymin + (jj + jstart)*dy; // point_double Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_v2(&image_point_DP, lens, 0, Nlens, 0); // // needs checking? // //printf("%.15f %.15f\n", Grad_DP.x, Grad_DP.y); - grid_grad_x[index] = Grad_DP.x; - grid_grad_y[index] = Grad_DP.y; + grid_grad_sis_x[index] = Grad_DP.x; + grid_grad_sis_y[index] = Grad_DP.y; } } } } + +void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count) +{ + gradient_grid_SIS_patch(grid_grad_x, grid_grad_y, grid_grad_x, grid_grad_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, count); +} + + // // // void gradient_grid_general_mixed_CPU(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { const int grid_dim = nbgridcells_x; // struct Potential_Mixed<double> lens_mixed; alloc(lens_mixed, Nlens); copy (lens_mixed, lens, Nlens); // //point image_point; // const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // double* grid_grad_temp_x = (double*) malloc(nbgridcells_x*nbgridcells_y*sizeof(double)); double* grid_grad_temp_y = (double*) malloc(nbgridcells_x*nbgridcells_y*sizeof(double)); double time; // //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d\n", dx, dy, nbgridcells_x, nbgridcells_y); // // Float computation // #if 1 int count = 0; time = -mysecond(); #pragma omp parallel #pragma omp for for (int jj = jstart; jj < nbgridcells_y; ++jj) for (int ii = istart; ii < nbgridcells_x; ++ii) { int index = jj*nbgridcells_x + ii; // struct point image_point; image_point.x = frame->xmin + (ii + istart)*dx; image_point.y = frame->ymin + (jj + jstart)*dy; // point Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens); // grid_grad_temp_x[index] = (double) Grad.x; grid_grad_temp_y[index] = (double) Grad.y; // } memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*sizeof(double)); memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_y*sizeof(double)); #endif time += mysecond(); printf("\n** Float computation: %f s.\n", time); // // // #ifdef __SIS #warning "SIS CPU" // // SIS Potentials in double precision // count = 0; time = -mysecond(); { gradient_grid_SIS_patch(grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); } time += mysecond(); printf("** %d SIS computations: %f s.\n", count, time); #endif // // Euler approximation // //#ifdef __EULER #if __EULER #warning "EULER CPU" //memset(grid_grad_x, nbgridcells_x*nbgridcells_y*sizeof(double)); //memset(grid_grad_y, nbgridcells_x*nbgridcells_y*sizeof(double)); count = 0; time = -mysecond(); { //gradient_grid_euler(grid_grad_temp_x, grid_grad_temp_y, grid_grad_x, grid_grad_y, frame, &lens_mixed, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); gradient_grid_euler(grid_grad_x, grid_grad_y, grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); } time += mysecond(); printf("** %d Euler computations: %f s.\n", count, time); #else memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*nbgridcells_y*sizeof(double)); memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_x*nbgridcells_y*sizeof(double)); #endif } void gradient_grid_mixed_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // gradient_grid_general_mixed_CPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, nbgridcells_x, nbgridcells_y, istart, jstart); // } diff --git a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp index e9d8450..a1d8ccf 100644 --- a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp +++ b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_CPU.hpp @@ -1,61 +1,63 @@ /* * grid_gradient_CPU.hpp * * Created on: Jan 12, 2017 * Author: cerschae */ #ifndef GRID_GRADIENT_CPU_HPP_ #define GRID_GRADIENT_CPU_HPP_ #include "structure_hpc.hpp" #include "structure_mixed.hpp" #include <iostream> #include <string.h> //#include <cuda_runtime.h> #include <math.h> #include <gradient_avx.hpp> #include <sys/time.h> #include <fstream> // #define SIS_THRESHOLD 0.0092 // // //#include <immintrin.h> // // struct point_double { double x; double y; }; // // // void gradient_grid_CPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells, int istart = 0, int jstart = 0); // // // void gradient_grid_CPU(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart = 0, int jstart = 0); // static void gradient_grid_general_CPU_old(type_t *grid_grad_x, type_t *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens); // void gradient_grid_mixed_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart); // void gradient_grid_general_mixed_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart); // void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_SOA *lens, int istart, int jstart); // void gradient_grid_general_double_CPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, int nbgridcells, const struct Potential_Mixed<double> *lens, int istart, int jstart); // -void gradient_grid_SIS_patch(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count); +void gradient_grid_SIS_patch(double* __restrict__ grid_grad_x, double* __restrict__grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count); +// +void gradient_grid_SIS_patch(double* __restrict__ grid_grad_sis_x, double* __restrict__ grid_grad_sis_y, double* __restrict__ grid_grad_x, double* __restrict__grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart, int* count); // // #endif /* GRID_GRADIENT_CPU_HPP_ */ diff --git a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu index acd63f2..f50bf20 100644 --- a/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu +++ b/Benchmarks/GridGradientBenchmarkMixed/grid_gradient_mixed_GPU.cu @@ -1,1393 +1,1417 @@ #include <stdlib.h> #include <iomanip> #include "grid_gradient_mixed_CPU.hpp" #include "grid_gradient_mixed_GPU.cuh" //#include "structure_mixed.hpp" #include "gradient_GPU.cuh" //#include "cudafunctions.h" // must be squared #define BLOCK_SIZE_X 16 #define BLOCK_SIZE_Y 16 // // #define CUDA_SAFE_CALL(call) \ do { \ cudaError_t err = call; \ if (cudaSuccess != err) { \ fprintf (stderr, "'%s' Cuda error in file '%s' in line %i : %s.", \ __FILE__, __LINE__, cudaGetErrorString(err) ); \ exit(EXIT_FAILURE); \ } \ } while (0) // // // __device__ __managed__ unsigned int gpu_count ; extern "C"{ double seconds() { struct timeval tp; struct timezone tzp; // int i = gettimeofday(&tp,&tzp); // return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } } // // // #define SIS_THRESHOLD 0.0092 // // // __device__ point module_potentialDerivatives_totalGradient_5_SOA_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos); __device__ point_double module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_Mixed<double> *lens, int shalos, int nhalos, int echo = 0); // // // __device__ point module_potentialDerivatives_totalGradient_5_SOA_SP_GPU(const struct point *pImage, const struct Potential_SOA *lens, int shalos, int nhalos) { //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point grad, result; grad.x = 0; grad.y = 0; for(int i = shalos; i < shalos + nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); // float cose = lens->anglecos[i]; float sine = lens->anglesin[i]; // float x = true_coord.x*cose + true_coord.y*sine; float y = true_coord.y*cose - true_coord.x*sine; // float ell_pot = lens->ellipticity_potential[i]; // float b0_inv_R = lens->b0[i]/sqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot))); // result.x = (1.f - ell_pot)*x*b0_inv_R; result.y = (1.f + ell_pot)*y*b0_inv_R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; } return grad; } __device__ point_double module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo) { //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point_double grad, result; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double cose = lens->anglecos[i]; double sine = lens->anglesin[i]; // double x = true_coord.x*cose + true_coord.y*sine; double y = true_coord.y*cose - true_coord.x*sine; // double ell_pot = lens->ellipticity_potential[i]; // double b0_inv_R; b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot)); // result.x = (1. - ell_pot)*x*b0_inv_R; result.y = (1. + ell_pot)*y*b0_inv_R; //if (echo) printf("x, y = %.15f %15f, lens = %.15f %15f, coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", pImage->x, pImage->y, lens->position_x[i], lens->position_y[i], true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y); // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; // } //if (echo) printf("grad = %.15f %.15f\n", grad.x, grad.y); return grad; } // // // Single Precision // __device__ point module_potentialDerivatives_totalGradient_5_SOA_SP_GPU(const struct point_double *pImage, const struct Potential_SOA *lens, int shalos, int nhalos, int echo = 0) { // asm volatile("# module_potentialDerivatives_totalGradient_5_SOA_SP_GPU begins"); // printf("# module_potentialDerivatives_totalGradient_5_SOA_SP_GPU begins\n"); // struct point grad, result; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { // struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // float cose = lens->anglecos[i]; float sine = lens->anglesin[i]; // float x = true_coord.x*cose + true_coord.y*sine; float y = true_coord.y*cose - true_coord.x*sine; // float ell_pot = lens->ellipticity_potential[i]; // float b0_inv_R = lens->b0[i]/sqrtf(x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot)); // result.x = (1.f - ell_pot)*x*b0_inv_R; result.y = (1.f + ell_pot)*y*b0_inv_R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; } return grad; } // __global__ void gradient_grid_general_SP_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { // struct point grad; struct point pImage; // int col = blockIdx.x*blockDim.x + threadIdx.x; if (col > nbgridcells_x) return; int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nbgridcells_y) return; // int index = row*nbgridcells_x + col; // const float dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const float dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // pImage.x = frame->xmin + (col + istart)*dx; pImage.y = frame->ymin + (row + jstart)*dy; // #if 0 grad = module_potentialDerivatives_totalGradient_5_SOA_SP_GPU(&pImage, lens, 0, nhalos); #else struct point result; grad.x = 0; grad.y = 0; // for(int i = 0; i < nhalos; i++) { // struct point true_coord, true_coord_rotation; // true_coord.x = pImage.x - lens->position_x[i]; true_coord.y = pImage.y - lens->position_y[i]; // float cose = lens->anglecos[i]; float sine = lens->anglesin[i]; // float x = true_coord.x*cose + true_coord.y*sine; float y = true_coord.y*cose - true_coord.x*sine; // float ell_pot = lens->ellipticity_potential[i]; // //float b0_inv_R = lens->b0[i]/sqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot))); //float b0_inv_R = lens->b0[i]/__fsqrt_rn((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot))); float b0_inv_R = lens->b0[i]*rsqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot))); // result.x = (1.f - ell_pot)*x*b0_inv_R; result.y = (1.f + ell_pot)*y*b0_inv_R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; } #endif // grid_grad_x[index] = (double) grad.x; grid_grad_y[index] = (double) grad.y; // } // // // __global__ void gradient_grid_euler_GPU(double* grid_grad_euler_x, double* grid_grad_euler_y, const double* grid_grad_temp_x, const double* grid_grad_temp_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { int loc_count = 0; //*count = 100; //return; //double time = -second(); // #if 1 int ii = blockIdx.x*blockDim.x + threadIdx.x; int jj = blockIdx.y*blockDim.y + threadIdx.y; + if ((ii == istart) || (jj == jstart)) + { + int index = (jj - 0)*nbgridcells_x + (ii - 0); + grid_grad_euler_x[index] = grid_grad_temp_x[index]; + grid_grad_euler_y[index] = grid_grad_temp_y[index]; + return; + } // if (((ii < nbgridcells_x) && (jj < nbgridcells_y) && (ii > istart) && (jj > jstart))) { const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); { { { int index = (jj - 0)*nbgridcells_x + (ii - 0); int index_north = (jj - 1)*nbgridcells_x + (ii - 0); int index_west = (jj - 0)*nbgridcells_x + (ii - 1); // type_t grid_x = grid_grad_temp_x[index]; type_t grid_y = grid_grad_temp_y[index]; // type_t grad_north_x = grid_x - grid_grad_temp_x[index_north]; type_t grad_north_y = grid_y - grid_grad_temp_y[index_north]; // type_t grad_west_x = grid_x - grid_grad_temp_x[index_west]; type_t grad_west_y = grid_y - grid_grad_temp_y[index_west]; // int c1 = fabs(dx - grad_north_x) < SIS_THRESHOLD; int c2 = fabs(dx - grad_west_x) < SIS_THRESHOLD; int c3 = fabs(dy - grad_north_y) < SIS_THRESHOLD; int c4 = fabs(dy - grad_west_y) < SIS_THRESHOLD; // - if (c1 || c2 || c3 || c4) + //if (c1 || c2 || c3 || c4) + if (c2 || c3) { - //atomicAdd(&gpu_count, 1); + atomicAdd(&gpu_count, 1); //printf("count = \n", count); struct point_double pImage; // pImage.x = (double) frame->xmin + ii*dx; pImage.y = (double) frame->ymin + jj*dy; struct point_double grad, result; // point_double Grad_DP; Grad_DP = module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(&pImage, lens, 0, Nlens, ((ii == 1) && (jj == 175))); grid_grad_euler_x[index] = Grad_DP.x; grid_grad_euler_y[index] = Grad_DP.y; // } else { - //grid_grad_euler_x[index] = grid_grad_temp_x[index]; - //grid_grad_euler_y[index] = grid_grad_temp_y[index]; + grid_grad_euler_x[index] = grid_grad_temp_x[index]; + grid_grad_euler_y[index] = grid_grad_temp_y[index]; } } } } } //#pragma omp taskwait #endif } // // // void gradient_grid_general_mixed_GPU(double* __restrict__ grid_grad_x, double* __restrict__ grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { grid_param *frame_gpu; Potential_SOA *lens_gpu,*lens_kernel; //double *grid_grad_x_gpu, *grid_grad_y_gpu ; lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); //lens_gpu->type = (int *) malloc(sizeof(int)); // int nhalos = Nlens; // // Allocate variables on the GPU // double atime = -seconds(); // double* __restrict__ grid_grad_sis_x; cudaMallocHost((void**) &grid_grad_sis_x, nbgridcells_x*nbgridcells_y*sizeof(double)); double* __restrict__ grid_grad_sis_y; cudaMallocHost((void**) &grid_grad_sis_y, nbgridcells_x*nbgridcells_y*sizeof(double)); // + double* __restrict__ grid_grad_euler_cpu_x; cudaMallocHost((void**) &grid_grad_euler_cpu_x, nbgridcells_x*nbgridcells_y*sizeof(double)); + double* __restrict__ grid_grad_euler_cpu_y; cudaMallocHost((void**) &grid_grad_euler_cpu_y, nbgridcells_x*nbgridcells_y*sizeof(double)); + // memset(grid_grad_sis_x, 0., nbgridcells_x*nbgridcells_y*sizeof(double)); memset(grid_grad_sis_y, 0., nbgridcells_x*nbgridcells_y*sizeof(double)); // double* grid_grad_gpu_x; cudasafer(cudaMalloc((void**) &grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_x"); double* grid_grad_gpu_y; cudasafer(cudaMalloc((void**) &grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_y"); // double* grid_grad_euler_x; cudasafer(cudaMalloc((void**) &grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_x"); double* grid_grad_euler_y; cudasafer(cudaMalloc((void**) &grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_gpu_y"); // #if 1 int *type_gpu; type_t *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu; // cudasafer(cudaMalloc( (void**)&(lens_kernel) , sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafer(cudaMalloc( (void**)&(type_gpu) , nhalos*sizeof(int)) ,"Gradientgpu.cu : Alloc type_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_x_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_y_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafer(cudaMalloc( (void**)&(b0_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafer(cudaMalloc( (void**)&(angle_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafer(cudaMalloc( (void**)&(epot_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcore_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcut_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafer(cudaMalloc( (void**)&(frame_gpu) , sizeof(grid_param)) ,"Gradientgpu.cu : Alloc frame_gpu: " ); // // Copy values to the GPU // double ttime = -seconds(); cudasafer(cudaMemcpy(type_gpu , lens->type , nhalos*sizeof(int) , cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy type_gpu: " ); cudasafer(cudaMemcpy(lens_x_gpu , lens->position_x , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy x_gpu: " ); cudasafer(cudaMemcpy(lens_y_gpu , lens->position_y , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy y_gpu: " ); cudasafer(cudaMemcpy(b0_gpu , lens->b0 , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy b0_gpu: " ); cudasafer(cudaMemcpy(angle_gpu , lens->ellipticity_angle , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy angle_gpu: " ); cudasafer(cudaMemcpy(epot_gpu , lens->ellipticity_potential, nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy epot_gpu: " ); cudasafer(cudaMemcpy(rcore_gpu , lens->rcore , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafer(cudaMemcpy(rcut_gpu , lens->rcut , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafer(cudaMemcpy(anglecos_gpu , lens->anglecos , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglecos: " ); cudasafer(cudaMemcpy(anglesin_gpu , lens->anglesin , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglesin: " ); cudasafer(cudaMemcpy(frame_gpu , frame , sizeof(grid_param), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy fame_gpu: " ); // lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; // cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); //cudaDeviceSynchronize(); #else gpu_allocate_SOA<type_t>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y); #endif // atime += seconds(); long int mem_len = 10*nhalos*sizeof(type_t) + sizeof(grid_param) + sizeof(Potential_SOA); // printf("\n ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, atime); // const int grid_dim = nbgridcells_x; // //point image_point; // const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // // // double time, euler_time, sis_time; //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y); // int count = 0; cudaStream_t stream1; cudaStreamCreate(&stream1); // cudaStream_t stream2; cudaStreamCreate(&stream2); // // Float computation // #if 0 #pragma omp parallel #pragma omp for for (int jj = jstart; jj < nbgridcells_y; ++jj) for (int ii = istart; ii < nbgridcells_x; ++ii) { // int index = jj*nbgridcells_x + ii; // struct point image_point; image_point.x = frame->xmin + (ii + istart)*dx; image_point.y = frame->ymin + (jj + jstart)*dy; // point Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens); // grid_grad_sis_x[index] = (double) Grad.x; grid_grad_sis_y[index] = (double) Grad.y; // } //memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*sizeof(double)); //memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_y*sizeof(double)); // #endif PUSH_RANGE("Float GPU", 1); time = -seconds(); #if 1 { int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // //gradient_grid_general_SP_GPU<<<grid, threads>>>(grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart); gradient_grid_general_SP_GPU<<<grid, threads, 0, stream1>>>(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); //POP_RANGE; // // CudaCheckError(); //PUSH_RANGE("Float GPU", 2); //@@cudaMemcpy(grid_grad_sis_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); //@@cudaMemcpy(grid_grad_sis_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); time += seconds(); - cudaMemcpyAsync(grid_grad_sis_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1); - cudaMemcpyAsync(grid_grad_sis_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1); + cudaMemcpyAsync(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1); + cudaMemcpyAsync(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream1); } #endif #if !defined(__EULER) && !defined(__SIS) cudaDeviceSynchronize(); #endif //time += seconds(); POP_RANGE; // // // double setime = time - seconds(); #ifdef __EULER #warning "EULER defined" //memset(grid_grad_x, nbgridcells_x*nbgridcells_y*sizeof(double)); //memset(grid_grad_y, nbgridcells_x*nbgridcells_y*sizeof(double)); // // count = 0; //int* euler_count; CudaSafeCall(cudaMallocManaged((void**) &euler_count, sizeof(int))); euler_time = -seconds(); { int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // //cudaMemcpy(grid_grad_euler_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyHostToHost); //cudaMemcpy(grid_grad_euler_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyHostToHost); // PUSH_RANGE("Euler GPU", 3); //@@gradient_grid_euler_GPU<<<grid, threads, 0, stream1>>>(grid_grad_euler_x, grid_grad_euler_y, grid_grad_sis_x, grid_grad_sis_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); gradient_grid_euler_GPU<<<grid, threads, 0, stream2>>>(grid_grad_euler_x, grid_grad_euler_y, grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); //gradient_grid_euler_GPU<<<grid, threads>>>(grid_grad_x, grid_grad_y, grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); //daxpy<<<grid, threads>>>(nbgridcells_x*nbgridcells_y, 1., grid_grad_temp_x, grid_grad_temp_y); //cudaDeviceSynchronize(); CudaCheckError(); //POP_RANGE; // //PUSH_RANGE("Euler COPY", 4); //cudaMemcpyAsync(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2); //cudaMemcpyAsync(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2); //cudaMemcpyAsync(&a[offset], &d_a[offset], // streamBytes, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToHost, stream[i]); POP_RANGE; } euler_time += seconds(); // #else - cudaMemcpy(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); + //cudaMemcpy(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); + //cudaMemcpy(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(grid_grad_euler_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToDevice); + cudaMemcpy(grid_grad_euler_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToDevice); #endif // #ifdef __SIS // // SIS Potentials in double precision // cudaStreamSynchronize(stream1); PUSH_RANGE("SIS CPU", 5); count = 0; sis_time = -seconds(); { //gradient_grid_SIS_patch(grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); - gradient_grid_SIS_patch(grid_grad_sis_x, grid_grad_sis_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); + gradient_grid_SIS_patch(grid_grad_sis_x, grid_grad_sis_y, grid_grad_x, grid_grad_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); } POP_RANGE; cudaDeviceSynchronize(); sis_time += seconds(); #endif // // /* #ifdef __EULER cudaMemcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); #endif */ // double rtime = -seconds(); #ifdef __SIS //cudaMemcpyAsync(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2); cudaStreamSynchronize(stream2); - cudaMemcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(grid_grad_euler_cpu_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); PUSH_RANGE("Reduction CPU x", 7); - cudaMemcpyAsync(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2); + cudaMemcpyAsync(grid_grad_euler_cpu_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost, stream2); // #pragma omp parallel for for (int ii = 0; ii < nbgridcells_x*nbgridcells_y; ++ii) //if ((grid_grad_sis_x[ii] != 0.) || (grid_grad_sis_y[ii] != 0.)) if (grid_grad_sis_x[ii] != 0.) { grid_grad_x[ii] = grid_grad_sis_x[ii]; } + else + { + grid_grad_x[ii] = grid_grad_euler_cpu_x[ii]; + } POP_RANGE; PUSH_RANGE("Reduction CPU y", 8); cudaStreamSynchronize(stream2); #pragma omp parallel for for (int ii = 0; ii < nbgridcells_x*nbgridcells_y; ++ii) if (grid_grad_sis_y[ii] != 0.) { grid_grad_y[ii] = grid_grad_sis_y[ii]; } + else + { + grid_grad_y[ii] = grid_grad_euler_cpu_y[ii]; + } POP_RANGE; rtime += seconds(); +#else + cudaMemcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost); #endif setime += seconds(); //time += rtime; printf(" ** GPU Float computation: %f s.\n", time); printf(" ** %d Euler computations: %f s.\n", gpu_count, euler_time); printf(" ** %d SIS computations : %f s.\n", count, sis_time); printf(" ** reduction time : %f s.\n", count, rtime); printf(" ** SP + Euler + SIS time: %f s.\n", time + sis_time + euler_time + rtime); //POP_RANGE; // // // cudaStreamDestroy(stream1); cudaStreamDestroy(stream2); // // Euler approximation // //cudaDeviceSynchronize(); //printf("** %d Euler computations: %f s.\n", gpu_count, time); //cudasafer(cudaFree(grid_grad_temp_x ), "free grid_grad_temp_x"); //cudasafer(cudaFree(grid_grad_temp_y ), "free grid_grad_temp_y"); cudasafer(cudaFree(grid_grad_euler_x), "free grid_grad_euler_x"); cudasafer(cudaFree(grid_grad_euler_y), "free grid_grad_euler_y"); cudasafer(cudaFree(grid_grad_gpu_x) , "free grid_grad_gpu_x"); cudasafer(cudaFree(grid_grad_gpu_y) , "free grid_grad_gpu_y"); // // lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); // lens_gpu->type = (int *) malloc(sizeof(int)); // // Allocate variables on the GPU // cudasafer(cudaFree(lens_gpu), "free lens_kernel"); // free(lens_gpu); // cudasafer(cudaFree(lens_kernel) , "free kernel_gpu"); cudasafer(cudaFree(type_gpu) , "free type_gpu"); cudasafer(cudaFree(lens_x_gpu) , "free lens_x_gpu"); cudasafer(cudaFree(lens_y_gpu) , "free lens_y_gpu"); cudasafer(cudaFree(b0_gpu) , "free b0_gpu"); cudasafer(cudaFree(angle_gpu) , "free angle_gpu"); cudasafer(cudaFree(epot_gpu) , "free epot_gpu"); cudasafer(cudaFree(rcore_gpu) , "free rcore_gpu"); cudasafer(cudaFree(rcut_gpu) , "free rcut_gpu"); cudasafer(cudaFree(anglecos_gpu), "free anglecos_gpu"); cudasafer(cudaFree(anglesin_gpu), "free anglesin_gpu"); cudasafer(cudaFree(frame_gpu) , "free frame_gpu"); } // // // void gradient_grid_mixed_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // std::cout << "Calling general mixed GPU" << std::endl; gradient_grid_general_mixed_GPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, nbgridcells_x, nbgridcells_y, istart, jstart); // } // // templated double precision kernels /* __device__ point_double module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_Mixed<double> *lens, int shalos, int nhalos, int echo) { //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point_double grad, result; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double cose = lens->anglecos[i]; double sine = lens->anglesin[i]; // double x = true_coord.x*cose + true_coord.y*sine; double y = true_coord.y*cose - true_coord.x*sine; // double ell_pot = lens->ellipticity_potential[i]; // double b0_inv_R; b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot)); // result.x = (1. - ell_pot)*x*b0_inv_R; result.y = (1. + ell_pot)*y*b0_inv_R; // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; // } //if (echo) printf("grad = %.15f %.15f\n", grad.x, grad.y); return grad; } // __global__ void gradient_grid_general_DP_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { // // // int col = blockIdx.x*blockDim.x + threadIdx.x; if (col > nbgridcells_x) return; int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nbgridcells_y) return; // int index = row*nbgridcells_x + col; // const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // struct point_double grad; struct point_double pImage; // pImage.x = (double) frame->xmin + (col + istart)*dx; pImage.y = (double) frame->ymin + (row + jstart)*dy; //if (index == 0) printf("x, y = %.15f %15f\n", pImage.x, pImage.y); double grad_x, grad_y; // grad = module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(&pImage, lens, 0, nhalos, index == 0); grid_grad_x[index] = (double) grad.x; grid_grad_y[index] = (double) grad.y; } */ // // __device__ point_double module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(const struct point_double *pImage, const struct Potential_Mixed<double> *lens, int shalos, int nhalos, int echo) { //asm volatile("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins"); //printf("# module_potentialDerivatives_totalGradient_SIS_SOA_v2 begins\n"); // struct point_double grad, result; grad.x = 0; grad.y = 0; // for(int i = shalos; i < shalos + nhalos; i++) { struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage->x - lens->position_x[i]; true_coord.y = pImage->y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double cose = lens->anglecos[i]; double sine = lens->anglesin[i]; // double x = true_coord.x*cose + true_coord.y*sine; double y = true_coord.y*cose - true_coord.x*sine; // double ell_pot = lens->ellipticity_potential[i]; // double b0_inv_R; b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot)); // result.x = (1. - ell_pot)*x*b0_inv_R; result.y = (1. + ell_pot)*y*b0_inv_R; //if (echo) printf("x, y = %.15f %15f, lens = %.15f %15f, coord = %.15f %.15f, x = %.15f %.15f, ell_pot = %.15f, res = %.15f %.15f\n", pImage->x, pImage->y, lens->position_x[i], lens->position_y[i], true_coord.x, true_coord.y, x, y, ell_pot, result.x, result.y); // grad.x += result.x*cose - result.y*sine; grad.y += result.y*cose + result.x*sine; // } //if (echo) printf("grad = %.15f %.15f\n", grad.x, grad.y); return grad; } // // // __global__ void gradient_grid_general_DP_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { // // // int col = blockIdx.x*blockDim.x + threadIdx.x; if (col > nbgridcells_x) return; int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nbgridcells_y) return; // int index = row*nbgridcells_x + col; // const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // struct point_double grad; struct point_double pImage; // pImage.x = (double) frame->xmin + (col + istart)*dx; pImage.y = (double) frame->ymin + (row + jstart)*dy; //if (index == 0) printf("x, y = %.15f %15f\n", pImage.x, pImage.y); // #if 0 grad = module_potentialDerivatives_totalGradient_5_SOA_DP_GPU(&pImage, lens, 0, nhalos, index == 0); grid_grad_x[index] = (double) grad.x; grid_grad_y[index] = (double) grad.y; #else struct point_double result; double grad_x = 0., grad_y = 0.; for(int i = 0; i < nhalos; i++) { struct point_double true_coord, true_coord_rotation; // true_coord.x = pImage.x - lens->position_x[i]; true_coord.y = pImage.y - lens->position_y[i]; // //true_coord_rotation = rotateCoordinateSystem(true_coord, lens->ellipticity_angle[i]); double cose = lens->anglecos[i]; double sine = lens->anglesin[i]; // double x = true_coord.x*cose + true_coord.y*sine; double y = true_coord.y*cose - true_coord.x*sine; // double ell_pot = lens->ellipticity_potential[i]; // double b0_inv_R; b0_inv_R = lens->b0[i]/sqrt(x*x*(1. - ell_pot) + y*y*(1. + ell_pot)); // result.x = (1. - ell_pot)*x*b0_inv_R; result.y = (1. + ell_pot)*y*b0_inv_R; // grad_x += result.x*cose - result.y*sine; grad_y += result.y*cose + result.x*sine; // } grid_grad_x[index] = (double) grad_x; grid_grad_y[index] = (double) grad_y; #endif // } // // // __global__ void gradient_grid_SP_GPU(float *grid_grad_x, float *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int nhalos, float dx, float dy, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { // struct point grad; struct point pImage; // int col = blockIdx.x*blockDim.x + threadIdx.x; if (col > nbgridcells_x) return; int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nbgridcells_y) return; // int index = row*nbgridcells_x + col; // //const float dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); //const float dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // pImage.x = frame->xmin + (col + istart)*dx; pImage.y = frame->ymin + (row + jstart)*dy; // struct point result; float grad_x = 0., grad_y = 0.; // for(int i = 0; i < nhalos; i++) { // struct point true_coord; // true_coord.x = pImage.x - lens->position_x[i]; true_coord.y = pImage.y - lens->position_y[i]; // float cose = lens->anglecos[i]; float sine = lens->anglesin[i]; // float x = true_coord.x*cose + true_coord.y*sine; float y = true_coord.y*cose - true_coord.x*sine; // float ell_pot = lens->ellipticity_potential[i]; // float b0_inv_R; b0_inv_R = lens->b0[i]*rsqrtf((float) (x*x*(1.f - ell_pot) + y*y*(1.f + ell_pot))); // result.x = (1.f - ell_pot)*x*b0_inv_R; result.y = (1.f + ell_pot)*y*b0_inv_R; // grad_x += result.x*cose - result.y*sine; grad_y += result.y*cose + result.x*sine; //if (index == 0) printf("%d: %f %f %f %f %f %f\n", i, cose, sine, ell_pot, b0_inv_R, grad_x, grad_y); // } // grid_grad_x[index] = grad_x; grid_grad_y[index] = grad_y; // } // // // void gradient_grid_general_SP_GPU(float* __restrict__ grid_grad_x, float* __restrict__ grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { grid_param *frame_gpu; // int nhalos = Nlens; // // Allocate variables on the GPU // printf("\n"); //printf(" size of gridparam = %d %d %d\n", sizeof(grid_param), sizeof(frame->xmin), sizeof(type_t)); double atime = -seconds(); // Potential_SOA *lens_gpu, *lens_kernel;// = (Potential_Mixed<double> *) malloc(sizeof(Potential_Mixed<double>)); lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); int *type_gpu; float *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu; #if 1 float* grid_grad_gpu_x; cudasafer(cudaMalloc((void**) &grid_grad_gpu_x , nbgridcells_x*nbgridcells_y*sizeof(float)), "grid_grad_temp_x"); float* grid_grad_gpu_y; cudasafer(cudaMalloc((void**) &grid_grad_gpu_y , nbgridcells_x*nbgridcells_y*sizeof(float)), "grid_grad_temp_y"); // lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); //lens_gpu->type = (int *) malloc(sizeof(int)); // Allocate variables on the GPU cudasafer(cudaMalloc( (void**)&(lens_kernel), sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafer(cudaMalloc( (void**)&(type_gpu), nhalos*sizeof(int)),"Gradientgpu.cu : Alloc type_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_x_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_y_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafer(cudaMalloc( (void**)&(b0_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafer(cudaMalloc( (void**)&(angle_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafer(cudaMalloc( (void**)&(epot_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcore_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcut_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(float)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafer(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " ); cudasafer(cudaMalloc( (void**)&(grid_grad_gpu_x), (nbgridcells_x) * (nbgridcells_y) *sizeof(type_t)),"Gradientgpu.cu : Alloc source_x_gpu: " ); cudasafer(cudaMalloc( (void**)&(grid_grad_gpu_y), (nbgridcells_x) * (nbgridcells_y) *sizeof(type_t)),"Gradientgpu.cu : Alloc source_y_gpu: " ); atime += seconds(); std::cout << "\n ** allocation time = " << atime << " s." << std::endl; // // Copy values to the GPU // double ttime = -seconds(); cudasafer(cudaMemcpy(type_gpu, lens->type , nhalos*sizeof(int),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy type_gpu: " ); cudasafer(cudaMemcpy(lens_x_gpu, lens->position_x , nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " ); cudasafer(cudaMemcpy(lens_y_gpu, lens->position_y , nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " ); cudasafer(cudaMemcpy(b0_gpu, lens->b0 , nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " ); cudasafer(cudaMemcpy(angle_gpu, lens->ellipticity_angle , nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " ); cudasafer(cudaMemcpy(epot_gpu, lens->ellipticity_potential, nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " ); cudasafer(cudaMemcpy(rcore_gpu, lens->rcore, nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafer(cudaMemcpy(rcut_gpu, lens->rcut, nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafer(cudaMemcpy(anglecos_gpu, lens->anglecos, nhalos*sizeof(float),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy anglecos: " ); cudasafer(cudaMemcpy(anglesin_gpu, lens->anglesin, nhalos*sizeof(float), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy anglesin: " ); cudasafer(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " ); //printf(" transfer bandwidth = %f, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime); // lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; // cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); //cudaDeviceSynchronyze(); ttime += seconds(); #else gpu_allocate_SOA<type_t>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y); #endif // ttime += seconds(); long int mem_len = 10*nhalos*sizeof(float) + sizeof(grid_param) + sizeof(Potential_SOA); // printf("\n ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime); // const int grid_dim = nbgridcells_x; // //point image_point; // const float dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const float dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // // // double time; //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y); // int count = 0; cudaStream_t stream1; cudaStreamCreate(&stream1); // // Float computation // PUSH_RANGE("Float GPU", 1); time = -seconds(); #if 1 { int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // gradient_grid_SP_GPU<<<grid, threads>>>(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart); //gradient_grid_general_SP_GPU<<<grid, threads>>>(grid_grad_gpu_dp_x, grid_grad_gpu_dp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); cudaDeviceSynchronize(); cudasafer(cudaGetLastError(), "module_potentialDerivative_totalGradient_SOA_SOA_GPU"); CudaCheckError(); } #else module_potentialDerivatives_totalGradient_SOA_CPU_GPU(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, nhalos, dx, dy, nbgridcells_x, nbgridcells_y, istart, jstart); #endif time += seconds(); printf(" ** GPU SP computation: %f s.\n", time); ttime = -seconds(); { cudaMemcpyAsync(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(float), cudaMemcpyDeviceToHost, stream1); cudaMemcpyAsync(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(float), cudaMemcpyDeviceToHost, stream1); } cudaDeviceSynchronize(); //for (int ii = 0; ii < nbgridcells_x*nbgridcells_y; ++ii) // printf("%d: %f\n", ii, grid_grad_x[ii]); ttime += seconds(); mem_len = 2*nbgridcells_x*nbgridcells_y*sizeof(float); printf(" ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime); POP_RANGE; } // // // void gradient_grid_general_double_GPU(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { grid_param *frame_gpu; // int nhalos = Nlens; // // Allocate variables on the GPU // printf("\n"); printf(" ** allocating memory on the GPU...\n");fflush(stdout); //printf(" size of gridparam = %d %d %d\n", sizeof(grid_param), sizeof(frame->xmin), sizeof(type_t)); double atime = -seconds(); // Potential_Mixed<double> *lens_gpu, *lens_kernel;// = (Potential_Mixed<double> *) malloc(sizeof(Potential_Mixed<double>)); lens_gpu = (Potential_Mixed<double> *) malloc(sizeof(Potential_Mixed<double>)); #if 1 //double* grid_grad_gpu_x; cudasafer(cudaMallocManaged((void**) &grid_grad_gpu_x , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_x"); //double* grid_grad_gpu_y; cudasafer(cudaMallocManaged((void**) &grid_grad_gpu_y , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_y"); double* grid_grad_gpu_x; cudasafer(cudaMallocHost((void**) &grid_grad_gpu_x , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_x"); double* grid_grad_gpu_y; cudasafer(cudaMallocHost((void**) &grid_grad_gpu_y , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_y"); // int *type_gpu; double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu; // cudasafer(cudaMalloc( (void**)&(lens_kernel) , sizeof(Potential_Mixed<double>)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafer(cudaMalloc( (void**)&(type_gpu) , nhalos*sizeof(int)) ,"Gradientgpu.cu : Alloc type_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_x_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_y_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafer(cudaMalloc( (void**)&(b0_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafer(cudaMalloc( (void**)&(angle_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafer(cudaMalloc( (void**)&(epot_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcore_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcut_gpu) , nhalos*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(double)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafer(cudaMalloc( (void**)&(frame_gpu) , sizeof(grid_param)) ,"Gradientgpu.cu : Alloc frame_gpu: " ); // // Copy values to the GPU // double ttime = -seconds(); cudasafer(cudaMemcpy(type_gpu , lens->type , nhalos*sizeof(int) , cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy type_gpu: " ); cudasafer(cudaMemcpy(lens_x_gpu , lens->position_x , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy x_gpu: " ); cudasafer(cudaMemcpy(lens_y_gpu , lens->position_y , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy y_gpu: " ); cudasafer(cudaMemcpy(b0_gpu , lens->b0 , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy b0_gpu: " ); cudasafer(cudaMemcpy(angle_gpu , lens->ellipticity_angle , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy angle_gpu: " ); cudasafer(cudaMemcpy(epot_gpu , lens->ellipticity_potential, nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy epot_gpu: " ); cudasafer(cudaMemcpy(rcore_gpu , lens->rcore , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafer(cudaMemcpy(rcut_gpu , lens->rcut , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafer(cudaMemcpy(anglecos_gpu , lens->anglecos , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglecos: " ); cudasafer(cudaMemcpy(anglesin_gpu , lens->anglesin , nhalos*sizeof(double), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglesin: " ); cudasafer(cudaMemcpy(frame_gpu , frame , sizeof(grid_param), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy fame_gpu: " ); // lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; // cudaMemcpy((void*) lens_kernel, (void*) lens_gpu, sizeof(Potential_Mixed<double>), cudaMemcpyHostToDevice); // #else gpu_allocate_SOA<double>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y); #endif // cudaDeviceSynchronize(); // ttime += seconds(); long int mem_len = 10*nhalos*sizeof(double) + sizeof(grid_param) + sizeof(Potential_Mixed<double>); // //printf(" ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime); // const int grid_dim = nbgridcells_x; // //point image_point; // const double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // // // double time; //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y); int count = 0; // // Double computation // int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // PUSH_RANGE("Double", 1); time = -seconds(); { // gradient_grid_general_DP_GPU<<<grid, threads>>>(grid_grad_gpu_x, grid_grad_gpu_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); // CudaCheckError(); } cudaDeviceSynchronize(); time += seconds(); printf(" ** GPU DP computation: %f s.\n", time); ttime = -seconds(); cudasafer(cudaMemcpy(grid_grad_x, grid_grad_gpu_x, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost), "gradient_grid_general_double_GPU: cudaMemcpy gpu_x"); cudasafer(cudaMemcpy(grid_grad_y, grid_grad_gpu_y, nbgridcells_x*nbgridcells_y*sizeof(double), cudaMemcpyDeviceToHost), "gradient_grid_general_double_GPU: cudaMemcpy gpu_y"); ttime += seconds(); mem_len = 2*nbgridcells_x*nbgridcells_y*sizeof(double); printf(" ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, ttime); // // POP_RANGE; // cudasafer(cudaFree( lens_kernel) , "gradient_grid_general_double_GPU: free lens_kernel"); cudasafer(cudaFree( type_gpu) , "gradient_grid_general_double_GPU: free type_gpu"); cudasafer(cudaFree( lens_x_gpu) , "gradient_grid_general_double_GPU: free lens_x_gpu"); cudasafer(cudaFree( lens_y_gpu) , "gradient_grid_general_double_GPU: free lens_y_gpu"); cudasafer(cudaFree( b0_gpu) , "gradient_grid_general_double_GPU: free b0_gpu"); cudasafer(cudaFree( angle_gpu) , "gradient_grid_general_double_GPU: free angle_gpu"); cudasafer(cudaFree( epot_gpu) , "gradient_grid_general_double_GPU: free epot_gpu"); cudasafer(cudaFree( rcore_gpu) , "gradient_grid_general_double_GPU: free rcore_gpu"); cudasafer(cudaFree( rcut_gpu) , "gradient_grid_general_double_GPU: free rcut_gpu"); cudasafer(cudaFree( anglecos_gpu) , "gradient_grid_general_double_GPU: free anglecos_gpu"); cudasafer(cudaFree( anglesin_gpu) , "gradient_grid_general_double_GPU: free anglesin_gpu"); cudasafer(cudaFree( frame_gpu) , "gradient_grid_general_double_GPU: free frame_gpu"); // cudasafer(cudaFreeHost(grid_grad_gpu_x), "gradient_grid_general_double_GPU: free grid_grad_gpu_x"); cudasafer(cudaFreeHost(grid_grad_gpu_y), "gradient_grid_general_double_GPU: free grid_grad_gpu_y"); // free(lens_gpu); // } void gradient_grid_general_mixed_GPU(double* grid_grad_x, double* grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int Nlens, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { grid_param *frame_gpu; Potential_SOA *lens_gpu,*lens_kernel; //double *grid_grad_x_gpu, *grid_grad_y_gpu ; lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); //lens_gpu->type = (int *) malloc(sizeof(int)); // int nhalos = Nlens; // // Allocate variables on the GPU // double atime = -seconds(); // double* grid_grad_temp_x; cudasafer(cudaMallocManaged((void**) &grid_grad_temp_x , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_x"); double* grid_grad_temp_y; cudasafer(cudaMallocManaged((void**) &grid_grad_temp_y , nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_temp_y"); // double* grid_grad_euler_x; cudasafer(cudaMallocManaged((void**) &grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_euler_x"); double* grid_grad_euler_y; cudasafer(cudaMallocManaged((void**) &grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double)), "grid_grad_euler_y"); // #if 1 int *type_gpu; type_t *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu, *anglecos_gpu, *anglesin_gpu; // cudasafer(cudaMalloc( (void**)&(lens_kernel) , sizeof(Potential_SOA)),"Gradientgpu.cu : Alloc Potential_SOA: " ); cudasafer(cudaMalloc( (void**)&(type_gpu) , nhalos*sizeof(int)) ,"Gradientgpu.cu : Alloc type_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_x_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc x_gpu: " ); cudasafer(cudaMalloc( (void**)&(lens_y_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc y_gpu: " ); cudasafer(cudaMalloc( (void**)&(b0_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc b0_gpu: " ); cudasafer(cudaMalloc( (void**)&(angle_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc angle_gpu: " ); cudasafer(cudaMalloc( (void**)&(epot_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc epot_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcore_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcore_gpu: " ); cudasafer(cudaMalloc( (void**)&(rcut_gpu) , nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc rcut_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglecos_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglecos_gpu: " ); cudasafer(cudaMalloc( (void**)&(anglesin_gpu), nhalos*sizeof(type_t)),"Gradientgpu.cu : Alloc anglesin_gpu: " ); cudasafer(cudaMalloc( (void**)&(frame_gpu) , sizeof(grid_param)) ,"Gradientgpu.cu : Alloc frame_gpu: " ); // // Copy values to the GPU // double ttime = -seconds(); cudasafer(cudaMemcpy(type_gpu , lens->type , nhalos*sizeof(int) , cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy type_gpu: " ); cudasafer(cudaMemcpy(lens_x_gpu , lens->position_x , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy x_gpu: " ); cudasafer(cudaMemcpy(lens_y_gpu , lens->position_y , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy y_gpu: " ); cudasafer(cudaMemcpy(b0_gpu , lens->b0 , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy b0_gpu: " ); cudasafer(cudaMemcpy(angle_gpu , lens->ellipticity_angle , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy angle_gpu: " ); cudasafer(cudaMemcpy(epot_gpu , lens->ellipticity_potential, nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy epot_gpu: " ); cudasafer(cudaMemcpy(rcore_gpu , lens->rcore , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcore_gpu: " ); cudasafer(cudaMemcpy(rcut_gpu , lens->rcut , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy rcut_gpu: " ); cudasafer(cudaMemcpy(anglecos_gpu , lens->anglecos , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglecos: " ); cudasafer(cudaMemcpy(anglesin_gpu , lens->anglesin , nhalos*sizeof(type_t), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy anglesin: " ); cudasafer(cudaMemcpy(frame_gpu , frame , sizeof(grid_param), cudaMemcpyHostToDevice) ,"Gradientgpu.cu : Copy fame_gpu: " ); // lens_gpu->type = type_gpu; lens_gpu->position_x = lens_x_gpu; lens_gpu->position_y = lens_y_gpu; lens_gpu->b0 = b0_gpu; lens_gpu->ellipticity_angle = angle_gpu; lens_gpu->ellipticity_potential = epot_gpu; lens_gpu->rcore = rcore_gpu; lens_gpu->rcut = rcut_gpu; lens_gpu->anglecos = anglecos_gpu; lens_gpu->anglesin = anglesin_gpu; // cudaMemcpy(lens_kernel, lens_gpu, sizeof(Potential_SOA), cudaMemcpyHostToDevice); //cudaDeviceSynchronize(); #else gpu_allocate_SOA<type_t>(lens_gpu, lens, frame, nhalos, nbgridcells_x, nbgridcells_y); #endif // atime += seconds(); long int mem_len = 10*nhalos*sizeof(type_t) + sizeof(grid_param) + sizeof(Potential_SOA); // printf("\n ** transfer bandwidth = %f GB/s, %d Bytes, %f s.\n", (double) mem_len/ttime/1024./1024./1024., mem_len, atime); // const int grid_dim = nbgridcells_x; // //point image_point; // const type_t dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); const type_t dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // // // double time; //printf("MIXED -> dx = %.15f, dy = %.15f, nbgridcells_x = %d, nbgridcells_y = %d, %p %p\n", dx, dy, nbgridcells_x, nbgridcells_y, grid_grad_temp_x, grid_grad_temp_y); int count = 0; // // Float computation // PUSH_RANGE("Float", 1); time = -seconds(); // #if 0 #pragma omp parallel #pragma omp for for (int jj = jstart; jj < nbgridcells_y; ++jj) for (int ii = istart; ii < nbgridcells_x; ++ii) { // int index = jj*nbgridcells_x + ii; // struct point image_point; image_point.x = frame->xmin + (ii + istart)*dx; image_point.y = frame->ymin + (jj + jstart)*dy; // point Grad = module_potentialDerivatives_totalGradient_SOA(&image_point, lens, Nlens); // grid_grad_temp_x[index] = (double) Grad.x; grid_grad_temp_y[index] = (double) Grad.y; // } //memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*sizeof(double)); //memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_y*sizeof(double)); // #else { int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // gradient_grid_general_SP_GPU<<<grid, threads>>>(grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); // CudaCheckError(); cudaDeviceSynchronize(); } #endif POP_RANGE; time += seconds(); printf(" ** GPU Float computation: %f s.\n", time); // // // #if 0 PUSH_RANGE("Euler GPU", 2); count = 0; time = -seconds(); { int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // gradient_grid_GPU<<<grid, threads>>>(grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); //daxpy<<<grid, threads>>>(nbgridcells_x*nbgridcells_y, 1., grid_grad_temp_x, grid_grad_temp_y); CudaCheckError(); //cudaDeviceSynchronize(); } time += seconds(); printf("** %d Euler computations: %f s.\n", gpu_count, time); POP_RANGE; #endif // #if 1 // // SIS Potentials in double precision // PUSH_RANGE("SIS CPU", 2); count = 0; time = -seconds(); { //gradient_grid_SIS_patch(grid_grad_temp_x, grid_grad_temp_y, frame, lens, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart, &count); } time += seconds(); printf("** %d SIS computations: %f s.\n", count, time); POP_RANGE; // // // #endif // // Euler approximation // #if 1 //memset(grid_grad_x, nbgridcells_x*nbgridcells_y*sizeof(double)); //memset(grid_grad_y, nbgridcells_x*nbgridcells_y*sizeof(double)); // count = 0; int* euler_count; CudaSafeCall(cudaMallocManaged((void**) &euler_count, sizeof(int))); time = -seconds(); { int GRID_SIZE_X = (nbgridcells_x + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X; // number of blocks int GRID_SIZE_Y = (nbgridcells_y + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y; // dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y/1); dim3 grid ( GRID_SIZE_X, GRID_SIZE_Y); // gradient_grid_euler_GPU<<<grid, threads>>>(grid_grad_euler_x, grid_grad_euler_y, grid_grad_temp_x, grid_grad_temp_y, frame_gpu, lens_kernel, Nlens, nbgridcells_x, nbgridcells_y, istart, jstart); //daxpy<<<grid, threads>>>(nbgridcells_x*nbgridcells_y, 1., grid_grad_temp_x, grid_grad_temp_y); CudaCheckError(); cudaDeviceSynchronize(); } time += seconds(); printf("** %d Euler computations: %f s.\n", gpu_count, time); // memcpy(grid_grad_x, grid_grad_euler_x, nbgridcells_x*nbgridcells_y*sizeof(double)); memcpy(grid_grad_y, grid_grad_euler_y, nbgridcells_x*nbgridcells_y*sizeof(double)); // #else memcpy(grid_grad_x, grid_grad_temp_x, nbgridcells_x*nbgridcells_y*sizeof(double)); memcpy(grid_grad_y, grid_grad_temp_y, nbgridcells_x*nbgridcells_y*sizeof(double)); #endif cudaFree(grid_grad_temp_x ); cudaFree(grid_grad_temp_y ); cudaFree(grid_grad_euler_x); cudaFree(grid_grad_euler_y); // // lens_gpu = (Potential_SOA *) malloc(sizeof(Potential_SOA)); // lens_gpu->type = (int *) malloc(sizeof(int)); // // Allocate variables on the GPU // cudasafer(cudaFree(lens_gpu), "free lens_kernel"); // free(lens_gpu); cudasafer(cudaFree(lens_kernel) , "free kernel_gpu"); cudasafer(cudaFree(type_gpu) , "free type_gpu"); cudasafer(cudaFree(lens_x_gpu) , "free lens_x_gpu"); cudasafer(cudaFree(lens_y_gpu) , "free lens_y_gpu"); cudasafer(cudaFree(b0_gpu) , "free b0_gpu"); cudasafer(cudaFree(angle_gpu) , "free angle_gpu"); cudasafer(cudaFree(epot_gpu) , "free epot_gpu"); cudasafer(cudaFree(rcore_gpu) , "free rcore_gpu"); cudasafer(cudaFree(rcut_gpu) , "free rcut_gpu"); cudasafer(cudaFree(anglecos_gpu), "free anglecos_gpu"); cudasafer(cudaFree(anglesin_gpu), "free anglesin_gpu"); cudasafer(cudaFree(frame_gpu) , "free frame_gpu"); } // // // void gradient_grid_mixed_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_Mixed<double> *lens, int nhalos, int nbgridcells_x, int nbgridcells_y, int istart, int jstart) { double dx = (frame->xmax - frame->xmin)/(nbgridcells_x - 1); double dy = (frame->ymax - frame->ymin)/(nbgridcells_y - 1); // std::cout << "Calling general mixed GPU" << std::endl; gradient_grid_general_mixed_GPU(grid_grad_x, grid_grad_y, frame, lens, nhalos, nbgridcells_x, nbgridcells_y, istart, jstart); // } diff --git a/src/structure_hpc.hpp b/src/structure_hpc.hpp index c4500c2..e7b85e8 100644 --- a/src/structure_hpc.hpp +++ b/src/structure_hpc.hpp @@ -1,546 +1,577 @@ /** * @file structure.h * @Author Thomas Jalabert, EPFL (me@example.com) , Christoph Schaefer, EPFL (christophernstrerne.schaefer@epfl.ch) * @date July 2015 * @version 0,1 * @brief Header file to define the used structures (e.g. defined structs) * * @param configuration file (parameters.par) * @return Depends on choice in the configuration file, e.g. least chi2 model */ // Header guard #ifndef STRUCTURE_HPC_H #define STRUCTURE_HPC_H #include <iostream> #include "type.h" /*****************************************************************/ /* */ /* Constants: Will be migrated to constants.h when there are too many of them*/ /* */ /*****************************************************************/ // GPU definitions #define threadsPerBlock 512 // threads for each set of images #define MAXIMPERSOURCE 20 // maximum number of multiple images for one source #define MAXIM 200 // maximum total number of images treated // Dimensions definitions #define NPZMAX 9 /* maximum number of critical lines in g_cline struct*/ //#define CLINESIZE 500 /* maximum number of critical and caustic points for Cline mode. Adjust depending on RAM*/ #define NPOTFILESIZE 2000 //maximum number of potential in potfiles #define DMIN 1e-4 // distance minimale de convergence dans le plan image (in arcsec) #define NITMAX 100 #define NTYPES 2 // gNFW definitions #define gNFW_ARRAY_SIZE 1800 // Set the dimension of the gnfw table gNFW_ARRAY_SIZE, must be 1800 for the current table file // Filename definition #define FILENAME_SIZE 50 // size of a filename in .par file //constants definition #define pi_c2 7.209970e-06 /* pi en arcsecond/ c^2 = 648000/vol/vol */ #define cH2piG 0.11585881 /* cH0/2piG en g/cm^2 (H0=50) */ #define cH4piG 0.057929405 /* cH0/4piG en g/cm^2 (H0=50) */ #define cH0_4piG 2.7730112e-4 /* cH0/4piG en 10^12 M_Sol/kpc^2 (H0=50) */ #define d0 29.068701 /* vol/h0*1000/rta -- c/H0 en h-1.kpc/arcsecond (h0=50)*/ #define MCRIT12 .2343165 /* c^3/4piGh0/RTA/RTA in 1e12 M_sol/arcsec^2 (h0=50) */ /*****************************************************************/ /* */ /* Types definition */ /* */ /*****************************************************************/ /** @brief Point: Structure of 2 coordinates * * @param x: X coordinate * @param y: Y coordinate */ #ifdef __WITH_LENSTOOL #include "structure.h" #else struct point { type_t x; type_t y; }; /** @brief Complex: Structure of 2 doubles * @param re: Real Part * @param im: Imaginary Part */ struct complex { type_t re; type_t im; }; /** @brief Segment: Structure of two points */ struct segment { point a; point b; }; /** @brief triplet: Structure of three points defining a triangle */ struct triplet { struct point a; struct point b; struct point c; }; /** @brief bitriplet: Defines two linked triangles (one in source plane and one in image plane) * @param i: Triangle on image plane * @param s: Triangle on source plane */ struct bitriplet { struct triplet i; struct triplet s; }; /** @brief contains the table needed to compute the potential derivatives of general NFW profile */ typedef struct { double alpha_now, x_now, kappa, dpl; } gNFWdata; /** @brief Matrix: 2by2 doubles */ struct matrix { type_t a; type_t b; type_t c; type_t d; }; /** @brief ellipse: for shape computation * @param a: semimajor axis * @param b: semiminor axis * @param theta: shape ellipticity */ struct ellipse { type_t a; type_t b; type_t theta; }; #endif /** @brief Storage type for sources, lenses and arclets * @param center: position of the center of galaxy * @param shape: shape of galaxy * @param mag: magnitude * @param redshift: redshift * @param dls: Distance lens-source * @param dos: Distance observer-source * @param dr: dls/dos */ struct galaxy { //char name[IDSIZE]; struct point center; struct ellipse shape; type_t mag; type_t redshift; type_t dls; type_t dos; type_t dr; }; /** @brief Contains the information for optimising a parameter in the inverse mode * @param block: blockorfree variable (whether a parameter is blocked or free for the mcmc algorithm) * @param min: lower optimisation value * @param max: upper optimisation value * @param sigma: optimisation step (MIGHT NOT BE TAKEN INTO ACCOUNT) */ struct optimize_block { int block; type_t min; type_t max; type_t sigma; }; /** @brief two optimize_block to simulate a point */ struct optimize_point // blockorfree for the point structure { struct optimize_block x; struct optimize_block y; }; /** @brief Contains the information for optimising the potential in the inverse mode * @param position: position of the center of the halo * @param weight: weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW) * @param b0: Impact parameter * @param ellipticity_angle: orientation of the clump * @param ellipticity: Mass ellipticity * @param ellipticity_potential: Potential ellipticity * @param rcore: PIEMD specific value * @param rcut: PIEMD specific value * @param rscale: scale radius for NFW, Einasto * @param exponent: exponent for Einasto * @param vdisp: Dispersion velocity * @param alpha: exponent for general NFW * @param einasto_kappacritic: critical kappa for Einasto profile * @param z: redshift */ struct potentialoptimization // block or free variable for the MCMC for the potential { struct optimize_point position; struct optimize_block weight; struct optimize_block b0; struct optimize_block ellipticity_angle; struct optimize_block ellipticity; struct optimize_block ellipticity_potential; struct optimize_block rcore; struct optimize_block rcut; struct optimize_block rscale; struct optimize_block exponent; struct optimize_block vdisp; struct optimize_block alpha; struct optimize_block einasto_kappacritic; struct optimize_block z; }; /** @brief Contains the information of the lens potential * @param type: 1=PIEMD , 2=NFW, 3=SIES, 4=point mass, 5=SIS, 8=PIEMD * @param type_name: IEMD, NFW, SIES, point * @param name: name of the clump (e.g. name of the galaxy) : not compulsory * @param position: position of the center of the halo * @param weight: weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW) * @param b0: Impact parameter * @param ellipticity_angle: * @param ellipticity: Mass ellipticity * @param ellipticity_potential: Potential ellipticity * @param rcore: PIEMD specific value * @param rcut: PIEMD specific value * @param rscale: scale radius for NFW, Einasto * @param exponent: exponent for Einasto * @param vdisp: Dispersion velocity * @param alpha: exponent for general NFW * @param einasto_kappacritic: critical kappa for Einasto profile * @param z: redshift */ + +#if 1 +struct Potential_SOA +{ + int* type; // 1=PIEMD ; 2=NFW; 3=SIES, 4=point mass + char type_name[10]; // PIEMD, NFW, SIES, point + type_t* name; // name of the clump (e.g. name of the galaxy) : not compulsory + //struct point position; // position of the center of the halo + type_t* __restrict__ position_x; // position of the center of the halo + type_t* __restrict__ position_y; // position of the center of the halo + type_t* __restrict__ weight; // weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW) + type_t* __restrict__ b0; // Impact parameter + type_t* __restrict__ vdisp; //Dispersion velocity + type_t* __restrict__ ellipticity_angle; // orientation of the clump + type_t* __restrict__ ellipticity; // ellipticity of the mass distribition + type_t* __restrict__ ellipticity_potential; //ellipticity of the potential + type_t* __restrict__ rcore; // core radius + type_t* __restrict__ rcut; // cut radius + type_t* __restrict__ rscale; // scale radius for NFW, Einasto + type_t* __restrict__ exponent; // exponent for Einasto + type_t* __restrict__ alpha; // exponent for general NFW + type_t* __restrict__ einasto_kappacritic; // critical kappa for Einasto profile + type_t* __restrict__ z; // redshift of the clump + type_t* __restrict__ mag; //magnitude + type_t* __restrict__ lum; //luminosity + type_t* __restrict__ theta; //theta + type_t* __restrict__ anglecos; //theta precomputation of cosinus and sinus values + type_t* __restrict__ anglesin; //theta + type_t* __restrict__ sigma; // sigma +}; +#else struct Potential_SOA { int* type; // 1=PIEMD ; 2=NFW; 3=SIES, 4=point mass char type_name[10]; // PIEMD, NFW, SIES, point type_t* name; // name of the clump (e.g. name of the galaxy) : not compulsory //struct point position; // position of the center of the halo - type_t* position_x; // position of the center of the halo + const type_t* __restrict__ position_x; // position of the center of the halo type_t* position_y; // position of the center of the halo type_t* weight; // weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW) type_t* b0; // Impact parameter type_t* vdisp; //Dispersion velocity type_t* ellipticity_angle; // orientation of the clump type_t* ellipticity; // ellipticity of the mass distribition type_t* ellipticity_potential; //ellipticity of the potential type_t* rcore; // core radius type_t* rcut; // cut radius type_t* rscale; // scale radius for NFW, Einasto type_t* exponent; // exponent for Einasto type_t* alpha; // exponent for general NFW type_t* einasto_kappacritic; // critical kappa for Einasto profile type_t* z; // redshift of the clump type_t* mag; //magnitude type_t* lum; //luminosity type_t* theta; //theta type_t* anglecos; //theta precomputation of cosinus and sinus values type_t* anglesin; //theta type_t* sigma; // sigma }; - +#endif struct Potential { int type; // 1=PIEMD ; 2=NFW; 3=SIES, 4=point mass char type_name[10]; // PIEMD, NFW, SIES, point char name[20]; // name of the clump (e.g. name of the galaxy) : not compulsory struct point position; // position of the center of the halo type_t weight; // weight of the clump (the projected mass sigma0 for PIEMD, the density rhoscale for NFW) type_t b0; // Impact parameter type_t vdisp; //Dispersion velocity type_t ellipticity_angle; // orientation of the clump type_t ellipticity; // ellipticity of the mass distribition type_t ellipticity_potential; //ellipticity of the potential type_t rcore; // core radius type_t rcut; // cut radius type_t rscale; // scale radius for NFW, Einasto type_t exponent; // exponent for Einasto type_t alpha; // exponent for general NFW type_t einasto_kappacritic; // critical kappa for Einasto profile type_t z; // redshift of the clump type_t mag; //magnitude type_t lum; //luminosity type_t theta; //theta type_t sigma; // sigma }; /*****************************************************************/ /* */ /* Control structure definition */ /* */ /*****************************************************************/ /** @brief Control structure for runmode parameters * * Default values are set in module_readParameters_readRunmode * * @param nbgridcells: Number of grid cells * @param source: flag for simple source to image conversion * @param sourfile: file name for source information * @param image: flag for simple image to source conversion * @param imafile: file name for image information * @param mass: flag for mass fitsfile * @param massgridcells: Nb of cell for fitsfile * @param z_mass: redshift for which to be computed * @param z_mass_s: redshift of source for which effect of mass will be computed * @param potential: flag for potential fitsfile * @param potgridcells: Nb of cell for fitsfile * @param z_pot: redshift for which to be computed * @param dpl: flag for displacement fitsfile * @param dplgridcells: Nb of cell for fitsfile * @param z_dpl: redshift for which to be computed * @param inverse: flag for inversion mode (MCMC etc,) * @param arclet: flag for arclet mode * @param debug: flag for debug mode * @param nimagestotal: total number of lensed images in file * @param nsets: number of sources attributed to the lensed images * @param nhalo: Number of halos * @param grid: 0 for automatic grid (not implemented), 1 for grid infor applying on source plane, 2 for grid information applying on image plane * @param nbgridcells: Number of grid cells * @param zgrid: redshift of grid * @param cline: flag for critical and caustic line calculation */ struct runmode_param { int nbgridcells; //Source Mode int source; int nsets; //Image Mode int image; int nimagestot; //Mult Mode int multi; //Mass Mode int mass; int mass_gridcells; type_t z_mass; type_t z_mass_s; //Potential Mode int potential; int pot_gridcells; type_t z_pot; int nhalos; //Potfile Mode int potfile; int npotfile; //displacement Mode int dpl; int dpl_gridcells; type_t z_dpl; //Inverse Mode int inverse; //Arclet Mode int arclet; //Debug Mode int debug; //Grid Mode int grid; int gridcells; type_t zgrid; //Critic and caustic mode int cline; //Amplification Mode int amplif; int amplif_gridcells; type_t z_amplif; //Time/Benchmark mode int time; //SOA variables int Nlens[NTYPES]; std::string imagefile; std::string potfilename; std::string sourfile; }; /** @brief Not used yet * */ struct image_param { }; /** @brief Not used yet * */ struct source_param { }; /** @brief Contains Grid information */ struct grid_param { type_t xmin; type_t xmax; type_t ymin; type_t ymax; type_t lmin; type_t lmax; type_t rmax; }; /** @brief Control structure for cosmological parameters * * @param model: Cosmological model * @param omegaM: * @param omegaX: * @param curvature: curvature parameter * @param wX: * @param wa: * @param H0: Hubble constant * @param h: H0/50 */ struct cosmo_param { int model; type_t omegaM; type_t omegaX; type_t curvature; type_t wX; type_t wa; type_t H0; type_t h; }; /** @brief Control structure for potfile parameters * * @param potid: 1: pot P, 2: pot Q @param ftype: @param potfile[FILENAME_SIZE]; @param type; @param zlens; @param core;CCC @param corekpc; @param mag0; @param select; @param ircut; @param cut, cut1, cut2; @param cutkpc1, cutkpc2; @param isigma; @param sigma, sigma1, sigma2; @param islope; @param slope, slope1, slope2; @param ivdslope; @param vdslope, vdslope1, vdslope2; @param ivdscat; @param vdscat, vdscat1, vdscat2; @param ircutscat; @param rcutscat, rcutscat1, rcutscat2; @param ia; // scaling relation of msm200 @param a, a1, a2; @param ib; // scaling relation of msm200 @param b, b1, b2; */ struct potfile_param { int potid; // 1: pot P, 2: pot Q int ftype; char potfile[FILENAME_SIZE]; int type; type_t zlens; type_t core; type_t corekpc; type_t mag0; int select; int ircut; type_t cut, cut1, cut2; type_t cutkpc1, cutkpc2; int isigma; type_t sigma, sigma1, sigma2; int islope; type_t slope, slope1, slope2; int ivdslope; type_t vdslope, vdslope1, vdslope2; int ivdscat; type_t vdscat, vdscat1, vdscat2; int ircutscat; type_t rcutscat, rcutscat1, rcutscat2; int ia; // scaling relation of msm200 type_t a, a1, a2; int ib; // scaling relation of msm200 type_t b, b1, b2; int npotfile; }; /** @brief Control structure for caustic and critical lines * * @param nplan: number of sourceplanes for which the caustic lines will be computed * @param cz: redshift values array for respective sourceplanes * @param dos: distcosmo1 to redshift z * @param dls: distcosmo2 between lens[0] and z * @param dlsds: ratio of dl0s/dos * @param limitLow: minimum size of the squares in MARCHINGSQUARES * @param dmax: Size of search area * @param xmin: * @param xmax: * @param ymin: * @param ymax: * @param limithigh: maximum size of the squares in MARCHINGSQUARES algorithm * @param nbgridcells: nbgridcells * nbgridcells = number of pixels for critical line computation */ struct cline_param { int nplan; type_t cz[NPZMAX]; type_t dos[NPZMAX]; // distcosmo1 to redshift z type_t dls[NPZMAX]; // distcosmo2 between lens[0] and z type_t dlsds[NPZMAX]; // ratio of dl0s/dos type_t limitLow; // minimum size of the squares in MARCHINGSQUARES or initial step size in SNAKE type_t dmax; type_t xmin; type_t xmax; type_t ymin; type_t ymax; type_t limitHigh; // maximum size of the squares in MARCHINGSQUARES algorithm int nbgridcells; // nbgridcells * nbgridcells = number of pixels for critical line computation }; #endif // if STRUCTURE_H