Page MenuHomec4science

delense_GPU.cu
No OneTemporary

File Metadata

Created
Fri, Nov 29, 01:38

delense_GPU.cu

// This file is part of lenstoolHPC
// authors: gilles.fourestey@epfl.ch
#include "delense_CPU_utils.hpp"
#include "delense_GPU_utils.cuh"
#include "structure_hpc.hpp"
#include "cudafunctions.cuh"
#ifdef __WITH_MPI
#include <mpi.h>
#endif
extern double myseconds();
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 32
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
inline
void
moveToGPU(void** d_mem, void* h_mem, int size)
{
#ifdef __WITH_UM
cudasafe(cudaMallocManaged((void**) d_mem, size), "memory allocation (UM)");
cudasafe(cudaMemcpy(*d_mem, h_mem, size, cudaMemcpyHostToDevice), "H2D memory copy (UM)");
#else
cudasafe(cudaMalloc((void**) d_mem, size), "memory allocation");
cudasafe(cudaMemcpy(*d_mem, h_mem, size, cudaMemcpyHostToDevice), "H2D memory copy");
#endif
}
inline
void
moveFromGPU(void* h_mem, void* d_mem, int size)
{
#ifdef __WITH_UM
//cudasafe(cudaMemcpy(h_mem, d_mem, size, cudaMemcpyDeviceToHost), "D2H memory copy (UM)");
memcpy(h_mem, d_mem, size);
#else
cudasafe(cudaMemcpy(h_mem, d_mem, size, cudaMemcpyHostToDevice), "D2H memory copy");
#endif
}
//
//
//
#if 1
__global__
void
delense_GPU(struct point* image_pos, int MAXIMPERSOURCE, const runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const struct galaxy* sources, const int y_pos_loc, const int y_bound, const int source_id, double* grid_gradient_x, double* grid_gradient_y, int* img_pos)
{
#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
//
int nbgridcells_x = runmode->nbgridcells;
int nbgridcells_y = runmode->nbgridcells;
//
int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col > nbgridcells_x - 2) return;
//
int num_blocks_y = blockDim.y;
int row_y = threadIdx.y;
int block_size_y = y_bound/num_blocks_y;
//
const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);
//
int row_s = (row_y + 0)*block_size_y;
int row_e = MIN((row_y + 1)*block_size_y, nbgridcells_y);
if (row_y == BLOCK_SIZE_Y - 1)
{
block_size_y = y_bound - row_y*block_size_y - 1;
row_e = nbgridcells_y;
}
//
//for (int row = y_pos_loc; row < y_pos_loc + y_bound - 1; ++row)
int yp = y_pos_loc + row_s;
//
for (int row = 0; row < block_size_y; ++row)
{
//
int y_id = row;
int x_id = col;
//
// int images_total;
//
double x_pos = frame->xmin + ( x_id + 0 )*dx;
double y_pos = frame->ymin + ( y_id + yp )*dy;
//
// Define the upper + lower triangle, both together = square = pixel
//
struct triplet Tsup, Tinf;
//
Tsup.a.x = x_pos;
Tsup.b.x = x_pos;
Tsup.c.x = x_pos + dx;
Tinf.a.x = x_pos + dx;
Tinf.b.x = x_pos + dx;
Tinf.c.x = x_pos;
//
Tsup.a.y = y_pos;
Tsup.b.y = y_pos + dy;
Tsup.c.y = y_pos;
Tinf.a.y = y_pos + dy;
Tinf.b.y = y_pos;
Tinf.c.y = y_pos + dy;
//
// Lens to Sourceplane conversion of triangles
//
// double time = -myseconds();
//
struct triplet Timage;
struct triplet Tsource;
//
int g_loc = (y_id + row_s)*nbgridcells_x + x_id;
//
mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_upper_GPU(&Tsup, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
//
//int thread_found_image = 0;
//
#if 1
if (mychi_insideborder_GPU(&sources[source_id].center, &Tsource) == 1)
{
image_pos[atomicAdd(img_pos,1)] = mychi_barycenter_GPU(&Tsup);
}
#if 1
else
{
mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_lower_GPU(&Tinf, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
if (mychi_inside_GPU(&sources[source_id].center, &Tsource) == 1)
{
image_pos[atomicAdd(img_pos,1 )] = mychi_barycenter_GPU(&Tinf);
}
}
#endif
#endif
}
}
#endif
//
//
//
__global__
void
delense_GPU_v2(struct point* image_pos, int MAXIMPERSOURCE, const struct Potential_SOA *lens, const struct grid_param *frame, const struct galaxy* sources, const int y_pos_loc, const int y_bound, const int source_id, double* grid_gradient_x, double* grid_gradient_y, int* img_pos, int nbgridcells)
{
#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
//
int nbgridcells_x = nbgridcells;
int nbgridcells_y = nbgridcells;
//
int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col > nbgridcells_x - 2) return;
int row = blockIdx.y*blockDim.y + threadIdx.y;
//if (row + y_pos_loc > nbgridcells_y - 2) return;
if (row > y_bound - 2) return;
//if (!row) printf("y_bound = %d\n", y_bound);
//
const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
//
/*
int num_blocks_y = blockDim.y;
int row_y = threadIdx.y;
int block_size_y = y_bound/num_blocks_y;
//
//
int row_s = (row_y + 0)*block_size_y;
int row_e = MIN((row_y + 1)*block_size_y, nbgridcells_y);
if (row_y == BLOCK_SIZE_Y - 1)
{
block_size_y = y_bound - row_y*block_size_y - 1;
row_e = nbgridcells_y;
}
*/
//for (int row = y_pos_loc; row < y_pos_loc + y_bound - 1; ++row)
int yp = y_pos_loc;// + 0*row_s;
//
//for (int row = 0; row < block_size_y; ++row)
{
//
int y_id = row;
int x_id = col;
//
// int images_total;
//
double x_pos = frame->xmin + ( x_id + 0 )*dx;
double y_pos = frame->ymin + ( y_id + yp )*dy;
//
// Define the upper + lower triangle, both together = square = pixel
//
struct triplet Tsup, Tinf;
//
Tsup.a.x = x_pos;
Tsup.b.x = x_pos;
Tsup.c.x = x_pos + dx;
Tinf.a.x = x_pos + dx;
Tinf.b.x = x_pos + dx;
Tinf.c.x = x_pos;
//
Tsup.a.y = y_pos;
Tsup.b.y = y_pos + dy;
Tsup.c.y = y_pos;
Tinf.a.y = y_pos + dy;
Tinf.b.y = y_pos;
Tinf.c.y = y_pos + dy;
//
// Lens to Sourceplane conversion of triangles
//
// double time = -myseconds();
//
struct triplet Timage;
struct triplet Tsource;
//
int g_loc = (y_id /*+ row_s*/)*nbgridcells_x + x_id;
//
mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_upper_GPU(&Tsup, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
//
//int thread_found_image = 0;
//
if (mychi_insideborder_GPU(&sources[source_id].center, &Tsource) == 1)
{
image_pos[atomicAdd(img_pos,1)] = mychi_barycenter_GPU(&Tsup);
}
else
{
mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_lower_GPU(&Tinf, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
if (mychi_inside_GPU(&sources[source_id].center, &Tsource) == 1)
{
image_pos[atomicAdd(img_pos,1 )] = mychi_barycenter_GPU(&Tinf);
}
}
}
}
//
//
//
void delense_barycenter_GPU(struct point* image_pos, int MAXIMPERSOURCE, int* locimagesfound, int* numimg, const runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, const struct galaxy* sources, double* grid_gradient_x, double* grid_gradient_y, const int y_pos_loc, const int y_bound)
{
#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
int world_size = 1;
int world_rank = 0;
/*
#ifdef __WITH_MPI
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
#endif
*/
//const unsigned int nimagestot = runmode->nimagestot;
const unsigned int nsets = runmode->nsets;
const unsigned int nbgridcells = runmode->nbgridcells;
const unsigned int nbgridcells_x = runmode->nbgridcells;
const unsigned int nbgridcells_y = runmode->nbgridcells;
//
const int grid_size = nbgridcells;
//
const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
//
int images_total = 0;
int index = 0;
//
int block_size_x = BLOCK_SIZE_X;
int block_size_y = BLOCK_SIZE_Y;
//
int GRID_SIZE_X = (nbgridcells_x + block_size_x - 1)/block_size_x; // number of blocks
int GRID_SIZE_Y = (y_bound + block_size_y - 1)/block_size_y;
printf("%d: y_loc_pos = %d, y_bound = %d\n", world_rank, y_pos_loc, y_bound);
//cudasafe(cudaGetLastError(), "before allocation ");
//int GRID_SIZE_Y = 1;
//
dim3 threads(block_size_x, block_size_y);
dim3 grid (GRID_SIZE_X , GRID_SIZE_Y);
//
double alloc_time = -myseconds();
//
struct point* image_pos_gpu;
cudaMallocManaged((void**) &image_pos_gpu, grid_size*MAXIMPERSOURCE*sizeof(struct point));
//
int* img_pos;
cudaMallocManaged((void**) &img_pos, sizeof(int));
//
memset(locimagesfound, 0, nsets*sizeof(int));
//
//cudaDeviceSynchronize();
cudasafe(cudaGetLastError(), "before image_pos_gpu allocation ");
alloc_time += myseconds();
//
//
double time_gpu, time_cpu;
//
time_gpu = 0.;
time_cpu = 0.;
//
for( int source_id = 0; source_id < nsets; source_id ++)
{
*img_pos = 0;
//that slows down everything on P100s unfortunately
//cudaMemPrefetchAsync(image_pos_gpu, grid_size*block_size_y*MAXIMPERSOURCE*sizeof(struct point), 0);
int loc_images_found = 0;
//
int x_pos_loc = 0;
time_gpu -= myseconds();
//
delense_GPU_v2<<<grid, threads>>> (image_pos_gpu, MAXIMPERSOURCE, lens, frame, sources, y_pos_loc, y_bound, source_id, grid_gradient_x, grid_gradient_y, img_pos, nbgridcells);
cudaDeviceSynchronize();
cudasafe(cudaGetLastError(), "delense_GPU");
//
time_gpu += myseconds();
time_cpu -= myseconds();
memcpy(&image_pos[source_id*MAXIMPERSOURCE], image_pos_gpu, MAXIMPERSOURCE*sizeof(struct point));
memset(image_pos_gpu, 0, MAXIMPERSOURCE*sizeof(struct point));
locimagesfound[source_id] = *img_pos;
*numimg = *numimg + *img_pos;
time_cpu += myseconds();
}
cudaFree(image_pos_gpu);
cudaFree(img_pos);
printf("%d: num img found = %d, gpu time = %f, cpu time = %f\n", world_rank, *numimg, time_gpu, time_cpu);
}

Event Timeline