delense_GPU.cu

// This file is part of lenstoolHPC
// authors: gilles.fourestey@epfl.ch
#include "delense_CPU_utils.hpp"
#include "delense_GPU_utils.cuh"
#include "structure_hpc.hpp"
#include "cudafunctions.cuh"
#ifdef __WITH_MPI
#include <mpi.h>
#endif
extern double myseconds();
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 8
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
inline
void
moveToGPU(void** d_mem, void* h_mem, int size)
{
#ifdef __WITH_UM
    cudasafe(cudaMallocManaged((void**) d_mem, size), "memory allocation (UM)");
    cudasafe(cudaMemcpy(*d_mem, h_mem, size, cudaMemcpyHostToDevice), "H2D memory copy (UM)");
#else
    cudasafe(cudaMalloc((void**) d_mem, size), "memory allocation");
    cudasafe(cudaMemcpy(*d_mem, h_mem, size, cudaMemcpyHostToDevice), "H2D memory copy");
#endif
}
inline
void
moveFromGPU(void* h_mem, void* d_mem, int size)
{
#ifdef __WITH_UM
    //cudasafe(cudaMemcpy(h_mem, d_mem, size, cudaMemcpyDeviceToHost), "D2H memory copy (UM)");
    memcpy(h_mem, d_mem, size);
#else
    cudasafe(cudaMemcpy(h_mem, d_mem, size, cudaMemcpyDeviceToHost), "D2H memory copy");
#endif
}
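//
// Note: under __WITH_UM the buffers come from cudaMallocManaged, so they are
// directly host-accessible and moveFromGPU can fall back to a plain memcpy
// instead of an explicit cudaMemcpy(..., cudaMemcpyDeviceToHost).
//
// Illustrative usage sketch (hypothetical names, not part of the original code),
// assuming a host array h_buf of n doubles:
//
//     double* d_buf;
//     moveToGPU((void**) &d_buf, (void*) h_buf, n*sizeof(double));
//     // ... launch kernels reading/writing d_buf ...
//     moveFromGPU((void*) h_buf, (void*) d_buf, n*sizeof(double));
//     cudaFree(d_buf);
//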
//
//
//
__global__
void
delense_GPU(struct point* image_pos, const runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const struct galaxy* sources, const int y_pos_loc, const int y_bound, const int source_id, double* grid_gradient_x, double* grid_gradient_y)
{
#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
    //
    int nbgridcells_x = runmode->nbgridcells;
    int nbgridcells_y = runmode->nbgridcells;
    //
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if (col > nbgridcells_x - 2) return;
    //
    int num_blocks_y = blockDim.y;
    int row_y = threadIdx.y;
    int block_size_y = y_bound/num_blocks_y;
    //
    const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
    const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);
    //
    int img_pos = 0;
    //
    int row_s = (row_y + 0)*block_size_y;
    int row_e = MIN((row_y + 1)*block_size_y, nbgridcells_y);
    //
    //for (int row = y_pos_loc; row < y_pos_loc + y_bound - 1; ++row)
    int yp = y_pos_loc + row_s;
    //
    //printf("block id = %d, from %d to %d\n", row_y, yp, yp + block_size_y);
    //
    for (int row = 0; row < block_size_y; ++row)
    {
        //
        int y_id = row;
        int x_id = col;
        //
        // int images_total;
        //
        double x_pos = frame->xmin + ( x_id + 0 )*dx;
        double y_pos = frame->ymin + ( y_id + yp )*dy;
        //
        // Define the upper + lower triangle, both together = square = pixel
        //
        struct triplet Tsup, Tinf;
        //
        Tsup.a.x = x_pos;
        Tsup.b.x = x_pos;
        Tsup.c.x = x_pos + dx;
        Tinf.a.x = x_pos + dx;
        Tinf.b.x = x_pos + dx;
        Tinf.c.x = x_pos;
        //
        Tsup.a.y = y_pos;
        Tsup.b.y = y_pos + dy;
        Tsup.c.y = y_pos;
        Tinf.a.y = y_pos + dy;
        Tinf.b.y = y_pos;
        Tinf.c.y = y_pos + dy;
        //
        // Lens to Sourceplane conversion of triangles
        //
        // double time = -myseconds();
        //
        struct triplet Timage;
        struct triplet Tsource;
        //
        int g_loc = (y_id + row_s)*nbgridcells_x + x_id;
        //
        mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_upper_GPU(&Tsup, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
        //
        //int thread_found_image = 0;
        //
        if (mychi_insideborder_GPU(&sources[source_id].center, &Tsource) == 1)
        {
            //printf("found source = %d, row = %d, index 1 = %d, index 2 = %d, img_pos = %d\n", source_id, row + yp, col*num_blocks_y*MAXIMPERSOURCE, row_y*MAXIMPERSOURCE, img_pos);
            image_pos[col*num_blocks_y*MAXIMPERSOURCE + row_y*MAXIMPERSOURCE + img_pos] = mychi_barycenter_GPU(&Tsup);
            img_pos++;
        }
        else
        {
            mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_lower_GPU(&Tinf, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
            if (mychi_inside_GPU(&sources[source_id].center, &Tsource) == 1)
            {
                //printf("found source = %d, row = %d, index 1 = %d, index 2 = %d, img_pos = %d\n", source_id, row, col*num_blocks_y*MAXIMPERSOURCE, row_y*MAXIMPERSOURCE, img_pos);
                image_pos[col*num_blocks_y*MAXIMPERSOURCE + row_y*MAXIMPERSOURCE + img_pos] = mychi_barycenter_GPU(&Tinf);
                img_pos++;
            }
        }
    }
}
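//
// Reading aid (commentary added for clarity, not original code): each thread of
// delense_GPU walks one grid column (blockIdx.x*blockDim.x + threadIdx.x) over a
// strip of y_bound/blockDim.y consecutive rows selected by threadIdx.y. Every
// candidate image (a pixel triangle whose source-plane image contains the source
// center) is appended to that thread's private slice of image_pos, which starts at
//
//     col*blockDim.y*MAXIMPERSOURCE + threadIdx.y*MAXIMPERSOURCE
//
// and holds at most MAXIMPERSOURCE entries, so threads never write to the same slot
// and no atomics are needed. delense_barycenter_GPU below compacts the non-zero
// entries of this buffer on the host after each launch.
//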
//
//
//
__global__
void
delense_GPU_old(struct point* image_pos, const runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const struct galaxy* sources, const int y_pos_loc, const int y_bound, const int source_id, double* grid_gradient_x, double* grid_gradient_y)
{
#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
    //
    int nbgridcells_x = runmode->nbgridcells;
    int nbgridcells_y = runmode->nbgridcells;
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    if (row > y_bound) return;
    int num_blocks_y = gridDim.y;
    int block_size_y = nbgridcells_y/num_blocks_y;
    int x_pos_loc = threadIdx.y*block_size_y;
    //
    //int col = blockIdx.y*blockDim.y + threadIdx.y;
    //if (col > runmode->nbgridcells) return;
    //
    const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
    const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);
    //
    int img_pos = 0;
    //
    for (int col = 0; col < runmode->nbgridcells; ++col)
    {
        //
        int y_id = row;
        int x_id = col;
        //
        //int images_total;
        //
        double x_pos = frame->xmin + (x_pos_loc + x_id)*dx;
        double y_pos = frame->ymin + (y_pos_loc + y_id)*dy;
        //
        // Define the upper + lower triangle, both together = square = pixel
        //
        struct triplet Tsup, Tinf;
        //
        Tsup.a.x = x_pos;
        Tsup.b.x = x_pos;
        Tsup.c.x = x_pos + dx;
        Tinf.a.x = x_pos + dx;
        Tinf.b.x = x_pos + dx;
        Tinf.c.x = x_pos;
        //
        Tsup.a.y = y_pos;
        Tsup.b.y = y_pos + dy;
        Tsup.c.y = y_pos;
        Tinf.a.y = y_pos + dy;
        Tinf.b.y = y_pos;
        Tinf.c.y = y_pos + dy;
        //
        // Lens to Sourceplane conversion of triangles
        //
        // double time = -myseconds();
        //
        struct triplet Timage;
        struct triplet Tsource;
        //
        mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_upper_GPU(&Tsup, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, y_id*runmode->nbgridcells + x_id, runmode->nbgridcells);
        //
        int thread_found_image = 0;
        //
        if (mychi_insideborder_GPU(&sources[source_id].center, &Tsource) == 1)
        {
            image_pos[row*MAXIMPERSOURCE + img_pos] = mychi_barycenter_GPU(&Tsup);
            //printf("%d %d: T = %f %f\n", row, img_pos, image_pos[row*MAXIMPERSOURCE + img_pos].x, image_pos[row*MAXIMPERSOURCE + img_pos].y);
            img_pos++;
        }
        else
        {
            mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_lower_GPU(&Tinf, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, y_id*runmode->nbgridcells + x_id, runmode->nbgridcells);
            if (mychi_inside_GPU(&sources[source_id].center, &Tsource) == 1)
            {
                image_pos[row*MAXIMPERSOURCE + img_pos] = mychi_barycenter_GPU(&Tinf);
                //printf("%d %d: T = %f %f\n", row, img_pos, image_pos[row*MAXIMPERSOURCE + img_pos].x, image_pos[row*MAXIMPERSOURCE + img_pos].y);
                img_pos++;
            }
        }
    }
}
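//
// Note: delense_GPU_old appears to be an earlier variant kept for reference. It
// assigns one thread per grid row and loops over all columns, storing at most
// MAXIMPERSOURCE candidates per row at image_pos[row*MAXIMPERSOURCE + ...], whereas
// the current delense_GPU above parallelizes over columns and row strips.
//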
//
//
//
#if 1
void delense_barycenter_GPU(struct point* image_pos, int* locimagesfound, int* numimg, const runmode_param *runmode, const struct Potential_SOA *lens, const struct grid_param *frame, const int *nimages_strongLensing, const struct galaxy* sources, double* grid_gradient_x, double* grid_gradient_y)
{
#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
    //const unsigned int nimagestot = runmode->nimagestot;
    const unsigned int nsets = runmode->nsets;
    const unsigned int nbgridcells = runmode->nbgridcells;
    const unsigned int nbgridcells_x = runmode->nbgridcells;
    const unsigned int nbgridcells_y = runmode->nbgridcells;
    //
    int world_size = 1;
    int world_rank = 0;
    //
#ifdef __WITH_MPI
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
#endif
    //
    unsigned int verbose = (world_rank == 0);
    //
    int numimg_loc = 0;
    //
    for (int ii = 0; ii < nsets; ++ii)
        numimg_loc += nimages_strongLensing[ii];
    //
    //const int nbgridcells = runmode->nbgridcells;
    const int grid_size = nbgridcells;
    const int loc_grid_size = nbgridcells/world_size;
    //
    double y_len = fabs(frame->ymax - frame->ymin);
    int y_len_loc = nbgridcells/world_size;
    int y_pos_loc = (int) world_rank*y_len_loc;
    int y_bound = y_len_loc;
    //
    if ((world_rank + 1) != world_size) y_bound += 1;
    //
    printf("%d: numimg_loc = %d, y_len_loc = %d, y_pos_loc = %d, y_bound = %d\n", world_rank, numimg_loc, y_len_loc, y_pos_loc, y_bound);
    //
    const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
    const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
    //
    int images_total = 0;
    int index = 0;
    //
    int block_size_x = BLOCK_SIZE_X;
    int block_size_y = BLOCK_SIZE_Y;
    //
    int GRID_SIZE_X = (nbgridcells_y + block_size_y - 1)/block_size_y; // number of blocks
    int GRID_SIZE_Y = 1;
    //
    dim3 threads(block_size_x, block_size_y);
    dim3 grid   (GRID_SIZE_X , GRID_SIZE_Y);
    //
    double alloc_time = -myseconds();
    struct galaxy* sources_gpu;
    moveToGPU((void**) &sources_gpu, (void*) sources, sizeof(galaxy)*nsets);
    //
    struct grid_param* frame_gpu;
    moveToGPU((void**) &frame_gpu  , (void*) frame  , sizeof(grid_param));
    //
    struct runmode_param* runmode_gpu;
    moveToGPU((void**) &runmode_gpu, (void*) runmode, sizeof(runmode_param));
    //
    struct point* image_pos_gpu;
    cudaMallocManaged((void**) &image_pos_gpu, grid_size*block_size_y*MAXIMPERSOURCE*sizeof(struct point));
    //
    //printf("image_pos size = %d\n", grid_size*block_size_y*MAXIMPERSOURCE);
    //
#if 1
    //
    memset(locimagesfound, 0, nsets*sizeof(int));
    //
    cudaDeviceSynchronize();
    cudasafe(cudaGetLastError(), "before image_pos_gpu allocation ");
    alloc_time += myseconds();
    //
#endif
    //
    double time_gpu, time_cpu;
    //
    time_gpu = 0.;
    time_cpu = 0.;
    //
    for (int source_id = 0; source_id < nsets; source_id++)
    {
        int loc_images_found = 0;
        //
        int x_pos_loc = 0;
        cudasafe(cudaGetLastError(), "before delense_GPU");
        time_gpu -= myseconds();
        //
        delense_GPU<<<grid, threads>>> (image_pos_gpu, runmode_gpu, lens, frame_gpu, sources_gpu, y_pos_loc, y_bound, source_id, grid_gradient_x, grid_gradient_y);
        //
        cudaDeviceSynchronize();
        cudasafe(cudaGetLastError(), "after delense_GPU");
        time_gpu += myseconds();
        fflush(stdout);
#if 1
        time_cpu -= myseconds();
        int numimg_cpu = 0;
#pragma omp parallel for reduction(+ : numimg_cpu)
        for (int ii = 0; ii < block_size_y*nbgridcells_x*MAXIMPERSOURCE; ++ii)
        {
            struct point T = image_pos_gpu[ii];
            if ((T.x != 0.) || (T.y != 0.))
            {
#pragma omp critical
                {
                    image_pos[source_id*MAXIMPERSOURCE + locimagesfound[source_id]] = T;
                    locimagesfound[source_id]++;
                    numimg_cpu++;
                }
                image_pos_gpu[ii].x = image_pos_gpu[ii].y = 0.;
            }
        }
        //memset(image_pos_gpu, 0, nbgridcells*MAXIMPERSOURCE*sizeof(struct point));
        //memset(image_pos_gpu, 0, loc_grid_size*MAXIMPERSOURCE*sizeof(struct point));
        *numimg = *numimg + numimg_cpu;
        time_cpu += myseconds();
        //memset(image_pos_gpu, 0, 2*nbgridcells*nbgridcells*sizeof(struct point));
#endif
    }
    cudaFree(image_pos_gpu);
    printf("num img found = %d, gpu time = %f, cpu time = %f\n", *numimg, time_gpu, time_cpu);
}
#endif
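//
// Call-pattern sketch (hypothetical variable names, not original code): the wrapper
// expects the lens model and the gradient grids to already be device-accessible,
// while image_pos (nsets*MAXIMPERSOURCE points) and locimagesfound (nsets ints) are
// host buffers filled by the compaction loop above. A caller would look roughly like
//
//     int numimg = 0;
//     delense_barycenter_GPU(image_pos, locimagesfound, &numimg,
//                            runmode, lens_gpu, frame,
//                            nimages_strongLensing, sources,
//                            grid_gradient_x_gpu, grid_gradient_y_gpu);
//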
