delense_GPU.cu
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Nov 29, 01:38

delense_GPU.cu
View Options

	// This file is part of lenstoolHPC
	// authors: gilles.fourestey@epfl.ch

	#include "delense_CPU_utils.hpp"
	#include "delense_GPU_utils.cuh"
	#include "structure_hpc.hpp"
	#include "cudafunctions.cuh"
	#ifdef __WITH_MPI
	#include <mpi.h>
	#endif

	// Timer defined in another translation unit. In this file its return values are
	// only ever subtracted/added to accumulate elapsed GPU and CPU time
	// (presumably a timestamp in seconds -- confirm against its definition).
	extern double myseconds();

	// Thread-block dimensions (x, y) used by delense_barycenter_GPU when it builds
	// the launch configuration for the delensing kernel.
	#define BLOCK_SIZE_X 16
	#define BLOCK_SIZE_Y 32


	// Smaller of two values. This is a function-like macro: the selected argument is
	// evaluated twice, so do not pass expressions with side effects.
	#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))


	//
	// Allocate `size` bytes of GPU-visible memory, store the new address in *d_mem,
	// and fill it with the `size` bytes found at h_mem.
	//
	// With __WITH_UM defined the buffer is allocated with cudaMallocManaged,
	// otherwise with cudaMalloc. Every CUDA call is checked through cudasafe().
	//
	inline
	void
	moveToGPU(void** d_mem, void* h_mem, int size)
	{
	#ifdef __WITH_UM
		const cudaError_t alloc_status = cudaMallocManaged((void**) d_mem, size);
		cudasafe(alloc_status, "memory allocation (UM)");
		//
		const cudaError_t copy_status = cudaMemcpy(*d_mem, h_mem, size, cudaMemcpyHostToDevice);
		cudasafe(copy_status, "H2D memory copy (UM)");
	#else
		const cudaError_t alloc_status = cudaMalloc((void**) d_mem, size);
		cudasafe(alloc_status, "memory allocation");
		//
		const cudaError_t copy_status = cudaMemcpy(*d_mem, h_mem, size, cudaMemcpyHostToDevice);
		cudasafe(copy_status, "H2D memory copy");
	#endif
	}

	//
	// Copy `size` bytes from the GPU buffer d_mem back into the host buffer h_mem.
	//
	// With __WITH_UM defined, d_mem is read directly from the host with memcpy.
	// NOTE(review): that path does not synchronize with the device here; it assumes
	// the caller has already waited for any kernel writing d_mem -- confirm.
	// Without __WITH_UM, a blocking device-to-host cudaMemcpy is issued and checked
	// through cudasafe().
	//
	inline
	void
	moveFromGPU(void* h_mem, void* d_mem, int size)
	{
	#ifdef __WITH_UM
		memcpy(h_mem, d_mem, size);
	#else
		// The source is device memory and the destination is host memory, so the
		// transfer kind must be DeviceToHost (a mismatched kind is undefined behaviour).
		cudasafe(cudaMemcpy(h_mem, d_mem, size, cudaMemcpyDeviceToHost), "D2H memory copy");
	#endif
	}
	//
	//
	//
	#if 1
	//
	// Delensing kernel, first version: one thread per (grid column, slab of rows).
	//
	// Launch layout:
	//   - x dimension: one thread per grid column (col = blockIdx.x*blockDim.x + threadIdx.x);
	//     columns beyond nbgridcells - 2 exit immediately.
	//   - y dimension: the y_bound local rows are split into blockDim.y slabs and thread
	//     threadIdx.y walks its slab sequentially. blockIdx.y is not used, so launching
	//     with gridDim.y > 1 would repeat the same work.
	//
	// For every pixel the upper triangle is mapped to the source plane first; the lower
	// triangle is only tried when the upper one does not contain the source centre.
	// Each hit appends the barycentre of the image-plane triangle to image_pos at the
	// slot returned by atomicAdd(img_pos, 1).
	//
	// runmode, frame, sources, the gradient grids, image_pos and img_pos are all
	// dereferenced on the device and must therefore be device-accessible.
	// lens and MAXIMPERSOURCE are not used by the kernel body.
	// NOTE(review): writes to image_pos are not bounds-checked against its capacity.
	//
	__global__
	void
	delense_GPU(struct point* image_pos, int MAXIMPERSOURCE, const runmode_param* runmode, const struct Potential_SOA* lens, const struct grid_param* frame, const struct galaxy* sources, const int y_pos_loc, const int y_bound, const int source_id, double* grid_gradient_x, double* grid_gradient_y, int* img_pos)
	{
	#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
		//
		const int nbgridcells_x = runmode->nbgridcells;
		//
		const int col = blockIdx.x*blockDim.x + threadIdx.x;
		// The last column has no pixel to its right.
		if (col > nbgridcells_x - 2) return;
		//
		const int num_blocks_y = blockDim.y;
		const int row_y        = threadIdx.y;
		int block_size_y       = y_bound/num_blocks_y;
		//
		const double dx = (frame->xmax - frame->xmin)/(runmode->nbgridcells - 1);
		const double dy = (frame->ymax - frame->ymin)/(runmode->nbgridcells - 1);
		//
		// First local row of this thread's slab.
		const int row_s = row_y*block_size_y;
		// The last slab absorbs the remainder of the division and stops one row short
		// of y_bound, because the last row has no pixel above it. The test uses the
		// actual blockDim.y so it stays correct for any launch configuration.
		if (row_y == num_blocks_y - 1)
		{
			block_size_y = y_bound - row_y*block_size_y - 1;
		}
		//
		// Global row of the slab start (y_pos_loc is the offset of the local rows).
		const int yp = y_pos_loc + row_s;
		//
		for (int row = 0; row < block_size_y; ++row)
		{
			const int y_id = row;
			const int x_id = col;
			//
			const double x_pos = frame->xmin + ( x_id + 0  )*dx;
			const double y_pos = frame->ymin + ( y_id + yp )*dy;
			//
			// Define the upper + lower triangle, both together = square = pixel
			//
			struct triplet Tsup, Tinf;
			//
			Tsup.a.x = x_pos;
			Tsup.b.x = x_pos;
			Tsup.c.x = x_pos + dx;
			Tinf.a.x = x_pos + dx;
			Tinf.b.x = x_pos + dx;
			Tinf.c.x = x_pos;
			//
			Tsup.a.y = y_pos;
			Tsup.b.y = y_pos + dy;
			Tsup.c.y = y_pos;
			Tinf.a.y = y_pos + dy;
			Tinf.b.y = y_pos;
			Tinf.c.y = y_pos + dy;
			//
			// Lens to Sourceplane conversion of triangles
			//
			struct triplet Tsource;
			//
			// Index into the gradient grids; it uses the local row (no y_pos_loc offset).
			const int g_loc = (y_id + row_s)*nbgridcells_x + x_id;
			//
			mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_upper_GPU(&Tsup, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
			//
			if (mychi_insideborder_GPU(&sources[source_id].center, &Tsource) == 1)
			{
				image_pos[atomicAdd(img_pos, 1)] = mychi_barycenter_GPU(&Tsup);
			}
			else
			{
				mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_lower_GPU(&Tinf, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
				if (mychi_inside_GPU(&sources[source_id].center, &Tsource) == 1)
				{
					image_pos[atomicAdd(img_pos, 1)] = mychi_barycenter_GPU(&Tinf);
				}
			}
		}
	}
	#endif
	//
	//
	//
	//
	// Delensing kernel, second version: one thread per pixel of the local grid slab.
	//
	// Launch layout: 2D grid of 2D blocks;
	//   col = blockIdx.x*blockDim.x + threadIdx.x  (threads with col > nbgridcells - 2 exit)
	//   row = blockIdx.y*blockDim.y + threadIdx.y  (threads with row > y_bound - 2 exit)
	// The last column and the last local row are skipped because they have no
	// neighbouring grid point to close the pixel.
	//
	// For every pixel the upper triangle is mapped to the source plane first; the lower
	// triangle is only tried when the upper one does not contain the source centre.
	// Each hit appends the barycentre of the image-plane triangle to image_pos at the
	// slot returned by atomicAdd(img_pos, 1), so the order of the results is not
	// deterministic.
	//
	// frame, sources, the gradient grids, image_pos and img_pos are dereferenced on
	// the device and must therefore be device-accessible. lens and MAXIMPERSOURCE are
	// not used by the kernel body.
	// NOTE(review): writes to image_pos are not bounds-checked against its capacity.
	//
	__global__
	void
	delense_GPU_v2(struct point* image_pos, int MAXIMPERSOURCE, const struct Potential_SOA* lens, const struct grid_param* frame, const struct galaxy* sources, const int y_pos_loc, const int y_bound, const int source_id, double* grid_gradient_x, double* grid_gradient_y, int* img_pos, int nbgridcells)
	{
	#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
		//
		const int nbgridcells_x = nbgridcells;
		//
		const int col = blockIdx.x*blockDim.x + threadIdx.x;
		if (col > nbgridcells_x - 2) return;
		const int row = blockIdx.y*blockDim.y + threadIdx.y;
		if (row > y_bound - 2) return;
		//
		const double dx = (frame->xmax - frame->xmin)/(nbgridcells - 1);
		const double dy = (frame->ymax - frame->ymin)/(nbgridcells - 1);
		//
		// Global row offset of the local slab.
		const int yp = y_pos_loc;
		//
		const int y_id = row;
		const int x_id = col;
		//
		const double x_pos = frame->xmin + ( x_id + 0  )*dx;
		const double y_pos = frame->ymin + ( y_id + yp )*dy;
		//
		// Define the upper + lower triangle, both together = square = pixel
		//
		struct triplet Tsup, Tinf;
		//
		Tsup.a.x = x_pos;
		Tsup.b.x = x_pos;
		Tsup.c.x = x_pos + dx;
		Tinf.a.x = x_pos + dx;
		Tinf.b.x = x_pos + dx;
		Tinf.c.x = x_pos;
		//
		Tsup.a.y = y_pos;
		Tsup.b.y = y_pos + dy;
		Tsup.c.y = y_pos;
		Tinf.a.y = y_pos + dy;
		Tinf.b.y = y_pos;
		Tinf.c.y = y_pos + dy;
		//
		// Lens to Sourceplane conversion of triangles
		//
		struct triplet Tsource;
		//
		// Index into the gradient grids; it uses the local row (no y_pos_loc offset).
		const int g_loc = y_id*nbgridcells_x + x_id;
		//
		mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_upper_GPU(&Tsup, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
		//
		if (mychi_insideborder_GPU(&sources[source_id].center, &Tsource) == 1)
		{
			image_pos[atomicAdd(img_pos, 1)] = mychi_barycenter_GPU(&Tsup);
		}
		else
		{
			mychi_transformtriangleImageToSourcePlane_SOA_grid_gradient_lower_GPU(&Tinf, sources[source_id].dr, &Tsource, grid_gradient_x, grid_gradient_y, g_loc, nbgridcells_x);
			if (mychi_inside_GPU(&sources[source_id].center, &Tsource) == 1)
			{
				image_pos[atomicAdd(img_pos, 1)] = mychi_barycenter_GPU(&Tinf);
			}
		}
	}

	//
	//
	//
	//
	// Host driver: for every source (runmode->nsets of them) launch delense_GPU_v2 on
	// the local grid slab and collect the image barycentres it finds.
	//
	// Parameters
	//   image_pos        host output; source s owns the MAXIMPERSOURCE entries starting
	//                    at image_pos[s*MAXIMPERSOURCE]
	//   MAXIMPERSOURCE   number of points copied back per source
	//   locimagesfound   host output, one counter per source (zeroed here first)
	//   numimg           host in/out; the per-source counts are added to *numimg
	//                    (the caller is expected to initialise it)
	//   runmode          provides nsets and nbgridcells
	//   lens, frame, sources, grid_gradient_x, grid_gradient_y
	//                    forwarded unchanged to the kernel, which dereferences frame,
	//                    sources and the gradient grids on the device
	//   nimages_strongLensing  not used by this function
	//   y_pos_loc, y_bound     first global row and number of rows of the local slab
	//
	// NOTE(review): the kernel counter can exceed MAXIMPERSOURCE, but only
	// MAXIMPERSOURCE points are copied back per source; locimagesfound still receives
	// the raw counter. Confirm that callers clamp it.
	//
	void delense_barycenter_GPU(struct point* image_pos, int MAXIMPERSOURCE, int* locimagesfound, int* numimg, const runmode_param* runmode, const struct Potential_SOA* lens, const struct grid_param* frame, const int nimages_strongLensing, const struct galaxy* sources, double* grid_gradient_x, double* grid_gradient_y, const int y_pos_loc, const int y_bound)
	{
	#define INDEX2D_BAR(y, x) (MAXIMPERSOURCE*y + x)
		const int world_rank = 0;
		//
		const int nsets       = (int) runmode->nsets;
		const int nbgridcells = (int) runmode->nbgridcells;
		//
		double time_gpu = 0.;
		double time_cpu = 0.;
		//
		printf("%d: y_loc_pos = %d, y_bound = %d\n", world_rank, y_pos_loc, y_bound);
		//
		if (nsets > 0)
		{
			memset(locimagesfound, 0, (size_t) nsets*sizeof(int));
		}
		//
		// Nothing to scan: a slab needs at least two rows and two columns to form a
		// pixel, and a zero-sized grid or allocation would be rejected by the runtime.
		if (nbgridcells < 2 || y_bound < 2 || MAXIMPERSOURCE < 1)
		{
			printf("%d: num img found = %d, gpu time = %f, cpu time = %f\n", world_rank, *numimg, time_gpu, time_cpu);
			return;
		}
		//
		// Byte counts are computed in size_t to avoid int overflow on large grids.
		const size_t source_bytes = (size_t) MAXIMPERSOURCE*sizeof(struct point);
		const size_t buffer_bytes = (size_t) nbgridcells*source_bytes;
		//
		// One thread per pixel; ceil-division so partial blocks cover the grid edge.
		dim3 threads(BLOCK_SIZE_X, BLOCK_SIZE_Y);
		dim3 grid((nbgridcells + BLOCK_SIZE_X - 1)/BLOCK_SIZE_X, (y_bound + BLOCK_SIZE_Y - 1)/BLOCK_SIZE_Y);
		//
		// Managed buffers: written by the kernel, read directly by the host below.
		struct point* image_pos_gpu = NULL;
		cudasafe(cudaMallocManaged((void**) &image_pos_gpu, buffer_bytes), "image_pos_gpu allocation");
		//
		int* img_pos = NULL;
		cudasafe(cudaMallocManaged((void**) &img_pos, sizeof(int)), "img_pos allocation");
		//
		// The part copied back must not contain uninitialised data for the first source.
		memset(image_pos_gpu, 0, source_bytes);
		//
		for (int source_id = 0; source_id < nsets; ++source_id)
		{
			*img_pos = 0;
			//
			time_gpu -= myseconds();
			//
			delense_GPU_v2<<<grid, threads>>> (image_pos_gpu, MAXIMPERSOURCE, lens, frame, sources, y_pos_loc, y_bound, source_id, grid_gradient_x, grid_gradient_y, img_pos, nbgridcells);
			// Launch-configuration errors show up immediately, execution errors at the sync.
			cudasafe(cudaGetLastError(), "delense_GPU launch");
			cudasafe(cudaDeviceSynchronize(), "delense_GPU");
			//
			time_gpu += myseconds();
			time_cpu -= myseconds();
			//
			memcpy(&image_pos[(size_t) source_id*MAXIMPERSOURCE], image_pos_gpu, source_bytes);
			memset(image_pos_gpu, 0, source_bytes);
			locimagesfound[source_id] = *img_pos;
			*numimg = *numimg + *img_pos;
			//
			time_cpu += myseconds();
		}
		//
		cudasafe(cudaFree(image_pos_gpu), "image_pos_gpu free");
		cudasafe(cudaFree(img_pos), "img_pos free");
		printf("%d: num img found = %d, gpu time = %f, cpu time = %f\n", world_rank, *numimg, time_gpu, time_cpu);
	}

delense_GPU.cuNo OneTemporaryActions

File Metadata

delense_GPU.cuView Options

Event Timeline

delense_GPU.cu
No OneTemporary
Actions

delense_GPU.cu
View Options