// gradientgpu.cu

#include <cstdio>   // fprintf
#include <cstdlib>  // exit
#include <fstream>
#include "gradientgpu.cuh"
#include "structure.h"

// The kernels launched below are assumed to be declared in gradientgpu.cuh.
// The device helpers defined at the end of this file are forward-declared
// here so they are visible at their first point of use (the static
// atomicAdd_double in particular cannot be declared in the header).
__device__ static double atomicAdd_double(double *address, double val);
__device__ struct point rotateCoordinateSystem_GPU(struct point P, double theta);

void gradient_grid(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens)
{
    int nBlocks_gpu = 0;
    // Define the number of threads per block the GPU will use
    cudaDeviceProp properties_gpu;
    cudaGetDeviceProperties(&properties_gpu, 0); // Get the properties of GPU 0, the device in use
    if (properties_gpu.maxThreadsDim[0] < threadsPerBlock)
    {
        fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
        exit(-1);
    }
    else
    {
        // Maximum number of blocks the GPU supports with the chosen number of threads per block
        nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;
    }
    grid_param *frame_gpu;
    double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu;
    double *grid_grad_x_gpu, *grid_grad_y_gpu;
    //SIS//
    //PIEMD8//
    // Allocate variables on the GPU
    cudasafe(cudaMalloc((void**)&(lens_x_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc lens_x_gpu: ");
    cudasafe(cudaMalloc((void**)&(lens_y_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc lens_y_gpu: ");
    cudasafe(cudaMalloc((void**)&(b0_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc b0_gpu: ");
    cudasafe(cudaMalloc((void**)&(angle_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc angle_gpu: ");
    cudasafe(cudaMalloc((void**)&(epot_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc epot_gpu: ");
    cudasafe(cudaMalloc((void**)&(rcore_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc rcore_gpu: ");
    cudasafe(cudaMalloc((void**)&(rcut_gpu), Nlens[1]*sizeof(double)), "Gradientgpu.cu : Alloc rcut_gpu: ");
    cudasafe(cudaMalloc((void**)&(frame_gpu), sizeof(grid_param)), "Gradientgpu.cu : Alloc frame_gpu: ");
    cudasafe(cudaMalloc((void**)&(grid_grad_x_gpu), (frame->nbgridcells) * (frame->nbgridcells) * sizeof(double)), "Gradientgpu.cu : Alloc grid_grad_x_gpu: ");
    cudasafe(cudaMalloc((void**)&(grid_grad_y_gpu), (frame->nbgridcells) * (frame->nbgridcells) * sizeof(double)), "Gradientgpu.cu : Alloc grid_grad_y_gpu: ");
    // Copy values to the GPU
    cudasafe(cudaMemcpy(lens_x_gpu, lens[1].position_x, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy lens_x_gpu: ");
    cudasafe(cudaMemcpy(lens_y_gpu, lens[1].position_y, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy lens_y_gpu: ");
    cudasafe(cudaMemcpy(b0_gpu, lens[1].b0, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy b0_gpu: ");
    cudasafe(cudaMemcpy(angle_gpu, lens[1].ellipticity_angle, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy angle_gpu: ");
    cudasafe(cudaMemcpy(epot_gpu, lens[1].ellipticity_potential, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy epot_gpu: ");
    cudasafe(cudaMemcpy(rcore_gpu, lens[1].rcore, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy rcore_gpu: ");
    cudasafe(cudaMemcpy(rcut_gpu, lens[1].rcut, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy rcut_gpu: ");
    cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy frame_gpu: ");
    //printf("%d %d", nBlocks_gpu, (frame->nbgridcells) * (frame->nbgridcells)/threadsPerBlock);
    // Launch enough blocks to cover the whole grid (at least one, which the
    // ceiling division guarantees even for small grids), clamped to what the
    // device supports; the kernel's grid-stride loop covers any remainder.
    // The original code launched the kernel a second time with zero blocks
    // whenever the grid fit in a single block, which is an invalid launch.
    int nBlocks = ((frame->nbgridcells) * (frame->nbgridcells) + threadsPerBlock - 1) / threadsPerBlock;
    if (nBlocks > nBlocks_gpu)
    {
        nBlocks = nBlocks_gpu;
    }
    gradient_grid_piemd_GPU<<<nBlocks, threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu, frame_gpu, Nlens[1], lens_x_gpu, lens_y_gpu, b0_gpu, angle_gpu, epot_gpu, rcore_gpu, rcut_gpu);
    cudasafe(cudaMemcpy(grid_grad_x, grid_grad_x_gpu, (frame->nbgridcells) * (frame->nbgridcells) * sizeof(double), cudaMemcpyDeviceToHost), "Gradientgpu.cu : Copy grid_grad_x_gpu: ");
    cudasafe(cudaMemcpy(grid_grad_y, grid_grad_y_gpu, (frame->nbgridcells) * (frame->nbgridcells) * sizeof(double), cudaMemcpyDeviceToHost), "Gradientgpu.cu : Copy grid_grad_y_gpu: ");
    //printf("%f %f \n", grid_grad_x[0], grid_grad_y[0]);
    // Free GPU memory
    cudaFree(lens_x_gpu);
    cudaFree(lens_y_gpu);
    cudaFree(b0_gpu);
    cudaFree(angle_gpu);
    cudaFree(epot_gpu);
    cudaFree(rcore_gpu);
    cudaFree(rcut_gpu);
    cudaFree(frame_gpu); // was missing: frame_gpu leaked otherwise
    cudaFree(grid_grad_x_gpu);
    cudaFree(grid_grad_y_gpu);
}
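
/*
 * Minimal usage sketch for gradient_grid (hypothetical driver, not part of
 * this file; field names follow structure.h as used above, and n_piemd
 * stands for the caller's PIEMD clump count):
 *
 *     struct grid_param frame;
 *     frame.xmin = frame.ymin = -50.;
 *     frame.xmax = frame.ymax =  50.;
 *     frame.nbgridcells = 128;
 *
 *     struct Potential_SOA lens[2];   // lens[1] holds the PIEMD clumps
 *     int Nlens[2] = {0, n_piemd};    // only Nlens[1] is used here
 *     // ... fill lens[1].position_x/y, b0, ellipticity_angle, etc.
 *
 *     int ncells = frame.nbgridcells * frame.nbgridcells;
 *     double *gx = (double *)malloc(ncells * sizeof(double));
 *     double *gy = (double *)malloc(ncells * sizeof(double));
 *     gradient_grid(gx, gy, &frame, lens, Nlens);
 */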
__global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, double *lens_x, double *lens_y, double *lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut)
{
    int bid = blockIdx.x;  // index of the block
    int tid = threadIdx.x; // index of the thread within the block
    double dx, dy, x_pos, y_pos; // pixel size and current grid position
    int grid_dim, index;
    struct point true_coord, true_coord_rotation;
    complex zis;
    dx = (frame->xmax - frame->xmin)/(frame->nbgridcells - 1);
    dy = (frame->ymax - frame->ymin)/(frame->nbgridcells - 1);
    grid_dim = (frame->nbgridcells);
    index = bid * threadsPerBlock + tid;
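    // For reference, the per-clump deflection accumulated below is a PIEMD
    // deflection written as a complex logarithm (this is a reading of the
    // code itself, in its own notation):
    //
    //     zis  = -i (1 - eps^2) / (2 sqrt(eps)) * ln(znum / zden)
    //     znum = cx1*x + i*(2*sqrt(eps)*sqrt(rc^2 + rem2) - y/cx1)
    //     zden = x + i*(2*rc*sqrt(eps) - y)
    //
    // with cx1 = (1-eps)/(1+eps) and rem2 = x^2/(1+eps)^2 + y^2/(1-eps)^2;
    // the deflection components are then b0*Re(zis) and b0*Im(zis).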
    // Grid-stride loop over all nbgridcells x nbgridcells pixels
    while (index < grid_dim*grid_dim)
    {
        grid_grad_x[index] = 0.;
        grid_grad_y[index] = 0.;
        // index/grid_dim gives the x step, index % grid_dim the y step
        x_pos = frame->xmin + (index/grid_dim)*dx;
        y_pos = frame->ymin + (index % grid_dim)*dy;
        //printf("%f %f \n", x_pos, y_pos);
        for (int iterator = 0; iterator < Nlens; iterator++)
        {
            // Change the origin of the coordinate system to the center of the clump
            true_coord.x = x_pos - lens_x[iterator];
            true_coord.y = y_pos - lens_y[iterator];
            true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
            double x = true_coord_rotation.x;
            double y = true_coord_rotation.y;
            double eps = lens_epot[iterator];
            double rc = lens_rcore[iterator];
            double sqe = sqrt(eps);
            //
            double cx1 = (1. - eps)/(1. + eps);
            double cxro = (1. + eps)*(1. + eps);
            double cyro = (1. - eps)*(1. - eps);
            //
            double rem2 = x*x/cxro + y*y/cyro;
            complex zci, znum, zden, zres;
            double norm;
            //
            zci.re = 0;
            zci.im = -0.5*(1. - eps*eps)/sqe;
            //
            znum.re = cx1*x;
            znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
            //
            zden.re = x;
            zden.im = 2.*rc*sqe - y;
            norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden
            //
            zis.re = (znum.re*zden.re + znum.im*zden.im)/norm;
            zis.im = (znum.im*zden.re - znum.re*zden.im)/norm;
            norm = zis.re;
            zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|) + i.Arg(zis)
            zis.im = atan2(zis.im, norm);
            //
            zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) )
            zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) )
            //
            zis.re = zres.re;
            zis.im = zres.im;
            grid_grad_x[index] += lens_b0[iterator]*zis.re;
            grid_grad_y[index] += lens_b0[iterator]*zis.im;
        }
        // Grid-stride advance: jump ahead by the total number of launched threads
        bid += gridDim.x;
        index = bid * threadsPerBlock + tid;
    }
}
struct point grad_halo_piemd_SOA_GPU(const struct point *pImage, int Nlens, const struct Potential_SOA *lens)
{
    int nBlocks_gpu = 0;
    point grad;
    grad.x = grad.y = 0;
    /*
    int nDevices;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
    */
    // Define the number of threads per block the GPU will use
    cudaDeviceProp properties_gpu;
    cudaGetDeviceProperties(&properties_gpu, 0); // Get the properties of GPU 0, the device in use
    if (properties_gpu.maxThreadsDim[0] < threadsPerBlock)
    {
        fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
        exit(-1);
    }
    else
    {
        // Maximum number of blocks the GPU supports with the chosen number of threads per block
        nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock;
    }
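    /*
     * Note: this capability check duplicates the one in gradient_grid above.
     * A shared helper along these lines (hypothetical, not part of the
     * original source) would avoid the repetition:
     *
     *     static int max_blocks_for_device(int device, int threads)
     *     {
     *         cudaDeviceProp p;
     *         cudaGetDeviceProperties(&p, device);
     *         if (p.maxThreadsDim[0] < threads) {
     *             fprintf(stderr, "ERROR: The GPU has to support at least %d threads per block.\n", threads);
     *             exit(-1);
     *         }
     *         return p.maxGridSize[0] / threads;
     *     }
     */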
    // GPU buffers. (TODO: consider unified memory; what is the best way to put the SoA data on the GPU?)
    point *pImage_gpu;
    double *x_gpu, *y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu;
    double *grad_x_gpu, *grad_y_gpu;
    // Allocate variables on the GPU
    cudasafe(cudaMalloc((void**)&(pImage_gpu), sizeof(point)), "Gradientgpu.cu : Alloc pImage_gpu: ");
    cudasafe(cudaMalloc((void**)&(x_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc x_gpu: ");
    cudasafe(cudaMalloc((void**)&(y_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc y_gpu: ");
    cudasafe(cudaMalloc((void**)&(b0_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc b0_gpu: ");
    cudasafe(cudaMalloc((void**)&(angle_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc angle_gpu: ");
    cudasafe(cudaMalloc((void**)&(epot_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc epot_gpu: ");
    cudasafe(cudaMalloc((void**)&(rcore_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc rcore_gpu: ");
    cudasafe(cudaMalloc((void**)&(rcut_gpu), Nlens*sizeof(double)), "Gradientgpu.cu : Alloc rcut_gpu: ");
    cudasafe(cudaMalloc((void**)&(grad_x_gpu), sizeof(double)), "Gradientgpu.cu : Alloc grad_x_gpu: ");
    cudasafe(cudaMalloc((void**)&(grad_y_gpu), sizeof(double)), "Gradientgpu.cu : Alloc grad_y_gpu: ");
    // Copy values to the GPU
    cudasafe(cudaMemcpy(pImage_gpu, pImage, sizeof(point), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy pImage_gpu: ");
    cudasafe(cudaMemcpy(x_gpu, lens->position_x, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy x_gpu: ");
    cudasafe(cudaMemcpy(y_gpu, lens->position_y, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy y_gpu: ");
    cudasafe(cudaMemcpy(b0_gpu, lens->b0, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy b0_gpu: ");
    cudasafe(cudaMemcpy(angle_gpu, lens->ellipticity_angle, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy angle_gpu: ");
    cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy epot_gpu: ");
    cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy rcore_gpu: ");
    cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, Nlens*sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy rcut_gpu: ");
    cudasafe(cudaMemcpy(grad_x_gpu, &grad.x, sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy grad_x_gpu: ");
    cudasafe(cudaMemcpy(grad_y_gpu, &grad.y, sizeof(double), cudaMemcpyHostToDevice), "Gradientgpu.cu : Copy grad_y_gpu: ");
    piemd_GPU<<<nBlocks_gpu, threadsPerBlock>>>(grad_x_gpu, grad_y_gpu, pImage_gpu, Nlens, x_gpu, y_gpu, b0_gpu, angle_gpu, epot_gpu, rcore_gpu, rcut_gpu);
    cudasafe(cudaMemcpy(&grad.x, grad_x_gpu, sizeof(double), cudaMemcpyDeviceToHost), "Gradientgpu.cu : return grad_x_gpu: ");
    cudasafe(cudaMemcpy(&grad.y, grad_y_gpu, sizeof(double), cudaMemcpyDeviceToHost), "Gradientgpu.cu : return grad_y_gpu: ");
    // Free GPU memory
    cudaFree(pImage_gpu);
    cudaFree(x_gpu);
    cudaFree(y_gpu);
    cudaFree(b0_gpu);
    cudaFree(angle_gpu);
    cudaFree(epot_gpu);
    cudaFree(rcore_gpu);
    cudaFree(rcut_gpu);
    cudaFree(grad_x_gpu);
    cudaFree(grad_y_gpu);
    //printf("%f %f \n ", grad.x, grad.y);
    return grad;
}
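
/*
 * Usage sketch (hypothetical, for illustration): evaluating the deflection at
 * a single image position. Note that this routine allocates, copies and frees
 * every lens array on each call, so for many evaluation points gradient_grid
 * above (or persistent device buffers) is the better fit.
 *
 *     struct point img;
 *     img.x = 1.2;
 *     img.y = -0.7;
 *     struct point g = grad_halo_piemd_SOA_GPU(&img, n_piemd, &lens_soa);
 */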
__global__ void piemd_GPU(double *grad_x, double *grad_y, point *pImage, int Nlens, double *lens_x, double *lens_y, double *lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut)
{
    //*grad_x = *grad_y = 0.; // the accumulators are zeroed on the host before the copy
    int bid = blockIdx.x;  // index of the block
    int tid = threadIdx.x; // index of the thread within the block
    struct point true_coord, true_coord_rotation;
    complex zis;
    // Grid-stride loop over the clumps: each thread starts at its global id
    int iterator = bid * threadsPerBlock + tid;
    while (iterator < Nlens)
    {
        //printf(" %d %d %d %d \n", iterator, bid, threadsPerBlock, tid);
        // Change the origin of the coordinate system to the center of the clump
        true_coord.x = pImage->x - lens_x[iterator];
        true_coord.y = pImage->y - lens_y[iterator];
        true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
        double x = true_coord_rotation.x;
        double y = true_coord_rotation.y;
        double eps = lens_epot[iterator];
        double rc = lens_rcore[iterator];
        double sqe = sqrt(eps);
        //
        double cx1 = (1. - eps)/(1. + eps);
        double cxro = (1. + eps)*(1. + eps);
        double cyro = (1. - eps)*(1. - eps);
        //
        double rem2 = x*x/cxro + y*y/cyro;
        complex zci, znum, zden, zres;
        double norm;
        //
        zci.re = 0;
        zci.im = -0.5*(1. - eps*eps)/sqe;
        //
        znum.re = cx1*x;
        znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
        //
        zden.re = x;
        zden.im = 2.*rc*sqe - y;
        norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden
        //
        zis.re = (znum.re*zden.re + znum.im*zden.im)/norm;
        zis.im = (znum.im*zden.re - znum.re*zden.im)/norm;
        norm = zis.re;
        zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|) + i.Arg(zis)
        zis.im = atan2(zis.im, norm);
        //
        zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) )
        zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) )
        //
        zis.re = zres.re;
        zis.im = zres.im;
        //
        // All threads accumulate into the same two scalars, so the adds must be atomic
        atomicAdd_double(grad_x, lens_b0[iterator]*zis.re);
        atomicAdd_double(grad_y, lens_b0[iterator]*zis.im);
        //printf("%f %f \n ", lens_b0[iterator]*zis.re, lens_b0[iterator]*zis.im);
        // Grid-stride advance
        bid += gridDim.x;
        iterator = bid * threadsPerBlock + tid;
    }
}
__device__ static double atomicAdd_double(double *address, double val)
{
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
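
// The compare-and-swap loop above emulates atomicAdd on doubles by
// reinterpreting the 64-bit payload: each thread re-reads the current value,
// adds its contribution, and retries until no other thread raced it. This is
// the standard workaround because the builtin atomicAdd(double*, double)
// only exists on devices of compute capability 6.0 and higher; on such
// devices the builtin could be used directly, e.g.
//
//     #if __CUDA_ARCH__ >= 600
//         atomicAdd(grad_x, contribution); // hypothetical direct use
//     #endif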
// Rotate P by -theta, i.e. express it in the frame aligned with the clump's
// ellipticity axis (at angle theta with respect to the grid axes).
__device__ struct point rotateCoordinateSystem_GPU(struct point P, double theta)
{
    struct point Q;
    Q.x = P.x*cos(theta) + P.y*sin(theta);
    Q.y = P.y*cos(theta) - P.x*sin(theta);
    return (Q);
}
