Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F82921074
gradientgpu.cu
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Sep 14, 05:39
Size
14 KB
Mime Type
text/x-c
Expires
Mon, Sep 16, 05:39 (2 d)
Engine
blob
Format
Raw Data
Handle
20775022
Attached To
R1448 Lenstool-HPC
gradientgpu.cu
View Options
#include <fstream>
#include "gradientgpu.cuh"
#include "structure.h"
void gradient_grid(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, const struct Potential_SOA *lens, int *Nlens){
int nBlocks_gpu = 0;
// Define the number of threads per block the GPU will use
cudaDeviceProp properties_gpu;
cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use
if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
{
fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
exit(-1);
}
else
{
nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock; // Get the maximum number of blocks with the chosen number of threads
// per Block that the GPU supports
}
grid_param *frame_gpu;
double *lens_x_gpu, *lens_y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu;
double *grid_grad_x_gpu, *grid_grad_y_gpu ;
//SIS//
//PIEMD8//
// Allocate variables on the GPU
cudasafe(cudaMalloc( (void**)&(lens_x_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
cudasafe(cudaMalloc( (void**)&(lens_y_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
cudasafe(cudaMalloc( (void**)&(b0_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
cudasafe(cudaMalloc( (void**)&(angle_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
cudasafe(cudaMalloc( (void**)&(epot_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
cudasafe(cudaMalloc( (void**)&(rcore_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
cudasafe(cudaMalloc( (void**)&(rcut_gpu), Nlens[1]*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
cudasafe(cudaMalloc( (void**)&(frame_gpu), sizeof(grid_param)),"Gradientgpu.cu : Alloc frame_gpu: " );
cudasafe(cudaMalloc( (void**)&(grid_grad_x_gpu), (frame->nbgridcells) * (frame->nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_x_gpu: " );
cudasafe(cudaMalloc( (void**)&(grid_grad_y_gpu), (frame->nbgridcells) * (frame->nbgridcells) *sizeof(double)),"Gradientgpu.cu : Alloc source_y_gpu: " );
// Copy values to the GPU
cudasafe(cudaMemcpy(lens_x_gpu,lens[1].position_x , Nlens[1]*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
cudasafe(cudaMemcpy(lens_y_gpu,lens[1].position_y , Nlens[1]*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
cudasafe(cudaMemcpy(b0_gpu,lens[1].b0 , Nlens[1]*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
cudasafe(cudaMemcpy(angle_gpu,lens[1].ellipticity_angle , Nlens[1]*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
cudasafe(cudaMemcpy(epot_gpu, lens[1].ellipticity_potential, Nlens[1]*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
cudasafe(cudaMemcpy(rcore_gpu, lens[1].rcore, Nlens[1]*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
cudasafe(cudaMemcpy(rcut_gpu, lens[1].rcut, Nlens[1]*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
cudasafe(cudaMemcpy(frame_gpu, frame, sizeof(grid_param), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy fame_gpu: " );
//printf( "%d %d",nBlocks_gpu,(frame->nbgridcells) * (frame->nbgridcells)/threadsPerBlock );
if (int((frame->nbgridcells) * (frame->nbgridcells)/threadsPerBlock) == 0){
gradient_grid_piemd_GPU<<<1,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,Nlens[1], lens_x_gpu, lens_y_gpu, b0_gpu, angle_gpu, epot_gpu, rcore_gpu, rcut_gpu);
}
gradient_grid_piemd_GPU<<<(frame->nbgridcells) * (frame->nbgridcells)/threadsPerBlock,threadsPerBlock>>>(grid_grad_x_gpu, grid_grad_y_gpu,frame_gpu,Nlens[1], lens_x_gpu, lens_y_gpu, b0_gpu, angle_gpu, epot_gpu, rcore_gpu, rcut_gpu);
cudasafe(cudaMemcpy( grid_grad_x, grid_grad_x_gpu, (frame->nbgridcells) * (frame->nbgridcells) *sizeof(double),cudaMemcpyDeviceToHost ),"Gradientgpu.cu : Copy source_x_gpu: " );
cudasafe(cudaMemcpy( grid_grad_y, grid_grad_y_gpu, (frame->nbgridcells) * (frame->nbgridcells) *sizeof(double), cudaMemcpyDeviceToHost),"Gradientgpu.cu : Copy source_y_gpu: " );
//printf ("%f %f \n", grid_grad_x[0],grid_grad_y[0]);
// Free GPU memory
cudaFree(lens_x_gpu);
cudaFree(lens_y_gpu);
cudaFree(b0_gpu);
cudaFree(angle_gpu);
cudaFree(epot_gpu);
cudaFree(rcore_gpu);
cudaFree(rcut_gpu);
cudaFree(grid_grad_x_gpu);
cudaFree(grid_grad_y_gpu);
}
__global__ void gradient_grid_piemd_GPU(double *grid_grad_x, double *grid_grad_y, const struct grid_param *frame, int Nlens, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) {
//*grad_x = *grad_y = 0.;
int bid=blockIdx.x; // index of the block (and of the set of images)
int tid=threadIdx.x; // index of the thread within the block
double dx,dy,x_pos,y_pos; //pixelsize
int grid_dim, index;
struct point true_coord, true_coord_rotation;
complex zis;
dx = (frame->xmax - frame->xmin)/(frame->nbgridcells-1);
dy = (frame->ymax - frame->ymin)/(frame->nbgridcells-1);
grid_dim = (frame->nbgridcells);
index = bid * threadsPerBlock + tid ;
while(index < grid_dim*grid_dim){
grid_grad_x[index] = 0.;
grid_grad_y[index] = 0.;
x_pos= frame->xmin + (index/grid_dim)*dx;
y_pos= frame->ymin + (index % grid_dim)*dy;
//printf("%f %f \n", x_pos, y_pos);
for(int iterator=0; iterator < Nlens; iterator++){
true_coord.x = x_pos - lens_x[iterator];
true_coord.y = y_pos - lens_y[iterator];
// Change the origin of the coordinate system to the center of the clump
true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
double x = true_coord_rotation.x;
double y = true_coord_rotation.y;
double eps = lens_epot[iterator];
double rc = lens_rcore[iterator];
double sqe = sqrt(eps);
//
double cx1 = (1. - eps)/(1. + eps);
double cxro = (1. + eps)*(1. + eps);
double cyro = (1. - eps)*(1. - eps);
//
double rem2 = x*x/cxro + y*y/cyro;
complex zci, znum, zden, zres;
double norm;
//
zci.re = 0;
zci.im = -0.5*(1. - eps*eps)/sqe;
//
znum.re = cx1*x;
znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
//
zden.re = x;
zden.im = 2.*rc*sqe - y;
norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden
//
zis.re = (znum.re*zden.re + znum.im*zden.im)/norm;
zis.im = (znum.im*zden.re - znum.re*zden.im)/norm;
norm = zis.re;
zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis)
zis.im = atan2(zis.im, norm);
// norm = zis.re;
zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) )
zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) )
//
zis.re = zres.re;
zis.im = zres.im;
grid_grad_x[index] += lens_b0[iterator]*zis.re;
grid_grad_y[index] += lens_b0[iterator]*zis.im;
}
bid += gridDim.x;
index = bid * threadsPerBlock + tid;
}
}
struct point grad_halo_piemd_SOA_GPU(const struct point *pImage, int Nlens, const struct Potential_SOA *lens){
int nBlocks_gpu = 0;
point grad;
grad.x = grad.y = 0;
/*
int nDevices;
cudaGetDeviceCount(&nDevices);
for (int i = 0; i < nDevices; i++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device Number: %d\n", i);
printf(" Device name: %s\n", prop.name);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
}*/
// Define the number of threads per block the GPU will use
cudaDeviceProp properties_gpu;
cudaGetDeviceProperties(&properties_gpu, 0); // Get properties of 0th GPU in use
if (properties_gpu.maxThreadsDim[0]<threadsPerBlock)
{
fprintf(stderr, "ERROR: The GPU has to support at least %u threads per block.\n", threadsPerBlock);
exit(-1);
}
else
{
nBlocks_gpu = properties_gpu.maxGridSize[0] / threadsPerBlock; // Get the maximum number of blocks with the chosen number of threads
// per Block that the GPU supports
}
//!!!!GPU variables Think about unified memory, What is the best way of allocating SOA memory to GPU
point *pImage_gpu;
double *x_gpu, *y_gpu, *b0_gpu, *angle_gpu, *epot_gpu, *rcore_gpu, *rcut_gpu;
double *grad_x_gpu, *grad_y_gpu ;
// Allocate variables on the GPU
cudasafe(cudaMalloc( (void**)&(pImage_gpu), sizeof(point)),"Gradientgpu.cu : Alloc pImage_gpu: " );
cudasafe(cudaMalloc( (void**)&(x_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc x_gpu: " );
cudasafe(cudaMalloc( (void**)&(y_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc y_gpu: " );
cudasafe(cudaMalloc( (void**)&(b0_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc b0_gpu: " );
cudasafe(cudaMalloc( (void**)&(angle_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc angle_gpu: " );
cudasafe(cudaMalloc( (void**)&(epot_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc epot_gpu: " );
cudasafe(cudaMalloc( (void**)&(rcore_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc rcore_gpu: " );
cudasafe(cudaMalloc( (void**)&(rcut_gpu), Nlens*sizeof(double)),"Gradientgpu.cu : Alloc rcut_gpu: " );
cudasafe(cudaMalloc( (void**)&(grad_x_gpu), sizeof(double)),"Gradientgpu.cu : Alloc grad_gpu: " );
cudasafe(cudaMalloc( (void**)&(grad_y_gpu), sizeof(double)),"Gradientgpu.cu : Alloc grad_gpu: " );
// Copy values to the GPU
cudasafe(cudaMemcpy(pImage_gpu, pImage, sizeof(point),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy pImage_gpu: " );
cudasafe(cudaMemcpy(x_gpu,lens->position_x , Nlens*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy x_gpu: " );
cudasafe(cudaMemcpy(y_gpu,lens->position_y , Nlens*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy y_gpu: " );
cudasafe(cudaMemcpy(b0_gpu,lens->b0 , Nlens*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy b0_gpu: " );
cudasafe(cudaMemcpy(angle_gpu,lens->ellipticity_angle , Nlens*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy angle_gpu: " );
cudasafe(cudaMemcpy(epot_gpu, lens->ellipticity_potential, Nlens*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy epot_gpu: " );
cudasafe(cudaMemcpy(rcore_gpu, lens->rcore, Nlens*sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy rcore_gpu: " );
cudasafe(cudaMemcpy(rcut_gpu, lens->rcut, Nlens*sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy rcut_gpu: " );
cudasafe(cudaMemcpy(grad_x_gpu, &grad.x, sizeof(double),cudaMemcpyHostToDevice ),"Gradientgpu.cu : Copy GRADX_gpu: " );
cudasafe(cudaMemcpy(grad_y_gpu, &grad.y, sizeof(double), cudaMemcpyHostToDevice),"Gradientgpu.cu : Copy FRADY_gpu: " );
piemd_GPU<<<nBlocks_gpu,threadsPerBlock>>>(grad_x_gpu, grad_y_gpu, pImage_gpu, Nlens, x_gpu, y_gpu, b0_gpu, angle_gpu, epot_gpu, rcore_gpu, rcut_gpu);
cudasafe(cudaMemcpy(&grad.x, grad_x_gpu, sizeof(double), cudaMemcpyDeviceToHost)," Gradientgpu x.cu : return grad_gpu ");
cudasafe(cudaMemcpy(&grad.y, grad_y_gpu, sizeof(double), cudaMemcpyDeviceToHost)," Gradientgpu y.cu : return grad_gpu ");
// Free GPU memory
cudaFree(pImage_gpu);
cudaFree(x_gpu);
cudaFree(y_gpu);
cudaFree(b0_gpu);
cudaFree(angle_gpu);
cudaFree(epot_gpu);
cudaFree(rcore_gpu);
cudaFree(rcut_gpu);
cudaFree(grad_x_gpu);
cudaFree(grad_y_gpu);
//printf("%f %f \n ", grad.x, grad.y);
return grad;
}
__global__ void piemd_GPU(double *grad_x, double *grad_y, point *pImage, int Nlens, double * lens_x, double *lens_y, double * lens_b0, double *lens_angle, double *lens_epot, double *lens_rcore, double *lens_rcut) {
//*grad_x = *grad_y = 0.;
int bid=blockIdx.x; // index of the block (and of the set of images)
int tid=threadIdx.x; // index of the thread within the block
struct point true_coord, true_coord_rotation;
complex zis;
//gridDim.x , threadsPerBlock;
int iterator = bid * threadsPerBlock + tid ;
//for(int iterator = 0; iterator < Nlens; ++iterator){
while(iterator < Nlens){
//printf(" %d %d %d %d \n",iterator , bid , threadsPerBlock , tid );
true_coord.x = pImage->x - lens_x[iterator];
true_coord.y = pImage->y - lens_y[iterator];
// Change the origin of the coordinate system to the center of the clump
true_coord_rotation = rotateCoordinateSystem_GPU(true_coord, lens_angle[iterator]);
double x = true_coord_rotation.x;
double y = true_coord_rotation.y;
double eps = lens_epot[iterator];
double rc = lens_rcore[iterator];
double sqe = sqrt(eps);
//
double cx1 = (1. - eps)/(1. + eps);
double cxro = (1. + eps)*(1. + eps);
double cyro = (1. - eps)*(1. - eps);
//
double rem2 = x*x/cxro + y*y/cyro;
complex zci, znum, zden, zres;
double norm;
//
zci.re = 0;
zci.im = -0.5*(1. - eps*eps)/sqe;
//
znum.re = cx1*x;
znum.im = 2.*sqe*sqrt(rc*rc + rem2) - y/cx1;
//
zden.re = x;
zden.im = 2.*rc*sqe - y;
norm = (zden.re*zden.re + zden.im*zden.im); // zis = znum/zden
//
zis.re = (znum.re*zden.re + znum.im*zden.im)/norm;
zis.im = (znum.im*zden.re - znum.re*zden.im)/norm;
norm = zis.re;
zis.re = log(sqrt(norm*norm + zis.im*zis.im)); // ln(zis) = ln(|zis|)+i.Arg(zis)
zis.im = atan2(zis.im, norm);
// norm = zis.re;
zres.re = zci.re*zis.re - zci.im*zis.im; // Re( zci*ln(zis) )
zres.im = zci.im*zis.re + zis.im*zci.re; // Im( zci*ln(zis) )
//
zis.re = zres.re;
zis.im = zres.im;
//
//zres.re = zis.re*b0;
//zres.im = zis.im*b0;
//
//
//grad->x += lens_b0[iterator]*zis.re;
//grad->y += lens_b0[iterator]*zis.im;
atomicAdd_double(grad_x,lens_b0[iterator]*zis.re);
atomicAdd_double(grad_y,lens_b0[iterator]*zis.im);
//printf("%f %f \n ", lens_b0[iterator]*zis.re, lens_b0[iterator]*zis.im);
//printf("%f %f \n ", *grad_x, *grad_y);
bid += gridDim.x;
iterator = bid * threadsPerBlock + tid ;
}
}
__device__ static double atomicAdd_double(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
__device__ struct point rotateCoordinateSystem_GPU(struct point P, double theta)
{
struct point Q;
Q.x = P.x*cos(theta) + P.y*sin(theta);
Q.y = P.y*cos(theta) - P.x*sin(theta);
return(Q);
}
Event Timeline
Log In to Comment