Page MenuHomec4science

gradients_fd4.cu
No OneTemporary

File Metadata

Created
Wed, Oct 9, 23:45

gradients_fd4.cu

#include <stdio.h>
//#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
//#include "cudalloc.h"
#include "unistd.h"
#include "gradients_cuda.h"
/* CUDA error macro: checks the cudaError_t returned by a runtime API call
   and, on failure, prints the file/line and the CUDA error string, then
   aborts the program. Wrap every CUDA runtime call with this. */
#define CUDA_SAFE_CALL(call) do { \
cudaError_t err = call; \
if(cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
// 4th-order finite-difference stencil weights, kept in constant memory so a
// warp reading the same element gets a broadcast load.
//   coef_int       : 4-point centred interpolation between staggered grids
//   coef_der1_stag : 4-point staggered first-derivative weights
//   coef_der2      : 5-point centred second-derivative weights
//   coef_der1_n2n  : 4-point centred (node-to-node) first-derivative weights
//   coef_der2_stag : staggered second-derivative weights
//                    (unused in this chunk — presumably used elsewhere; verify)
__constant__ __device__ double coef_int[4] = {-1.0/16.0, 9.0/16.0, 9.0/16.0, -1.0/16.0 };
__constant__ __device__ double coef_der1_stag[4] = { +1.0/24.0, -9.0/8.0, 9.0/8.0, -1.0/24.0 };
__constant__ __device__ double coef_der2[5] = {-1.0/12.0, 4.0/3.0, -5.0/2.0, +4.0/3.0, -1.0/12.0};
__constant__ __device__ double coef_der1_n2n[4] = { 1.0/12.0, -2.0/3.0, 2.0/3.0, -1.0/12.0};
__constant__ __device__ double coef_der2_stag[4] = { 1.0/2., -1.0/2.0, -1.0/2., 1.0/2.0};
// Position of a flat index within its innermost contiguous run of length nz
// (i.e. the remainder of index with respect to nz).
__device__ int xyindex(int index, int nz )
{
    int runs = index / nz;          // number of whole nz-runs before index
    return index - runs * nz;       // identical to index % nz in C
}
// Flat index for the layout where z is the fastest dimension, then x, then y:
// disp = iz + nz*(ix + nx*iy). ny is accepted for signature symmetry with
// dispc() but does not enter the computation.
__device__ int disp(int ix, int iy, int iz, int nx, int ny, int nz)
{
    int slab = nz * nx;             // number of points in one y-slab
    return iy * slab + ix * nz + iz;
}
// Flat index for the layout where y is the fastest dimension, then x, then z:
// dispc = iy + ny*(ix + nx*iz). This is the layout used by all kernels below.
__device__ int dispc(int ix, int iy, int iz, int nx, int ny)
{
    int plane = ny * nx;            // number of points in one z-plane
    return iz * plane + ix * ny + iy;
}
//GRAD_N2N
// 4th-order centred node-to-node first derivative along the direction selected
// by which_grad (exactly one entry of which_grad[0..2] should be 1; the others
// 0 — set by the host-side launchers below).
//   f_in, f_out : device arrays of nx*ny*nz doubles, laid out per dispc()
//   fact        : scale applied to the derivative (e.g. an inverse spacing —
//                 TODO confirm units with callers)
//   which_grad  : 3-int managed direction selector (see init_which_grad_)
// Cleanup: removed the block of dead commented-out experimental variants that
// obscured the live statement; the arithmetic itself is unchanged.
__device__ void grad_n2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz,
                                   double fact, int *which_grad)
{
    int index = blockDim.x*blockIdx.x+threadIdx.x;
    // Linear strides for +/-1 and +/-2 grid steps in the chosen direction.
    int d1=dispc(which_grad[0],which_grad[1],which_grad[2],nx,ny);
    int d2=dispc(2*which_grad[0],2*which_grad[1],2*which_grad[2],nx,ny);
    // Interior points only: the full 5-point stencil must stay inside the
    // flattened array (boundary handling is presumably done via ghost layers
    // elsewhere — confirm with callers).
    if (index<(nx*ny*nz-d2) && index>=d2){
        f_out[index] = fact*coef_der1_n2n[0]*f_in[index-d2] + fact*coef_der1_n2n[1]*f_in[index-d1]+
                       fact*coef_der1_n2n[2]*f_in[index+d1] + fact*coef_der1_n2n[3]*f_in[index+d2];
    }
}
// Kernel entry point: thin wrapper so the node-to-node derivative body
// (grad_n2n_fd4Device) can also be reused from other device code.
// Expects a 1D launch covering at least nx*ny*nz threads.
__global__ void grad_n2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz,
double fact, int *which_grad)
{
grad_n2n_fd4Device(f_in,f_out, nx, ny, nz, fact, which_grad);
}
// Fortran-callable launcher: 4th-order node-to-node d/dy of f_in into f_out.
// Array arguments are device pointers; scalars arrive by reference (Fortran
// convention). which_grad must have been allocated via init_which_grad_().
// Fix: kernel-launch errors (bad configuration) were previously unchecked —
// cudaGetLastError() is now verified before the synchronize.
extern "C" void grady_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    int n=(*nx)*(*ny)*(*nz);
    which_grad[0] = 0;   // select the y direction: unit stride in dispc()
    which_grad[1] = 1;
    which_grad[2] = 0;
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);   // ceil-div
    grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact, which_grad);
    CUDA_SAFE_CALL(cudaGetLastError());        // launch-configuration errors
    CUDA_SAFE_CALL(cudaDeviceSynchronize());   // asynchronous execution errors
}
// Fortran-callable launcher: 4th-order node-to-node d/dx of f_in into f_out.
// Array arguments are device pointers; scalars arrive by reference (Fortran
// convention). which_grad must have been allocated via init_which_grad_().
// Fix: removed the stale duplicate assignment "which_grad[1] = 0;" that
// appeared twice in the original.
extern "C" void gradx_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    int n=(*nx)*(*ny)*(*nz);
    which_grad[0] = 1;   // select the x direction
    which_grad[1] = 0;
    which_grad[2] = 0;
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((n + threadsPerBlock.x -1) / threadsPerBlock.x);   // ceil-div
    grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact, which_grad);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Fortran-callable launcher: 4th-order node-to-node d/dz of f_in into f_out.
// Array arguments are device pointers; scalars arrive by reference (Fortran
// convention). which_grad must have been allocated via init_which_grad_().
extern "C" void gradz_n2n_fd4_cuda_( double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    // Direction selector: unit step along z.
    which_grad[0] = 0;
    which_grad[1] = 0;
    which_grad[2] = 1;
    const int npoints = (*nx) * (*ny) * (*nz);
    dim3 threadsPerBlock = 256;
    dim3 numBlocks((npoints + threadsPerBlock.x - 1) / threadsPerBlock.x);
    grad_n2n_fd4<<<numBlocks, threadsPerBlock>>>(f_in, f_out, *nx, *ny, *nz, *fact, which_grad);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Kernel: 4th-order staggered first derivative along z, combined with 4-point
// interpolation along y — per the "_v2n" naming this presumably maps from the
// staggered (v) grid to the node (n) grid; confirm against the grid layout.
// The inner sums apply the fact-scaled derivative weights (coef_der1_stag)
// over z offsets {-1,0,1,2}; the outer sum applies the interpolation weights
// (coef_int) over y offsets {-1,0,1,2}.
// NOTE(review): the guard below only clips the two ends of the flattened
// array; stencils straddling an x/y boundary are not excluded here —
// presumably ghost layers make those reads valid; verify with callers.
__global__ void gradz_v2n(double *f_in, double *f_out, int nx, int ny, int nz, double fact)
{
int index=blockDim.x*blockIdx.x+threadIdx.x;
// dRC = flat offset of the stencil point with y offset R-2 and z offset C-2
// (rows vary y, columns vary z), via dispc(0, iy, iz, nx, ny).
int d11=index+dispc(0,-1,-1,nx,ny);
int d12=index+dispc(0,-1,0,nx,ny);
int d13=index+dispc(0,-1, 1,nx,ny);
int d14=index+dispc(0,-1, 2,nx,ny);
int d21=index+dispc(0, 0,-1,nx,ny);
int d22=index+dispc(0, 0,0,nx,ny);
int d23=index+dispc(0, 0, 1,nx,ny);
int d24=index+dispc(0, 0, 2,nx,ny);
int d31=index+dispc(0, 1,-1,nx,ny);
int d32=index+dispc(0, 1,0,nx,ny);
int d33=index+dispc(0, 1, 1,nx,ny);
int d34=index+dispc(0, 1, 2,nx,ny);
int d41=index+dispc(0, 2,-1,nx,ny);
int d42=index+dispc(0, 2,0,nx,ny);
int d43=index+dispc(0, 2, 1,nx,ny);
int d44=index+dispc(0, 2, 2,nx,ny);
// Pre-multiply the derivative weights by fact once per thread.
double coef_der[4];
for(int i=0; i<4; i++)
coef_der[i]=coef_der1_stag[i]*fact;
// d11 is the smallest offset and d44 the largest, so this keeps the whole
// 4x4 stencil inside the array.
if (d11>=0 && d44<(nx*ny*nz)){
f_out[index] = coef_int[0] * (coef_der[0]*f_in[d11] + coef_der[1]*f_in[d12] +
coef_der[2]*f_in[d13] + coef_der[3]*f_in[d14])+
coef_int[1] * (coef_der[0]*f_in[d21] + coef_der[1]*f_in[d22] +
coef_der[2]*f_in[d23] + coef_der[3]*f_in[d24])+
coef_int[2] * (coef_der[0]*f_in[d31] + coef_der[1]*f_in[d32] +
coef_der[2]*f_in[d33] + coef_der[3]*f_in[d34]) +
coef_int[3] * (coef_der[0]*f_in[d41] + coef_der[1]*f_in[d42] +
coef_der[2]*f_in[d43] + coef_der[3]*f_in[d44]);
}
}
// Fortran-callable launcher for gradz_v2n (staggered d/dz with y-interpolation).
// Array arguments are device pointers; scalars arrive by reference.
extern "C" void gradz_v2n_fd4_cuda_(double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    const int total = (*nx) * (*ny) * (*nz);
    dim3 blockDims = 256;
    dim3 gridDims((total + blockDims.x - 1) / blockDims.x);   // ceil-div
    gradz_v2n<<<gridDims, blockDims>>>(f_in, f_out, *nx, *ny, *nz, *fact);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Device body: 4th-order staggered first derivative along y, combined with
// 4-point interpolation along z — per the "_v2n" naming this presumably maps
// from the staggered (v) grid to the node (n) grid; confirm against callers.
// The inner sums apply the fact-scaled derivative weights (coef_der1_stag)
// over y offsets {-1,0,1,2}; the outer sum applies the interpolation weights
// (coef_int) over z offsets {-1,0,1,2} — the transpose of gradz_v2n's stencil.
// NOTE(review): as in gradz_v2n, the bounds guard only clips the array ends;
// boundary wraparound in x/y is presumably handled by ghost layers — verify.
__device__ void grady_v2n_fd4Device(double *f_in, double *f_out, int nx, int ny, int nz, double fact)
{
int index=blockDim.x*blockIdx.x+threadIdx.x;
// dRC = flat offset with z offset R-2 and y offset C-2 (rows vary z,
// columns vary y), via dispc(0, iy, iz, nx, ny).
int d11=index+dispc(0,-1,-1,nx,ny);
int d12=index+dispc(0, 0,-1,nx,ny);
int d13=index+dispc(0, 1,-1,nx,ny);
int d14=index+dispc(0, 2,-1,nx,ny);
int d21=index+dispc(0,-1, 0,nx,ny);
int d22=index+dispc(0, 0, 0,nx,ny);
int d23=index+dispc(0, 1, 0,nx,ny);
int d24=index+dispc(0, 2, 0,nx,ny);
int d31=index+dispc(0,-1, 1,nx,ny);
int d32=index+dispc(0, 0, 1,nx,ny);
int d33=index+dispc(0, 1, 1,nx,ny);
int d34=index+dispc(0, 2, 1,nx,ny);
int d41=index+dispc(0,-1, 2,nx,ny);
int d42=index+dispc(0,0 , 2,nx,ny);
int d43=index+dispc(0, 1, 2,nx,ny);
int d44=index+dispc(0, 2, 2,nx,ny);
// Pre-multiply the derivative weights by fact once per thread.
double coef_der[4];
for(int i=0; i<4; i++)
coef_der[i]=coef_der1_stag[i]*fact;
// d11 is the smallest offset and d44 the largest, so this keeps the whole
// 4x4 stencil inside the array.
if (d11>=0 && d44<(nx*ny*nz)){
f_out[index] = coef_int[0] * (coef_der[0]*f_in[d11] + coef_der[1]*f_in[d12] +
coef_der[2]*f_in[d13] + coef_der[3]*f_in[d14])+
coef_int[1] * (coef_der[0]*f_in[d21] + coef_der[1]*f_in[d22] +
coef_der[2]*f_in[d23] + coef_der[3]*f_in[d24])+
coef_int[2] * (coef_der[0]*f_in[d31] + coef_der[1]*f_in[d32] +
coef_der[2]*f_in[d33] + coef_der[3]*f_in[d34]) +
coef_int[3] * (coef_der[0]*f_in[d41] + coef_der[1]*f_in[d42] +
coef_der[2]*f_in[d43] + coef_der[3]*f_in[d44]);
}
}
// Kernel entry point: thin wrapper around grady_v2n_fd4Device.
// Expects a 1D launch covering at least nx*ny*nz threads.
__global__ void grady_v2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz, double fact){
grady_v2n_fd4Device(f_in, f_out, nx, ny, nz, fact);
}
// Fortran-callable launcher for grady_v2n_fd4 (staggered d/dy with
// z-interpolation). Array arguments are device pointers.
extern "C" void grady_v2n_fd4_cuda_(double *f_in, double *f_out, int *nx, int *ny, int *nz, double *fact){
    const int total = (*nx) * (*ny) * (*nz);
    dim3 blockDims = 256;
    dim3 gridDims((total + blockDims.x - 1) / blockDims.x);   // ceil-div
    grady_v2n_fd4<<<gridDims, blockDims>>>(f_in, f_out, *nx, *ny, *nz, *fact);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Device body: 4th-order 2D interpolation in y and z (tensor product of the
// 4-point coef_int weights in both directions) — per the "_v2n" naming this
// presumably interpolates from the staggered (v) grid onto the node (n) grid;
// confirm against the grid layout.
// NOTE(review): as in the v2n gradient kernels, the bounds guard only clips
// the array ends; x/y boundary wraparound is presumably covered by ghost
// layers — verify with callers.
__device__ void interp_v2n_f4Device(double *f_in, double *f_out, int nx, int ny, int nz)
{
int index=blockDim.x*blockIdx.x+threadIdx.x;
// dRC = flat offset with y offset R-2 and z offset C-2 (rows vary y,
// columns vary z), via dispc(0, iy, iz, nx, ny).
int d11=index+dispc(0,-1,-1,nx,ny);
int d12=index+dispc(0,-1,0,nx,ny);
int d13=index+dispc(0,-1,1,nx,ny);
int d14=index+dispc(0,-1,2,nx,ny);
int d21=index+dispc(0,0,-1,nx,ny);
int d22=index+dispc(0,0,0,nx,ny);
int d23=index+dispc(0,0,1,nx,ny);
int d24=index+dispc(0,0,2,nx,ny);
int d31=index+dispc(0,1,-1,nx,ny);
int d32=index+dispc(0,1,0,nx,ny);
int d33=index+dispc(0,1,1,nx,ny);
int d34=index+dispc(0,1,2,nx,ny);
int d41=index+dispc(0,2,-1,nx,ny);
int d42=index+dispc(0,2,0,nx,ny);
int d43=index+dispc(0,2,1,nx,ny);
int d44=index+dispc(0,2,2,nx,ny);
// d11 is the smallest offset and d44 the largest, so this keeps the whole
// 4x4 stencil inside the array.
if (d11>=0 && d44<(nx*ny*nz)){
f_out[index] = coef_int[0] * (coef_int[0]*f_in[d11] + coef_int[1]*f_in[d12] +
coef_int[2]*f_in[d13] + coef_int[3]*f_in[d14]) +
coef_int[1] * (coef_int[0]*f_in[d21] + coef_int[1]*f_in[d22] +
coef_int[2]*f_in[d23] + coef_int[3]*f_in[d24])+
coef_int[2] * (coef_int[0]*f_in[d31] + coef_int[1]*f_in[d32] +
coef_int[2]*f_in[d33] + coef_int[3]*f_in[d34]) +
coef_int[3] * (coef_int[0]*f_in[d41] + coef_int[1]*f_in[d42] +
coef_int[2]*f_in[d43] + coef_int[3]*f_in[d44]);
}
}
// Kernel entry point: thin wrapper around interp_v2n_f4Device.
// Expects a 1D launch covering at least nx*ny*nz threads.
__global__ void interp_v2n_fd4(double *f_in, double *f_out, int nx, int ny, int nz){
interp_v2n_f4Device(f_in, f_out, nx, ny, nz);
}
// Fortran-callable launcher for interp_v2n_fd4 (4th-order y/z interpolation).
// Array arguments are device pointers; extents arrive by reference.
extern "C" void interp_v2n_fd4_cuda_(double *f_in, double *f_out, int *nx, int *ny, int *nz){
    const int total = (*nx) * (*ny) * (*nz);
    dim3 blockDims = 256;
    dim3 gridDims((total + blockDims.x - 1) / blockDims.x);   // ceil-div
    interp_v2n_fd4<<<gridDims, blockDims>>>(f_in, f_out, *nx, *ny, *nz);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Kernel: pointwise combination of three pre-computed gradient fields,
//   f_out = gradpar_z * f_z + gradpar_y[2D] * f_y + gradpar_x[2D] * f_x,
// where gradpar_y/gradpar_x are 2D (x,y) coefficient fields (indexed at the
// z=0 plane, i.e. constant along z) and gradpar_z is a scalar.
// Expects a 3D launch covering (nx, ny, nz).
// Fix: the bounds guard used bitwise '&' between the first two comparisons
// ("ix<nx &iy<ny"); it behaved correctly only because relational results are
// 0/1 — replaced with the intended logical '&&'.
__global__ void gradpar_fd4(double *f_z, double *f_y, double *gradpar_y,
                            double *gradpar_x, double *f_x, double gradpar_z, double *f_out,
                            int nx, int ny, int nz){
    int ix = blockDim.x*blockIdx.x+threadIdx.x;
    int iy = blockDim.y*blockIdx.y+threadIdx.y;
    int iz = blockDim.z*blockIdx.z+threadIdx.z;
    int index   = dispc(ix,iy,iz,nx,ny);   // 3D flat index
    int index2d = dispc(ix,iy,0,nx,ny);    // matching index in the z=0 plane
    if(ix<nx && iy<ny && iz<nz)
        f_out[index] = gradpar_z*f_z[index] + gradpar_y[index2d]*f_y[index]+
                       gradpar_x[index2d]*f_x[index];
}
// Fortran-callable launcher for gradpar_fd4 on a 3D thread grid of
// 16x16x2 threads per block, covering (nx, ny, nz).
extern "C" void gradpar_fd4_cuda_(double *f_z, double *f_y, double *gradpar_y,
                                  double *gradpar_x, double *f_x, double *gradpar_z, double *f_out,
                                  int *nx, int *ny, int *nz){
    const int tileXY = 16;   // threads per block in x and y
    const int tileZ  = 2;    // threads per block in z
    dim3 numThreads(tileXY, tileXY, tileZ);
    dim3 numBlocks((*nx + tileXY - 1) / tileXY,    // ceil-div in each dimension
                   (*ny + tileXY - 1) / tileXY,
                   (*nz + tileZ  - 1) / tileZ);
    gradpar_fd4<<<numBlocks, numThreads>>>(f_z, f_y, gradpar_y,
                                           gradpar_x, f_x, *gradpar_z, f_out, *nx, *ny, *nz);
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Fortran-callable barrier: block the host until all previously launched GPU
// work has completed.
// Fix: the cudaDeviceSynchronize() return value was ignored; it is now
// checked with CUDA_SAFE_CALL for consistency with the other wrappers, so
// asynchronous kernel errors are reported instead of silently swallowed.
extern "C" void synchronize_cuda_device_(){
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
// Allocate the 3-int managed-memory direction selector (which_grad, declared
// elsewhere — presumably in gradients_cuda.h) shared by the grad_*_n2n
// launchers. Must be called once before any of them.
// Fix: the cudaMallocManaged() return value was ignored; an allocation
// failure would have surfaced later as an obscure crash.
extern "C" void init_which_grad_(){
    CUDA_SAFE_CALL(cudaMallocManaged((void **)&(which_grad), sizeof(int)*(3)));
}

Event Timeline