Page MenuHomec4science

compute_parallel.cu
No OneTemporary

File Metadata

Created
Sun, Jun 9, 08:07

compute_parallel.cu

/*
Parallel version in CUDA/C++ of compute.cpp code
*/
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cmath>
#include <cstring>
#include <sstream>
#include "kernels.cuh" //parallel kernels
#include "functions.h" //Sequential functions
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) getchar();
}
}
using namespace std;
int main(int argc, char *argv[]){
//-------User Parameters--------------------------------------------------------
bool verbose = true; // Control messages
int Nthreads = 256; // Number of threads per block
int nx = 2001; // Grid 1D size
int Ntmax = 100; // Maximum of iteration
if(argc>1){
string filename = argv[1];
int *NX = (int *)malloc(sizeof(int));
int *NTHREADS = (int *)malloc(sizeof(int));
int *VERBOSE = (int *)malloc(sizeof(int));
int *NTMAX = (int *)malloc(sizeof(int));
read_parameters(filename,NX,NTHREADS,VERBOSE,NTMAX);
verbose = *VERBOSE;
Nthreads = *NTHREADS;
nx = *NX;
Ntmax = *NTMAX;
free(NX); free(NTHREADS); free(VERBOSE); free(NTMAX);
}
//-------Basic Parameters of the simulation-------------------------------------
clock_t timer = clock();
int Size = 500; // Size of map, Size*Size [km]
double dx = ((double)Size)/((double)nx); // Grid spacening
float Tend = 0.2; // Simulation time in hours [hr]
int numElements = nx*nx; // Total number of elements
size_t memsize = numElements * sizeof(double); // Memory size of one array
//-------Simulation variables HOST----------------------------------------------
double T = 0.0; // Time
int nt = 0; // Iteration counter
double C = 0.0; // Coefficient 1/2*dt/dx
double *H, *HU, *HV; // Water height and x,y speeds
double *Ht, *HUt, *HVt; // Temporary memory of H HU and HV
double *Zdx, *Zdy; // Topology of the map
double *dt; // Host Time step
double *GPU_mu;
double GPU_dt;
//-------Simulation variables DEVICE--------------------------------------------
double *d_H, *d_HU, *d_HV; // Water height and x,y speeds
double *d_Ht, *d_HUt, *d_HVt; // Temporary memory of H HU and HV
double *d_Zdx, *d_Zdy; // Topology of the map
double *d_mu; // device time step
int *d_mutex;
// Tracking variables
double *dt_array; // Record the evolution time steps
string datapath = "../data/"; // Path for the data
unsigned long int GPU_memory_need = 0; // Device RAM occupancy
unsigned long int Host_memory_need = 0; // Hoste RAM occupancy
//-------Allocate host memory for loading the initial conditions----------------
if(verbose) cout << " Allocating host memory .. ";
H = (double *)malloc(memsize); Host_memory_need += memsize;
HU = (double *)malloc(memsize); Host_memory_need += memsize;
HV = (double *)malloc(memsize); Host_memory_need += memsize;
Ht = (double *)malloc(memsize); Host_memory_need += memsize;
HUt = (double *)malloc(memsize); Host_memory_need += memsize;
HVt = (double *)malloc(memsize); Host_memory_need += memsize;
Zdx = (double *)malloc(memsize); Host_memory_need += memsize;
Zdy = (double *)malloc(memsize); Host_memory_need += memsize;
dt = (double *)malloc(sizeof(double)); Host_memory_need += sizeof(double);
GPU_mu = (double *)malloc(sizeof(double)); Host_memory_need += sizeof(double);
dt_array = (double *)malloc(Ntmax*sizeof(double)); Host_memory_need += Ntmax*sizeof(double);
if(verbose) cout << Host_memory_need/1e6 << " MB allocated\n";
//-------Load initial state on host memory--------------------------------------
load_initial_data(H,HU,HV,Zdx,Zdy,datapath,nx,Size,Tend,numElements,verbose);
//-------Allocate device memory for computing-----------------------------------
if(verbose) cout << " Allocating device memory on host.. ";
gpuErrchk(cudaMalloc((void **) &d_H, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_HU, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_HV, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_Ht, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_HUt, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_HVt, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_Zdx, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_Zdy, memsize)); GPU_memory_need+= memsize;
gpuErrchk(cudaMalloc((void **) &d_mu, sizeof(double))); GPU_memory_need+= sizeof(double);
gpuErrchk(cudaMalloc((void **) &d_mutex, sizeof(int))); GPU_memory_need+= sizeof(int);
if(verbose) cout << GPU_memory_need/1e6 << " MB allocated\n";
//-------Set some device variables----------------------------------------------
gpuErrchk(cudaMemset(d_mu, 0.0, sizeof(double)));
gpuErrchk(cudaMemset(d_mutex, 0, sizeof(float)));
//-------Copy initial conditions from host to device----------------------------
if(verbose) cout << " Copying variables from host to device.." << endl;
gpuErrchk(cudaMemcpy(d_H, H, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_HU, HU, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_HV, HV, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Zdx, Zdx, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Zdy, Zdy, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ht, Ht, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_HUt, HUt, memsize, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_HVt, HVt, memsize, cudaMemcpyHostToDevice));
cpy_to(Ht,H,numElements);
cpy_to(HUt,HU,numElements);
cpy_to(HVt,HV,numElements);
copy_host2device(d_H,d_HU,d_HV,H,HU,HV,memsize);
copy_host2device(d_Ht,d_HUt,d_HVt,Ht,HUt,HVt,memsize);
//-------Set grid and block dimensions------------------------------------------
// For finite volume kernel
int NblocksFV = ((nx-2)*(nx-2) + Nthreads -1)/Nthreads;
// For tolerances kernel
int NblocksBC = pow(2,ceil(log2(nx)))/Nthreads;
dim3 GridDimBC(NblocksBC,4);
// For Enforce BC kernel
int NblocksTol = (nx*nx + Nthreads -1)/Nthreads;
if(verbose){
cout << "parallel FV iterator kernel :" << endl;
cout <<"\t Number of elements \t\t: " << (nx-2)*(nx-2) << endl;
cout <<"\t Number of blocks needed \t: " << NblocksFV << endl;
cout <<"\t Nthreads \t\t\t: " << NblocksFV << "x" << Nthreads
<< " (=" << NblocksFV*Nthreads << ")" << endl;
cout << "parallel Enforce BC kernel :" << endl;
cout <<"\t Number of elements \t\t: " << 4*nx << endl;
cout <<"\t Number of blocks needed \t: " << NblocksBC << endl;
cout <<"\t Nthreads \t\t\t: " << NblocksBC << "x4x" << Nthreads
<< " (=" << NblocksBC*Nthreads*4 << ")" << endl;
cout << "parallel Tolerances kernel :" << endl;
cout <<"\t Number of elements \t\t: " << nx*nx << endl;
cout <<"\t Number of blocks needed \t: " << NblocksTol << endl;
cout <<"\t Nthreads \t\t\t: " << NblocksTol<< "x" << Nthreads
<< " (=" << NblocksTol*Nthreads << ")" << endl;
}
//------------------------------------------------------------------------------
//-------Evolution loop------------/!\ Work in progress /!\---------------------
//------------------------------------------------------------------------------
while (T < Tend and nt < Ntmax) {
//-------Compute the time-step length-------------------------------------------
// update_dt(H,HU,HV,dt,dx,numElements);
// cudaMemset(d_mu, 0.0, sizeof(double));
find_mumax_kernel<<<256,256>>>(d_H,d_HU,d_HV,d_mutex,d_mu,numElements);
cudaMemcpy(GPU_mu, d_mu, sizeof(double), cudaMemcpyDeviceToHost);
GPU_dt = dx/(sqrt(2.0)*GPU_mu[0]);
if(T+GPU_dt > Tend){
GPU_dt = Tend-T;
}
//-------Print status-----------------------------------------------------------
if(verbose){
cout << " Computing for T=" << T+GPU_dt
<< "\t("<< 100*(T+GPU_dt)/Tend << "%),\t"
<< "dt=" << GPU_dt << "\n";
// cout << " CPU dt \t : " << dt[0] << "\n";
cout << " GPU dt \t : " << GPU_dt << "\n";
}
//-------Copy solution to temp storage------------------------------------------
// cpy_to(Ht,H,numElements);
// cpy_to(HUt,HU,numElements);
// cpy_to(HVt,HV,numElements);
// copy_temp_variables_kernel<<<numBlocks,threadsPerBlock>>>(d_H,d_HU,d_HV,d_Ht,d_HUt,d_HVt,nx);
gpuErrchk(cudaMemcpy(d_Ht, d_H, memsize, cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_HUt, d_HU, memsize, cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_HVt, d_HV, memsize, cudaMemcpyDeviceToDevice));
cudaDeviceSynchronize();
//-------Enforce boundary conditions--------------------------------------------
// enforce_BC(Ht, HUt, HVt, nx);
enforce_BC_kernel<<<GridDimBC,Nthreads>>>(d_Ht,d_HUt,d_HVt,nx);
cudaDeviceSynchronize();
//-------Compute a time-step----------------------------------------------------
C = (.5*GPU_dt/dx);
//FV_time_step(H,HU,HV,Zdx,Zdy,Ht,HUt,HVt,C,dt[0],nx);
//FV_time_step_kernel<<<numBlocks,threadsPerBlock>>>(d_H,d_HU,d_HV,d_Zdx,d_Zdy,d_Ht,d_HUt,d_HVt,C,dt[0],nx);
//copy_host2device(d_Ht,d_HUt,d_HVt,Ht,HUt,HVt,memsize);
cudaDeviceSynchronize();
FV_iterator_kernel<<<NblocksFV,Nthreads>>>(d_H,d_HU,d_HV,d_Zdx,d_Zdy,d_Ht,d_HUt,d_HVt,C,GPU_dt,nx);
cudaDeviceSynchronize();
//copy_device2host(H,HU,HV,d_H,d_HU,d_HV,memsize);
//-------Impose tolerances------------------------------------------------------
//impose_tolerances(H,HU,HV,numElements);
impose_tolerances_kernel<<<NblocksTol,Nthreads>>>(d_H,d_HU,d_HV,numElements);
cudaDeviceSynchronize();
//-------Store time step evolution and update time and counter------------------
dt_array[nt] = GPU_dt;
T = T + GPU_dt;
nt++;
//cudaDeviceSynchronize();
}
//------------------------------------------------------------------------------
//-------Copy device result to the host memory----------------------------------
if(verbose){
cout << " Copy the output data from the CUDA device to the host memory" << endl;
}
cudaMemcpy(H, d_H, memsize, cudaMemcpyDeviceToHost);
//-------Save solution to disk--------------------------------------------------
ostringstream soutfilename;
soutfilename <<"../output/CUDA_Solution_nx"<<to_string(nx)<<"_"<<to_string(Size)<<"km_T"<<Tend<<"_h.bin"<< setprecision(2);
string outfilename = soutfilename.str();
ofstream fout;
fout.open(outfilename, std::ios::out | std::ios::binary);
if(verbose) cout<<" Writing solution in "<<outfilename<<endl;
fout.write(reinterpret_cast<char*>(&H[0]), numElements*sizeof(double));
fout.close();
//-------Save time step history-------------------------------------------------
ostringstream soutfilename2;
soutfilename2 <<"../output/CUDA_dt_nx"<<to_string(nx)<<"_"<<to_string(Size)<<"km_T"<<Tend<<"_h.bin"<< setprecision(2);
outfilename = soutfilename2.str();
fout.open(outfilename, std::ios::out | std::ios::binary);
if(verbose) cout<<" Writing solution in "<<outfilename<<endl;
fout.write(reinterpret_cast<char*>(&dt_array[0]), Ntmax*sizeof(double));
fout.close();
//-------Free device global memory----------------------------------------------
if(verbose) cout << " Free device memory space.." << endl;
cudaFree(d_H); cudaFree(d_HU); cudaFree(d_HV); cudaFree(d_Zdx);
cudaFree(d_Zdy); cudaFree(d_Ht); cudaFree(d_HUt); cudaFree(d_HVt);
cudaFree(d_mu); cudaFree(d_mutex);
//-------Free host memory-------------------------------------------------------
if(verbose) cout << " Free host memory space.." << endl;
free(H); free(HU); free(HV); free(Zdx); free(Zdy);
free(Ht); free(HUt); free(HVt); free(dt_array); free(dt); free(GPU_mu);
//-------End of the timer-----------------------------------------------------
timer = clock()-timer;
timer = (double)(timer)/CLOCKS_PER_SEC*1000;
if(verbose){
cout << "Ellapsed time : " << timer/60000 << "min "
<< (timer/1000)%60 << "s " << timer%1000 << "ms" << endl;
}
else{cout << nx << "," << Nthreads << "," << Ntmax << "," << timer/1000.0 << endl;}
return 0;
}

Event Timeline