compute_parallel.cu
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sun, Jun 9, 08:07

compute_parallel.cu
View Options

	/*
	Parallel version in CUDA/C++ of compute.cpp code
	*/
	#include <iostream>
	#include <iomanip>
	#include <fstream>
	#include <cmath>
	#include <cstring>
	#include <sstream>
	#include "kernels.cuh" //parallel kernels
	#include "functions.h" //Sequential functions

	#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
	inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
	{
	if (code != cudaSuccess)
	{
	fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
	if (abort) getchar();
	}
	}

	using namespace std;

	int main(int argc, char *argv[]){
	//-------User Parameters--------------------------------------------------------
	bool verbose = true; // Control messages
	int Nthreads = 256; // Number of threads per block
	int nx = 2001; // Grid 1D size
	int Ntmax = 100; // Maximum of iteration
	if(argc>1){
	string filename = argv[1];
	int NX = (int )malloc(sizeof(int));
	int NTHREADS = (int )malloc(sizeof(int));
	int VERBOSE = (int )malloc(sizeof(int));
	int NTMAX = (int )malloc(sizeof(int));
	read_parameters(filename,NX,NTHREADS,VERBOSE,NTMAX);
	verbose = *VERBOSE;
	Nthreads = *NTHREADS;
	nx = *NX;
	Ntmax = *NTMAX;
	free(NX); free(NTHREADS); free(VERBOSE); free(NTMAX);
	}
	//-------Basic Parameters of the simulation-------------------------------------
	clock_t timer = clock();
	int Size = 500; // Size of map, Size*Size [km]
	double dx = ((double)Size)/((double)nx); // Grid spacening
	float Tend = 0.2; // Simulation time in hours [hr]
	int numElements = nx*nx; // Total number of elements
	size_t memsize = numElements * sizeof(double); // Memory size of one array
	//-------Simulation variables HOST----------------------------------------------
	double T = 0.0; // Time
	int nt = 0; // Iteration counter
	double C = 0.0; // Coefficient 1/2*dt/dx
	double H, HU, *HV; // Water height and x,y speeds
	double Ht, HUt, *HVt; // Temporary memory of H HU and HV
	double Zdx, Zdy; // Topology of the map
	double *dt; // Host Time step
	double *GPU_mu;
	double GPU_dt;
	//-------Simulation variables DEVICE--------------------------------------------
	double d_H, d_HU, *d_HV; // Water height and x,y speeds
	double d_Ht, d_HUt, *d_HVt; // Temporary memory of H HU and HV
	double d_Zdx, d_Zdy; // Topology of the map
	double *d_mu; // device time step
	int *d_mutex;
	// Tracking variables
	double *dt_array; // Record the evolution time steps
	string datapath = "../data/"; // Path for the data
	unsigned long int GPU_memory_need = 0; // Device RAM occupancy
	unsigned long int Host_memory_need = 0; // Hoste RAM occupancy
	//-------Allocate host memory for loading the initial conditions----------------
	if(verbose) cout << " Allocating host memory .. ";
	H = (double *)malloc(memsize); Host_memory_need += memsize;
	HU = (double *)malloc(memsize); Host_memory_need += memsize;
	HV = (double *)malloc(memsize); Host_memory_need += memsize;
	Ht = (double *)malloc(memsize); Host_memory_need += memsize;
	HUt = (double *)malloc(memsize); Host_memory_need += memsize;
	HVt = (double *)malloc(memsize); Host_memory_need += memsize;
	Zdx = (double *)malloc(memsize); Host_memory_need += memsize;
	Zdy = (double *)malloc(memsize); Host_memory_need += memsize;
	dt = (double *)malloc(sizeof(double)); Host_memory_need += sizeof(double);
	GPU_mu = (double *)malloc(sizeof(double)); Host_memory_need += sizeof(double);
	dt_array = (double )malloc(Ntmaxsizeof(double)); Host_memory_need += Ntmax*sizeof(double);
	if(verbose) cout << Host_memory_need/1e6 << " MB allocated\n";
	//-------Load initial state on host memory--------------------------------------
	load_initial_data(H,HU,HV,Zdx,Zdy,datapath,nx,Size,Tend,numElements,verbose);
	//-------Allocate device memory for computing-----------------------------------
	if(verbose) cout << " Allocating device memory on host.. ";
	gpuErrchk(cudaMalloc((void **) &d_H, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_HU, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_HV, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_Ht, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_HUt, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_HVt, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_Zdx, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_Zdy, memsize)); GPU_memory_need+= memsize;
	gpuErrchk(cudaMalloc((void **) &d_mu, sizeof(double))); GPU_memory_need+= sizeof(double);
	gpuErrchk(cudaMalloc((void **) &d_mutex, sizeof(int))); GPU_memory_need+= sizeof(int);
	if(verbose) cout << GPU_memory_need/1e6 << " MB allocated\n";
	//-------Set some device variables----------------------------------------------
	gpuErrchk(cudaMemset(d_mu, 0.0, sizeof(double)));
	gpuErrchk(cudaMemset(d_mutex, 0, sizeof(float)));

	//-------Copy initial conditions from host to device----------------------------
	if(verbose) cout << " Copying variables from host to device.." << endl;
	gpuErrchk(cudaMemcpy(d_H, H, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_HU, HU, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_HV, HV, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_Zdx, Zdx, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_Zdy, Zdy, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_Ht, Ht, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_HUt, HUt, memsize, cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_HVt, HVt, memsize, cudaMemcpyHostToDevice));

	cpy_to(Ht,H,numElements);
	cpy_to(HUt,HU,numElements);
	cpy_to(HVt,HV,numElements);
	copy_host2device(d_H,d_HU,d_HV,H,HU,HV,memsize);
	copy_host2device(d_Ht,d_HUt,d_HVt,Ht,HUt,HVt,memsize);
	//-------Set grid and block dimensions------------------------------------------
	// For finite volume kernel
	int NblocksFV = ((nx-2)*(nx-2) + Nthreads -1)/Nthreads;
	// For tolerances kernel
	int NblocksBC = pow(2,ceil(log2(nx)))/Nthreads;
	dim3 GridDimBC(NblocksBC,4);
	// For Enforce BC kernel
	int NblocksTol = (nx*nx + Nthreads -1)/Nthreads;
	if(verbose){
	cout << "parallel FV iterator kernel :" << endl;
	cout <<"\t Number of elements \t\t: " << (nx-2)*(nx-2) << endl;
	cout <<"\t Number of blocks needed \t: " << NblocksFV << endl;
	cout <<"\t Nthreads \t\t\t: " << NblocksFV << "x" << Nthreads
	<< " (=" << NblocksFV*Nthreads << ")" << endl;
	cout << "parallel Enforce BC kernel :" << endl;
	cout <<"\t Number of elements \t\t: " << 4*nx << endl;
	cout <<"\t Number of blocks needed \t: " << NblocksBC << endl;
	cout <<"\t Nthreads \t\t\t: " << NblocksBC << "x4x" << Nthreads
	<< " (=" << NblocksBCNthreads4 << ")" << endl;
	cout << "parallel Tolerances kernel :" << endl;
	cout <<"\t Number of elements \t\t: " << nx*nx << endl;
	cout <<"\t Number of blocks needed \t: " << NblocksTol << endl;
	cout <<"\t Nthreads \t\t\t: " << NblocksTol<< "x" << Nthreads
	<< " (=" << NblocksTol*Nthreads << ")" << endl;

	}
	//------------------------------------------------------------------------------
	//-------Evolution loop------------/!\ Work in progress /!\---------------------
	//------------------------------------------------------------------------------
	while (T < Tend and nt < Ntmax) {
	//-------Compute the time-step length-------------------------------------------
	// update_dt(H,HU,HV,dt,dx,numElements);
	// cudaMemset(d_mu, 0.0, sizeof(double));
	find_mumax_kernel<<<256,256>>>(d_H,d_HU,d_HV,d_mutex,d_mu,numElements);
	cudaMemcpy(GPU_mu, d_mu, sizeof(double), cudaMemcpyDeviceToHost);
	GPU_dt = dx/(sqrt(2.0)*GPU_mu[0]);
	if(T+GPU_dt > Tend){
	GPU_dt = Tend-T;
	}
	//-------Print status-----------------------------------------------------------
	if(verbose){
	cout << " Computing for T=" << T+GPU_dt
	<< "\t("<< 100*(T+GPU_dt)/Tend << "%),\t"
	<< "dt=" << GPU_dt << "\n";
	// cout << " CPU dt \t : " << dt[0] << "\n";
	cout << " GPU dt \t : " << GPU_dt << "\n";
	}
	//-------Copy solution to temp storage------------------------------------------
	// cpy_to(Ht,H,numElements);
	// cpy_to(HUt,HU,numElements);
	// cpy_to(HVt,HV,numElements);
	// copy_temp_variables_kernel<<<numBlocks,threadsPerBlock>>>(d_H,d_HU,d_HV,d_Ht,d_HUt,d_HVt,nx);
	gpuErrchk(cudaMemcpy(d_Ht, d_H, memsize, cudaMemcpyDeviceToDevice));
	gpuErrchk(cudaMemcpy(d_HUt, d_HU, memsize, cudaMemcpyDeviceToDevice));
	gpuErrchk(cudaMemcpy(d_HVt, d_HV, memsize, cudaMemcpyDeviceToDevice));
	cudaDeviceSynchronize();
	//-------Enforce boundary conditions--------------------------------------------
	// enforce_BC(Ht, HUt, HVt, nx);
	enforce_BC_kernel<<<GridDimBC,Nthreads>>>(d_Ht,d_HUt,d_HVt,nx);
	cudaDeviceSynchronize();
	//-------Compute a time-step----------------------------------------------------
	C = (.5*GPU_dt/dx);
	//FV_time_step(H,HU,HV,Zdx,Zdy,Ht,HUt,HVt,C,dt[0],nx);
	//FV_time_step_kernel<<<numBlocks,threadsPerBlock>>>(d_H,d_HU,d_HV,d_Zdx,d_Zdy,d_Ht,d_HUt,d_HVt,C,dt[0],nx);
	//copy_host2device(d_Ht,d_HUt,d_HVt,Ht,HUt,HVt,memsize);
	cudaDeviceSynchronize();
	FV_iterator_kernel<<<NblocksFV,Nthreads>>>(d_H,d_HU,d_HV,d_Zdx,d_Zdy,d_Ht,d_HUt,d_HVt,C,GPU_dt,nx);
	cudaDeviceSynchronize();
	//copy_device2host(H,HU,HV,d_H,d_HU,d_HV,memsize);
	//-------Impose tolerances------------------------------------------------------
	//impose_tolerances(H,HU,HV,numElements);
	impose_tolerances_kernel<<<NblocksTol,Nthreads>>>(d_H,d_HU,d_HV,numElements);
	cudaDeviceSynchronize();
	//-------Store time step evolution and update time and counter------------------
	dt_array[nt] = GPU_dt;
	T = T + GPU_dt;
	nt++;
	//cudaDeviceSynchronize();
	}
	//------------------------------------------------------------------------------
	//-------Copy device result to the host memory----------------------------------
	if(verbose){
	cout << " Copy the output data from the CUDA device to the host memory" << endl;
	}
	cudaMemcpy(H, d_H, memsize, cudaMemcpyDeviceToHost);
	//-------Save solution to disk--------------------------------------------------
	ostringstream soutfilename;
	soutfilename <<"../output/CUDA_Solution_nx"<<to_string(nx)<<"_"<<to_string(Size)<<"km_T"<<Tend<<"_h.bin"<< setprecision(2);
	string outfilename = soutfilename.str();
	ofstream fout;
	fout.open(outfilename, std::ios::out \| std::ios::binary);
	if(verbose) cout<<" Writing solution in "<<outfilename<<endl;
	fout.write(reinterpret_cast<char>(&H[0]), numElementssizeof(double));
	fout.close();
	//-------Save time step history-------------------------------------------------
	ostringstream soutfilename2;
	soutfilename2 <<"../output/CUDA_dt_nx"<<to_string(nx)<<"_"<<to_string(Size)<<"km_T"<<Tend<<"_h.bin"<< setprecision(2);
	outfilename = soutfilename2.str();
	fout.open(outfilename, std::ios::out \| std::ios::binary);
	if(verbose) cout<<" Writing solution in "<<outfilename<<endl;
	fout.write(reinterpret_cast<char>(&dt_array[0]), Ntmaxsizeof(double));
	fout.close();
	//-------Free device global memory----------------------------------------------
	if(verbose) cout << " Free device memory space.." << endl;
	cudaFree(d_H); cudaFree(d_HU); cudaFree(d_HV); cudaFree(d_Zdx);
	cudaFree(d_Zdy); cudaFree(d_Ht); cudaFree(d_HUt); cudaFree(d_HVt);
	cudaFree(d_mu); cudaFree(d_mutex);
	//-------Free host memory-------------------------------------------------------
	if(verbose) cout << " Free host memory space.." << endl;
	free(H); free(HU); free(HV); free(Zdx); free(Zdy);
	free(Ht); free(HUt); free(HVt); free(dt_array); free(dt); free(GPU_mu);
	//-------End of the timer-----------------------------------------------------
	timer = clock()-timer;
	timer = (double)(timer)/CLOCKS_PER_SEC*1000;

	if(verbose){
	cout << "Ellapsed time : " << timer/60000 << "min "
	<< (timer/1000)%60 << "s " << timer%1000 << "ms" << endl;
	}
	else{cout << nx << "," << Nthreads << "," << Ntmax << "," << timer/1000.0 << endl;}
	return 0;
	}

compute_parallel.cuNo OneTemporaryActions

File Metadata

compute_parallel.cuView Options

Event Timeline

compute_parallel.cu
No OneTemporary
Actions

compute_parallel.cu
View Options