cuda_shared.h
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Sep 18, 02:59

cuda_shared.h
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

	Original Version:
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	See the README file in the top-level LAMMPS directory.

	-----------------------------------------------------------------------

	USER-CUDA Package and associated modifications:
	https://sourceforge.net/projects/lammpscuda/

	Christian Trott, christian.trott@tu-ilmenau.de
	Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
	Theoretical Physics II, University of Technology Ilmenau, Germany

	See the README file in the USER-CUDA directory.

	This software is distributed under the GNU General Public License.
	------------------------------------------------------------------------- */

	#ifndef _CUDA_SHARED_H_
	#define _CUDA_SHARED_H_
	#include "cuda_precision.h"

	// Size of the debugdata array, counted in doubles: it has room for this many
	// doubles, or twice as many ints.
	#define CUDA_MAX_DEBUG_SIZE 1000

	// Untyped handle to an array living on the CUDA device: a raw pointer plus
	// up to three dimensions. The element type is not recorded here; users of
	// the handle have to know it.
	struct dev_array
	{
	void* dev_data; // pointer to memory address on cuda device
	unsigned dim[3]; // array dimensions
	};

	// Relevant data from the atom class, shared with the CUDA code.
	// Arrays are held as dev_array handles; the plain ints are counts and flags
	// stored alongside them.
	struct cuda_shared_atom
	{
	dev_array dx; // cumulated distance for binning settings
	dev_array x; // position
	dev_array v; // velocity
	dev_array f; // force
	// NOTE(review): tag has no comment of its own; the "global ID number" remark
	// on the next member presumably describes tag rather than type - confirm.
	dev_array tag;
	dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1)
	dev_array mask;
	dev_array image;
	dev_array q; // charges
	dev_array mass; // per-type masses
	dev_array rmass; // per-atom masses
	dev_array radius; // per-atom radius
	dev_array density;
	dev_array omega;
	dev_array torque;
	dev_array molecule;

	dev_array special;
	int maxspecial;
	dev_array nspecial;
	int* special_flag;
	int molecular;

	dev_array eatom; // per-atom energy
	dev_array vatom; // per-atom virial
	int need_eatom; // NOTE(review): presumably nonzero when eatom must be filled - confirm
	int need_vatom; // NOTE(review): presumably nonzero when vatom must be filled - confirm

	dev_array x_type; // position + type in X_FLOAT4 struct
	dev_array v_radius; // velocity + radius in V_FLOAT4 struct, currently only used for granular atom_style
	// NOTE(review): this member previously carried a copy of the v_radius
	// comment. Judging by its name it presumably packs omega + rmass in the
	// same way - confirm against the code that fills it.
	dev_array omega_rmass;

	double* mass_host; // remember per-type host pointer to masses
	//int natoms; // total # of atoms in system, could be 0
	int nghost; // # of ghost atoms on this proc
	int nlocal; // # of owned atoms on this proc
	int nall; // total # of atoms on this proc
	int nmax; // max # of owned+ghost in arrays on this proc
	int ntypes;
	int q_flag; // do we have charges?
	int rmass_flag; // do we have per-atom masses?
	int firstgroup;
	int nfirst;

	// NOTE(review): presumably "something changed" markers for nlocal, nmax and
	// the neighbor data - confirm who sets and who clears them.
	int update_nlocal;
	int update_nmax;
	int update_neigh;

	dev_array xhold; // position at last neighboring
	X_FLOAT triggerneighsq; // maximum square movement before reneighboring
	int reneigh_flag; // is reneighboring necessary
	int maxhold; // size of xhold
	int dist_check; // perform distance check for reneighboring
	dev_array binned_id; // id of each binned atom (not tag!!)
	// new id of each binned atom, used for sorting: basically places
	// atom[binned_id[k]] at atom[binned_idnew[k]]
	dev_array binned_idnew;
	float bin_extraspace;
	int bin_dim[3];
	int bin_nmax;
	dev_array map_array;
	};

	// Relevant data from the pair class: cutoffs, per-type-pair coefficient
	// tables, energy/virial accumulators and a few kernel-control settings.
	struct cuda_shared_pair
	{
	char cudable_force; // check for (cudable_force!=0)
	X_FLOAT cut_global;
	X_FLOAT cut_inner_global;
	X_FLOAT cut_coul_global;
	double** cut; // type-type cutoff
	double** cutsq; // type-type cutoff; NOTE(review): presumably the squared cutoff, judging by the name - confirm
	double** cut_inner; // type-type cutoff; NOTE(review): original note said "for coul", name suggests an inner cutoff - confirm
	double** cut_coul; // type-type cutoff for coul
	double** coeff1; // type-type pair parameters (coeff1 .. coeff10)
	double** coeff2;
	double** coeff3;
	double** coeff4;
	double** coeff5;
	double** coeff6;
	double** coeff7;
	double** coeff8;
	double** coeff9;
	double** coeff10;
	double** offset;
	double* special_lj;
	double* special_coul;
	dev_array virial; // ENERGY_FLOAT
	dev_array eng_vdwl; // ENERGY_FLOAT
	dev_array eng_coul; // ENERGY_FLOAT
	X_FLOAT cut_coulsq_global;
	F_FLOAT g_ewald,kappa;
	int freeze_group_bit;

	// NOTE(review): presumably device-side ("_gm" = global memory?) counterparts
	// of coeff1 .. coeff10 above - confirm.
	dev_array coeff1_gm;
	dev_array coeff2_gm;
	dev_array coeff3_gm;
	dev_array coeff4_gm;
	dev_array coeff5_gm;
	dev_array coeff6_gm;
	dev_array coeff7_gm;
	dev_array coeff8_gm;
	dev_array coeff9_gm;
	dev_array coeff10_gm;

	int lastgridsize;
	int n_energy_virial;
	int collect_forces_later;
	int use_block_per_atom;
	int override_block_per_atom;
	bool neighall;

	};

	// Relevant data from the domain class: box and sub-box bounds, periodicity
	// and the triclinic box description.
	struct cuda_shared_domain
	{
	X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
	X_FLOAT subhi[3];
	X_FLOAT boxlo[3];
	X_FLOAT boxhi[3];
	X_FLOAT prd[3];
	int periodicity[3]; // xyz periodicity as array

	int triclinic; // NOTE(review): presumably nonzero for a triclinic box - confirm
	X_FLOAT xy; // NOTE(review): xy, xz, yz are presumably the triclinic tilt factors - confirm
	X_FLOAT xz;
	X_FLOAT yz;
	X_FLOAT boxlo_lamda[3];
	X_FLOAT boxhi_lamda[3];
	X_FLOAT prd_lamda[3];
	X_FLOAT h[6];
	X_FLOAT h_inv[6];
	V_FLOAT h_rate[6];
	int update;
	};

	// Data shared with the CUDA code for PPPM: grid extents, brick/FFT buffers
	// and scalar parameters.
	//
	// The FFT work buffers below exist only when FFT_CUFFT is defined, so the
	// layout of this struct (and of anything embedding it) depends on that
	// macro; every translation unit using it must agree on the setting.
	struct cuda_shared_pppm
	{
	char cudable_force;
	#ifdef FFT_CUFFT
	FFT_FLOAT* work1;
	FFT_FLOAT* work2;
	FFT_FLOAT* work3;
	PPPM_FLOAT* greensfn;
	PPPM_FLOAT* fkx;
	PPPM_FLOAT* fky;
	PPPM_FLOAT* fkz;
	PPPM_FLOAT* vg;
	#endif
	int* part2grid;
	PPPM_FLOAT* density_brick;
	int* density_brick_int;
	PPPM_FLOAT density_intScale;
	PPPM_FLOAT* vdx_brick;
	PPPM_FLOAT* vdy_brick;
	PPPM_FLOAT* vdz_brick;
	PPPM_FLOAT* density_fft;
	ENERGY_FLOAT* energy;
	ENERGY_FLOAT* virial;
	// Grid index bounds per dimension, in "_in" and "_out" variants.
	int nxlo_in;
	int nxhi_in;
	int nxlo_out;
	int nxhi_out;
	int nylo_in;
	int nyhi_in;
	int nylo_out;
	int nyhi_out;
	int nzlo_in;
	int nzhi_in;
	int nzlo_out;
	int nzhi_out;
	int nx_pppm;
	int ny_pppm;
	int nz_pppm;
	PPPM_FLOAT qqrd2e;
	int order;
	// float3 sublo;
	PPPM_FLOAT* rho_coeff;
	int nmax;
	int nlocal;
	PPPM_FLOAT* debugdata;
	PPPM_FLOAT delxinv;
	PPPM_FLOAT delyinv;
	PPPM_FLOAT delzinv;
	int nlower;
	int nupper;
	PPPM_FLOAT shiftone;
	PPPM_FLOAT3* fH;
	};

	// Communication data shared with the CUDA code: swap limits, send lists and
	// send/receive buffers.
	struct cuda_shared_comm
	{
	int maxswap;
	int maxlistlength;
	dev_array pbc;
	dev_array slablo;
	dev_array slabhi;
	dev_array multilo;
	dev_array multihi;
	dev_array sendlist;
	int grow_flag;
	int comm_phase;

	int nsend;
	int* nsend_swap;
	int* send_size;
	int* recv_size;
	// NOTE(review): the "_dev" variants are presumably the device-side
	// counterparts of the double** host buffers - confirm.
	double** buf_send;
	void** buf_send_dev;
	double** buf_recv;
	void** buf_recv_dev;
	void* buffer;
	int buffer_size;
	double overlap_split_ratio;
	};

	// Neighbor-list data shared with the CUDA code.
	// Member of CudaNeighList; unlike the other cuda_shared_* structs it has no
	// instance in cuda_shared_data.
	struct cuda_shared_neighlist
	{
	int maxlocal;
	int inum; // # of I atoms neighbors are stored for local indices of I atoms
	int inum_border2;
	dev_array inum_border; // # of atoms which interact with border atoms
	dev_array ilist;
	dev_array ilist_border;
	dev_array numneigh;
	dev_array numneigh_inner;
	dev_array numneigh_border;
	dev_array firstneigh;
	dev_array neighbors;
	dev_array neighbors_border;
	dev_array neighbors_inner;
	int maxpage;
	dev_array page_pointers;
	dev_array* pages;
	int maxneighbors;
	int neigh_lists_per_page;
	double** cutneighsq;
	CUDA_FLOAT* cu_cutneighsq;
	// Binning settings; cuda_shared_atom carries members with the same names.
	int* binned_id;
	int* bin_dim;
	int bin_nmax;
	float bin_extraspace;
	double maxcut;
	// Exclusion data; the nex_* ints sit next to the arrays they appear to size.
	dev_array ex_type;
	int nex_type;
	dev_array ex1_bit;
	dev_array ex2_bit;
	int nex_group;
	dev_array ex_mol_bit;
	int nex_mol;

	};

	// Snapshot of the compile-time settings (i.e. precision) a translation unit
	// was built with. It is used to compare the compile settings of the cu
	// files against those of the cpp files.
	struct cuda_compile_settings
	{
	int prec_glob;
	int prec_x;
	int prec_v;
	int prec_f;
	int prec_pppm;
	int prec_fft;
	int cufft;
	int arch;
	};

	// Timing values for the CUDA code paths, grouped by phase (transfers,
	// communication, pair forces, neighboring, PPPM).
	// NOTE(review): units and accumulation policy are not visible in this
	// header - confirm where the values are written.
	struct cuda_timings_struct
	{
	// debug
	double test1;
	double test2;
	// transfers
	double transfer_upload_tmp_constr;
	double transfer_download_tmp_deconstr;

	// communication: forward
	double comm_forward_total;
	double comm_forward_mpi_upper;
	double comm_forward_mpi_lower;
	double comm_forward_kernel_pack;
	double comm_forward_kernel_unpack;
	double comm_forward_kernel_self;
	double comm_forward_upload;
	double comm_forward_download;

	// communication: exchange
	double comm_exchange_total;
	double comm_exchange_mpi;
	double comm_exchange_kernel_pack;
	double comm_exchange_kernel_unpack;
	double comm_exchange_kernel_fill;
	double comm_exchange_cpu_pack;
	double comm_exchange_upload;
	double comm_exchange_download;

	// communication: border
	double comm_border_total;
	double comm_border_mpi;
	double comm_border_kernel_pack;
	double comm_border_kernel_unpack;
	double comm_border_kernel_self;
	double comm_border_kernel_buildlist;
	double comm_border_upload;
	double comm_border_download;

	// pair forces
	double pair_xtype_conversion;
	double pair_kernel;
	double pair_virial;
	double pair_force_collection;

	// neighbor
	double neigh_bin;
	double neigh_build;
	double neigh_special;

	// PPPM
	double pppm_particle_map;
	double pppm_make_rho;
	double pppm_brick2fft;
	double pppm_poisson;
	double pppm_fillbrick;
	double pppm_fieldforce;
	double pppm_compute;

	};

	// Top-level container: holds space for all relevant data from the different
	// classes (atom, pair, domain, pppm, comm) plus compile settings, timings
	// and a few global scratch/bookkeeping members.
	struct cuda_shared_data
	{
	// Temporary GPU data: used inside subroutines and not required to stay
	// consistent outside of the routine that uses it.
	void* buffer;
	int buffersize; // max size of buffer
	int buffer_new; // should be 1 if the pointer to buffer has changed
	void* flag;
	// Array for easily collecting debug data from the device; class cuda
	// contains the corresponding cu_debugdata and host array.
	void* debugdata;
	cuda_shared_atom atom;
	cuda_shared_pair pair;
	cuda_shared_domain domain;
	cuda_shared_pppm pppm;
	cuda_shared_comm comm;
	cuda_compile_settings compile_settings;
	cuda_timings_struct cuda_timings;
	int exchange_dim;
	int me; // mpi rank
	unsigned int datamask;
	int overlap_comm;
	};


	#endif // #ifndef _CUDA_SHARED_H_

cuda_shared.hNo OneTemporaryActions

File Metadata

cuda_shared.hView Options

Event Timeline

cuda_shared.h
No OneTemporary
Actions

cuda_shared.h
View Options