diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp index 0ca234508..106dc3971 100644 --- a/lib/gpu/pair_gpu_atom.cpp +++ b/lib/gpu/pair_gpu_atom.cpp @@ -1,577 +1,584 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #include "pair_gpu_atom.h" #define PairGPUAtomT PairGPUAtom<numtyp,acctyp> #ifdef WINDLL #include <windows.h> typedef bool (*__win_sort_alloc)(const int max_atoms); typedef void (*__win_sort)(const int max_atoms, unsigned *cell_begin, int *particle_begin); __win_sort_alloc _win_sort_alloc; __win_sort _win_sort; #endif template <class numtyp, class acctyp> PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), _vflag(false),_inum(0),_ilist(NULL), _newton(false) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; sort_config.algorithm = CUDPP_SORT_RADIX; sort_config.options = CUDPP_OPTION_KEY_VALUE_PAIRS; #ifdef WINDLL HINSTANCE hinstLib = LoadLibrary(TEXT("gpu.dll")); if (hinstLib == NULL) { printf("\nUnable to load gpu.dll\n"); exit(1); } _win_sort_alloc=(__win_sort_alloc)GetProcAddress(hinstLib,"_win_sort_alloc"); _win_sort=(__win_sort)GetProcAddress(hinstLib,"_win_sort"); #endif #endif } template <class numtyp, class acctyp> int PairGPUAtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor) id_space=2; int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space; if (_rot) bytes+=4*sizeof(numtyp)+4*sizeof(acctyp); if (_charge) bytes+=sizeof(numtyp); return bytes; } template <class numtyp, class acctyp> bool PairGPUAtomT::alloc(const int inum, const int nall) { _max_atoms=static_cast<int>(static_cast<double>(nall)*1.10); if (_newton) _max_local=_max_atoms; else _max_local=static_cast<int>(static_cast<double>(inum)*1.10); bool success=true; int ans_elements=4; if (_rot) ans_elements+=4; // Ignore host/device transfers? 
bool cpuview=false; if (dev->device_type()==UCL_CPU) cpuview=true; // Allocate storage for CUDPP sort #ifndef USE_OPENCL #ifdef WINDLL _win_sort_alloc(_max_atoms); #else if (_gpu_nbor) { CUDPPResult result = cudppPlan(&sort_plan, sort_config, _max_atoms, 1, 0); if (CUDPP_SUCCESS != result) return false; } #endif #endif // -------------------------- Host allocations // Get a host write only buffer #ifdef GPU_CAST success=success && (host_x_cast.alloc(_max_atoms*3,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); success=success && (host_type_cast.alloc(_max_atoms,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #else success=success && (host_x.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #endif success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_charge) success=success && (host_q.alloc(_max_atoms,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_rot) success=success && (host_quat.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); // --------------------------- Device allocations _gpu_bytes=0; if (cpuview) { #ifdef GPU_CAST assert(0==1); #else dev_x.view(host_x); #endif dev_engv.view(host_engv); dev_ans.view(host_ans); if (_rot) dev_quat.view(host_quat); if (_charge) dev_q.view(host_q); } else { #ifdef GPU_CAST success=success && (UCL_SUCCESS==dev_x.alloc(_max_atoms*4,*dev)); success=success && (UCL_SUCCESS== dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); #endif success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev, UCL_WRITE_ONLY)==UCL_SUCCESS); success=success && (dev_ans.alloc(ans_elements*_max_local, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); if (_charge) { success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_q.row_bytes(); } if (_rot) { success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); _gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); _gpu_bytes+=dev_tag.row_bytes(); } } _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); _allocated=true; return success; } template <class numtyp, class acctyp> bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, const bool rot, UCL_Device &devi, const bool gpu_nbor, const bool bonds) { clear(); bool success=true; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _other=_charge || _rot; dev=&devi; _e_fields=1; if (_charge) _e_fields++; _ev_fields=6+_e_fields; // Initialize atom and nbor data int ef_inum=inum; if (ef_inum==0) ef_inum=1000; int ef_nall=nall; if (ef_nall<=ef_inum) ef_nall=ef_inum*2; // Initialize timers for the selected device time_pos.init(*dev); time_other.init(*dev); time_answer.init(*dev); time_pos.zero(); time_other.zero(); time_answer.zero(); _time_cast=0.0; #ifdef GPU_CAST compile_kernels(*dev); #endif return success && alloc(ef_inum,ef_nall); } +template <class 
numtyp, class acctyp> +bool PairGPUAtomT::add_fields(const bool charge, const bool rot) { + bool realloc=false; + if (charge && _charge==false) { + _charge=true; + _e_fields++; + _ev_fields++; + realloc=true; + } + if (rot && _rot==false) { + _rot=true; + realloc=true; + } + if (realloc) { + _other=_charge || _rot; + int max_local=_max_local; + int max_atoms=_max_atoms; + clear_resize(); + return alloc(max_local,max_atoms); + } + return true; +} + template <class numtyp, class acctyp> void PairGPUAtomT::clear_resize() { if (!_allocated) return; _allocated=false; dev_x.clear(); if (_charge) { dev_q.clear(); host_q.clear(); } if (_rot) { dev_quat.clear(); host_quat.clear(); } dev_ans.clear(); dev_engv.clear(); #ifndef GPU_CAST host_x.clear(); #else host_x_cast.clear(); host_type_cast.clear(); #endif host_ans.clear(); host_engv.clear(); dev_cell_id.clear(); dev_particle_id.clear(); dev_tag.clear(); #ifdef GPU_CAST dev_x_cast.clear(); dev_type_cast.clear(); #endif #ifndef USE_OPENCL #ifndef WINDLL if (_gpu_nbor) cudppDestroyPlan(sort_plan); #endif #endif } template <class numtyp, class acctyp> void PairGPUAtomT::clear() { _gpu_bytes=0; if (!_allocated) return; time_pos.clear(); time_other.clear(); time_answer.clear(); clear_resize(); _inum=0; _eflag=false; _vflag=false; #ifdef GPU_CAST if (_compiled) { k_cast_x.clear(); delete atom_program; _compiled=false; } #endif } template <class numtyp, class acctyp> double PairGPUAtomT::host_memory_usage() const { int atom_bytes=4; if (_charge) atom_bytes+=1; if (_rot) atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+ ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(PairGPUAtom<numtyp,acctyp>); } template <class numtyp, class acctyp> void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom) { time_answer.start(); _eflag=eflag; _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; int csize=_ev_fields; if (!eflag) csize-=_e_fields; if (!vflag) csize-=6; if (csize>0) ucl_copy(host_engv,dev_engv,_inum*csize,true); if (_rot) ucl_copy(host_ans,dev_ans,_inum*4*2,true); else ucl_copy(host_ans,dev_ans,_inum*4,true); time_answer.stop(); } template <class numtyp, class acctyp> void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom, int *ilist) { _ilist=ilist; copy_answers(eflag,vflag,ef_atom,vf_atom); } template <class numtyp, class acctyp> double PairGPUAtomT::energy_virial(double *eatom, double **vatom, double *virial) { if (_eflag==false && _vflag==false) return 0.0; double evdwl=0.0; if (_gpu_nbor) { for (int i=0; i<_inum; i++) { acctyp *ap=host_engv.begin()+i; if (_eflag) { if (_ef_atom) { evdwl+=*ap; eatom[i]+=*ap*0.5; ap+=_inum; } else { evdwl+=*ap; ap+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { vatom[i][j]+=*ap*0.5; virial[j]+=*ap; ap+=_inum; } } else { for (int j=0; j<6; j++) { virial[j]+=*ap; ap+=_inum; } } } } for (int j=0; j<6; j++) virial[j]*=0.5; } else { for (int i=0; i<_inum; i++) { acctyp *ap=host_engv.begin()+i; int ii=_ilist[i]; if (_eflag) { if (_ef_atom) { evdwl+=*ap; eatom[ii]+=*ap*0.5; ap+=_inum; } else { evdwl+=*ap; ap+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { vatom[ii][j]+=*ap*0.5; virial[j]+=*ap; ap+=_inum; } } else { for (int j=0; j<6; j++) { virial[j]+=*ap; ap+=_inum; } } } } for (int j=0; j<6; j++) virial[j]*=0.5; } evdwl*=0.5; return evdwl; } template <class numtyp, class acctyp> double PairGPUAtomT::energy_virial(double *eatom, double **vatom, double *virial, double &ecoul) { if (_eflag==false && _vflag==false) { ecoul=0.0; return 0.0; } if (_charge==false) return energy_virial(eatom,vatom,virial); double evdwl=0.0; double _ecoul=0.0; if (_gpu_nbor) { for (int i=0; i<_inum; i++) { acctyp *ap=host_engv.begin()+i; if (_eflag) { if
(_ef_atom) { evdwl+=*ap; eatom[i]+=*ap*0.5; ap+=_inum; _ecoul+=*ap; eatom[i]+=*ap*0.5; ap+=_inum; } else { evdwl+=*ap; ap+=_inum; _ecoul+=*ap; ap+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { vatom[i][j]+=*ap*0.5; virial[j]+=*ap; ap+=_inum; } } else { for (int j=0; j<6; j++) { virial[j]+=*ap; ap+=_inum; } } } } for (int j=0; j<6; j++) virial[j]*=0.5; } else { for (int i=0; i<_inum; i++) { acctyp *ap=host_engv.begin()+i; int ii=_ilist[i]; if (_eflag) { if (_ef_atom) { evdwl+=*ap; eatom[ii]+=*ap*0.5; ap+=_inum; _ecoul+=*ap; eatom[ii]+=*ap*0.5; ap+=_inum; } else { evdwl+=*ap; ap+=_inum; _ecoul+=*ap; ap+=_inum; } } if (_vflag) { if (_vf_atom) { for (int j=0; j<6; j++) { vatom[ii][j]+=*ap*0.5; virial[j]+=*ap; ap+=_inum; } } else { for (int j=0; j<6; j++) { virial[j]+=*ap; ap+=_inum; } } } } for (int j=0; j<6; j++) virial[j]*=0.5; } evdwl*=0.5; ecoul+=_ecoul*0.5; return evdwl; } template <class numtyp, class acctyp> void PairGPUAtomT::get_answers(double **f, double **tor) { acctyp *ap=host_ans.begin(); if (_gpu_nbor) { for (int i=0; i<_inum; i++) { f[i][0]+=*ap; ap++; f[i][1]+=*ap; ap++; f[i][2]+=*ap; ap+=2; } if (_rot) { for (int i=0; i<_inum; i++) { tor[i][0]+=*ap; ap++; tor[i][1]+=*ap; ap++; tor[i][2]+=*ap; ap+=2; } } } else { for (int i=0; i<_inum; i++) { int ii=_ilist[i]; f[ii][0]+=*ap; ap++; f[ii][1]+=*ap; ap++; f[ii][2]+=*ap; ap+=2; } if (_rot) { for (int i=0; i<_inum; i++) { int ii=_ilist[i]; tor[ii][0]+=*ap; ap++; tor[ii][1]+=*ap; ap++; tor[ii][2]+=*ap; ap+=2; } } } } // Sort arrays for neighbor list calculation template <class numtyp, class acctyp> void PairGPUAtomT::sort_neighbor(const int num_atoms) { #ifndef USE_OPENCL #ifdef WINDLL _win_sort(num_atoms,(unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin()); #else CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin(), 8*sizeof(unsigned), num_atoms); if (CUDPP_SUCCESS != result) { printf("Error in cudppSort\n"); NVD_GERYON_EXIT; } #endif #endif } #ifdef GPU_CAST #ifdef USE_OPENCL #include "pair_gpu_atom_cl.h" #else #include "pair_gpu_atom_ptx.h" #endif template <class numtyp, class acctyp> void PairGPUAtomT::compile_kernels(UCL_Device &dev) { atom_program=new UCL_Program(dev); atom_program->load_string(pair_gpu_atom_kernel,""); k_cast_x.set_function(*atom_program,"kernel_cast_x"); _compiled=true; } #endif template class PairGPUAtom<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index e0a1fd9fb..2f816be03 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -1,419 +1,424 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #ifndef PAIR_GPU_ATOM_H #define PAIR_GPU_ATOM_H #include <math.h> #include "mpi.h" #ifdef USE_OPENCL #include "geryon/ocl_device.h" #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" using namespace ucl_opencl; #else #include "cudpp.h" #include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" #include "geryon/nvd_kernel.h" using namespace ucl_cudadr; #endif #ifndef int2 struct int2 { int x; int y; }; #endif #include "pair_gpu_precision.h" template <class numtyp, class acctyp> class PairGPUAtom { public: PairGPUAtom(); ~PairGPUAtom() { clear(); } /// Maximum number of atoms that can be stored with current allocation inline int max_atoms() const { return _max_atoms; } /// Current number of local+ghost atoms stored inline int nall() const { return _nall; } /// Current number of local atoms stored inline int inum() const { return _inum; } /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } /// Set number of local atoms for future copy operations inline void inum(const int n) { _inum=n; } /// Memory usage per atom in this class int bytes_per_atom() const; /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor True if neighboring will be performed on device **/ bool init(const int inum, const int nall, const bool charge, const bool rot, UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); /// Check if we have enough device storage and realloc if not inline bool resize(const int inum, const int nall, bool &success) { _inum=inum; _nall=nall; if (inum>_max_local || nall>_max_atoms) { clear_resize(); success = success && alloc(inum,nall); return true; } return false; } + + /// If already initialized by another LAMMPS style, add fields as necessary + /** \param charge True if atom storage needs charges + * \param rot True if atom storage needs quaternions **/ + bool add_fields(const bool charge, const bool rot); /// Only free matrices of length inum or nall for resizing void clear_resize(); /// Free all memory on host and device void clear(); /// Return the total amount of host memory used by class in bytes double host_memory_usage() const; /// Sort arrays for neighbor list calculation on device void sort_neighbor(const int num_atoms); /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); time_answer.add_to_total(); if (_other) time_other.add_to_total(); } /// Zero the copy timers inline void zero_timers() { time_pos.zero(); time_answer.zero(); if (_other) time_other.zero(); } /// Return the total time for host/device data transfer inline double transfer_time() { double total=time_pos.total_seconds()+time_answer.total_seconds(); if (_other) total+=time_other.total_seconds(); return total; } /// Return the total time for data cast/pack inline double cast_time() { return _time_cast; } /// Pack LAMMPS atom type constants into matrix and copy to device template <class dev_typ, class t1> inline void type_pack1(const int n, const int m_size, UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, t1 **one) { int ii=0; for (int i=0; i<n; i++) { for (int j=0; j<n; j++) {
buffer[ii]=static_cast<numtyp>(one[i][j]); ii++; } ii+=m_size-n; } UCL_H_Vec<dev_typ> view; view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } /// Pack LAMMPS atom type constants into 2 vectors and copy to device template <class dev_typ, class t1, class t2> inline void type_pack2(const int n, const int m_size, UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) { int ii=0; for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { buffer[ii*2]=static_cast<numtyp>(one[i][j]); buffer[ii*2+1]=static_cast<numtyp>(two[i][j]); ii++; } ii+=m_size-n; } UCL_H_Vec<dev_typ> view; view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device template <class dev_typ, class t1, class t2, class t3> inline void type_pack4(const int n, const int m_size, UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two, t3 **three) { int ii=0; for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { buffer[ii*4]=static_cast<numtyp>(one[i][j]); buffer[ii*4+1]=static_cast<numtyp>(two[i][j]); buffer[ii*4+2]=static_cast<numtyp>(three[i][j]); ii++; } ii+=m_size-n; } UCL_H_Vec<dev_typ> view; view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device template <class dev_typ, class t1, class t2, class t3, class t4> inline void type_pack4(const int n, const int m_size, UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two, t3 **three, t4 **four) { int ii=0; for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { buffer[ii*4]=static_cast<numtyp>(one[i][j]); buffer[ii*4+1]=static_cast<numtyp>(two[i][j]); buffer[ii*4+2]=static_cast<numtyp>(three[i][j]); buffer[ii*4+3]=static_cast<numtyp>(four[i][j]); ii++; } ii+=m_size-n; } UCL_H_Vec<dev_typ> view; view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); ucl_copy(dev_v,view,false); } /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device template <class dev_typ, class t1, class t2> inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) { for (int i=0; i<n; i++) { buffer[i*2]=static_cast<numtyp>(one[i][i]); buffer[i*2+1]=static_cast<numtyp>(two[i][i]); } UCL_H_Vec<dev_typ> view; view.view((dev_typ*)buffer.begin(),n,*dev); ucl_copy(dev_v,view,false); } // -------------------------COPY TO GPU ---------------------------------- /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { double t=MPI_Wtime(); #ifdef GPU_CAST memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); #else numtyp *_write_loc=host_x.begin(); for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i][0]; _write_loc++; *_write_loc=host_ptr[i][1]; _write_loc++; *_write_loc=host_ptr[i][2]; _write_loc++; *_write_loc=host_type[i]; _write_loc++; } #endif _time_cast+=MPI_Wtime()-t; } /// Copy positions and types to device asynchronously /** Copies nall() elements **/ inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); #ifdef GPU_CAST ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); ucl_copy(dev_type_cast,host_type_cast,_nall,true); int block_size=64; int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size)); k_cast_x.set_size(GX,block_size); k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), 
&_nall); #else ucl_copy(dev_x,host_x,_nall*4,true); #endif time_pos.stop(); } /// Calls cast_x_data and add_x_data and times the routines inline void cast_copy_x(double **host_ptr, int *host_type) { cast_x_data(host_ptr,host_type); add_x_data(host_ptr,host_type); } /// Cast charges to write buffer template<class cpytyp> inline void cast_q_data(cpytyp *host_ptr) { double t=MPI_Wtime(); if (dev->device_type()==UCL_CPU) { if (sizeof(numtyp)==sizeof(double)) { host_q.view((numtyp*)host_ptr,_nall,*dev); dev_q.view(host_q); } else for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; } else { if (sizeof(numtyp)==sizeof(double)) memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); else for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; } _time_cast+=MPI_Wtime()-t; } /// Copy charges to device asynchronously inline void add_q_data() { ucl_copy(dev_q,host_q,_nall,true); } /// Cast quaternions to write buffer template<class cpytyp> inline void cast_quat_data(cpytyp *host_ptr) { double t=MPI_Wtime(); if (dev->device_type()==UCL_CPU) { if (sizeof(numtyp)==sizeof(double)) { host_quat.view((numtyp*)host_ptr,_nall*4,*dev); dev_quat.view(host_quat); } else for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; } else { if (sizeof(numtyp)==sizeof(double)) memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); else for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; } _time_cast+=MPI_Wtime()-t; } /// Copy quaternions to device /** Copies nall()*4 elements **/ inline void add_quat_data() { ucl_copy(dev_quat,host_quat,_nall*4,true); } /// Copy data other than pos and data to device inline void add_other_data() { time_other.start(); if (_charge) add_q_data(); if (_rot) add_quat_data(); time_other.stop(); } /// Return number of bytes used on device inline double gpu_bytes() { return _gpu_bytes; } // -------------------------COPY FROM GPU ------------------------------- /// Copy answers from device into read buffer asynchronously void copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom); /// Copy answers from device into read buffer asynchronously void copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom, int *ilist); /// Copy energy and virial data into LAMMPS memory double energy_virial(double *eatom, double **vatom, double *virial); /// Copy energy and virial data into LAMMPS memory double energy_virial(double *eatom, double **vatom, double *virial, double &ecoul); /// Add forces and torques from the GPU into a LAMMPS pointer void get_answers(double **f, double **tor); // ------------------------------ DATA ---------------------------------- /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type UCL_D_Vec<numtyp> dev_x; /// Charges UCL_D_Vec<numtyp> dev_q; /// Quaterions UCL_D_Vec<numtyp> dev_quat; /// Force and possibly torque UCL_D_Vec<acctyp> dev_ans; /// Energy and virial per-atom storage UCL_D_Vec<acctyp> dev_engv; #ifdef GPU_CAST UCL_D_Vec<double> dev_x_cast; UCL_D_Vec<int> dev_type_cast; UCL_H_Vec<double> host_x_cast; UCL_H_Vec<int> host_type_cast; #endif /// Buffer for moving positions to device UCL_H_Vec<numtyp> host_x; /// Buffer for moving charge data to GPU UCL_H_Vec<numtyp> host_q; /// Buffer for moving quat data to GPU UCL_H_Vec<numtyp> host_quat; /// Force and possibly torque data on host UCL_H_Vec<acctyp> host_ans; /// Energy/virial data on host UCL_H_Vec<acctyp> host_engv; /// Cell list identifiers for device nbor builds UCL_D_Vec<unsigned> dev_cell_id; /// Cell list identifiers for device nbor 
builds UCL_D_Vec<int> dev_particle_id; /// Atom tag information for device nbor builds UCL_D_Vec<int> dev_tag; /// Device timers UCL_Timer time_pos, time_other, time_answer; /// Geryon device UCL_Device *dev; private: #ifdef GPU_CAST UCL_Program *atom_program; UCL_Kernel k_cast_x; void compile_kernels(UCL_Device &dev); #endif bool _compiled; bool alloc(const int inum, const int nall); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields; bool _gpu_nbor, _bonds; int *_ilist; double _time_cast; double _gpu_bytes; bool _newton; #ifndef USE_OPENCL CUDPPConfiguration sort_config; CUDPPHandle sort_plan; #endif }; #endif diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index d02618e22..09728c77d 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -1,286 +1,287 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov ------------------------------------------------------------------------- */ #include "pair_gpu_device.h" #include "pair_gpu_precision.h" #include <map> #include <math.h> #ifdef _OPENMP #include <omp.h> #endif #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp> template <class numtyp, class acctyp> PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), _last_device(0) { } template <class numtyp, class acctyp> PairGPUDeviceT::~PairGPUDevice() { clear_device(); } template <class numtyp, class acctyp> bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double p_split, const int nthreads) { _nthreads=nthreads; #ifdef _OPENMP omp_set_num_threads(nthreads); #endif if (_device_init) return true; _device_init=true; _comm_world=world; _comm_replica=replica; _first_device=first_gpu; _last_device=last_gpu; _gpu_mode=gpu_mode; _particle_split=p_split; // Get the rank/size within the world MPI_Comm_rank(_comm_world,&_world_me); MPI_Comm_size(_comm_world,&_world_size); // Get the rank/size within the replica MPI_Comm_rank(_comm_replica,&_replica_me); MPI_Comm_size(_comm_replica,&_replica_size); // Get the names of all nodes int name_length; char node_name[MPI_MAX_PROCESSOR_NAME]; char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; MPI_Get_processor_name(node_name,&name_length); MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); // Get the number of procs per node std::map<std::string,int> name_map; std::map<std::string,int>::iterator np; for (int i=0; i<_world_size; i++) { std::string i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]); np=name_map.find(i_string); if (np==name_map.end()) name_map[i_string]=1; else np->second++; } int 
procs_per_node=name_map.begin()->second; // Assign a unique id to each node int split_num=0, split_id=0; for (np=name_map.begin(); np!=name_map.end(); ++np) { if (np->first==node_string) split_id=split_num; split_num++; } // Set up a per node communicator and find rank within MPI_Comm node_comm; MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/ (last_gpu-first_gpu+1))); int my_gpu=node_rank/_procs_per_gpu; // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return false; gpu->set(my_gpu); return true; } template <class numtyp, class acctyp> bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, const int host_nlocal, const int nall, const int maxspecial, const bool gpu_nbor, const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut) { if (!_device_init) return false; if (_init_count==0) { // Initialize atom and nbor data int ef_nlocal=nlocal; if (_particle_split<1.0 && _particle_split>0.0) ef_nlocal=static_cast<int>(_particle_split*nlocal); if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, gpu_nbor && maxspecial>0)) return false; if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, gpu_host,pre_cut)) return false; nbor.cell_size(cell_size); } else { + if (!atom.add_fields(charge,rot)) return false; if (cell_size>nbor.cell_size()) nbor.cell_size(cell_size); } _init_count++; return true; } template <class numtyp, class acctyp> void PairGPUDeviceT::init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu) { #ifdef USE_OPENCL std::string fs=""; #else std::string fs=toa(gpu->free_gigabytes())+"/"; #endif if (_replica_me == 0 && screen) { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu); #ifdef _OPENMP fprintf(screen,"- with %d threads per proc.\n",_nthreads); #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); for (int i=first_gpu; i<=last_gpu; i++) { std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+ " GHZ ("; if (sizeof(PRECISION)==4) { if (sizeof(ACC_PRECISION)==4) sname+="Single Precision)"; else sname+="Mixed Precision)"; } else sname+="Double Precision)"; fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); } fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n\n"); } } template <class numtyp, class acctyp> void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, const double max_bytes, FILE *screen) { double single[5], times[5]; single[0]=atom.transfer_time(); single[1]=nbor.time_nbor.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); single[4]=atom.cast_time(); MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes; double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); if (replica_me()==0) if (screen && times[3]>0.0) {
fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," GPU Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); if (procs_per_gpu()==1) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } } template <class numtyp, class acctyp> void PairGPUDeviceT::clear() { if (_init_count>0) { _init_count--; if (_init_count==0) { atom.clear(); nbor.clear(); } } } template <class numtyp, class acctyp> void PairGPUDeviceT::clear_device() { while (_init_count>0) clear(); if (_device_init) { delete gpu; _device_init=false; } } template <class numtyp, class acctyp> double PairGPUDeviceT::host_memory_usage() const { return atom.host_memory_usage()+ nbor.host_memory_usage()+4*sizeof(numtyp)+ sizeof(PairGPUDevice<numtyp,acctyp>); } template class PairGPUDevice<PRECISION,ACC_PRECISION>; PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device; bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads) { return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, particle_split,nthreads); } void lmp_clear_device() { pair_gpu_device.clear_device(); } double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { if (pair_gpu_device.init_count()) { pair_gpu_device.stop_host_timer(); pair_gpu_device.gpu->sync(); double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul); pair_gpu_device.atom.get_answers(f,tor); return evdw; } return 0.0; } diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp index 00a2c5d65..12137406e 100644 --- a/src/pair_hybrid.cpp +++ b/src/pair_hybrid.cpp @@ -1,681 +1,677 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #include "math.h" #include "string.h" #include "ctype.h" #include "pair_hybrid.h" #include "atom.h" #include "force.h" #include "pair.h" #include "neighbor.h" #include "neigh_request.h" #include "update.h" #include "comm.h" #include "memory.h" #include "error.h" using namespace LAMMPS_NS; #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) /* ---------------------------------------------------------------------- */ PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp) { nstyles = 0; } /* ---------------------------------------------------------------------- */ PairHybrid::~PairHybrid() { if (nstyles) { for (int m = 0; m < nstyles; m++) delete styles[m]; delete [] styles; for (int m = 0; m < nstyles; m++) delete [] keywords[m]; delete [] keywords; } if (allocated) { memory->destroy_2d_int_array(setflag); memory->destroy_2d_double_array(cutsq); memory->destroy_2d_int_array(nmap); memory->destroy_3d_int_array(map); } } /* ---------------------------------------------------------------------- call each sub-style's compute function accumulate sub-style global/peratom energy/virial in hybrid for global vflag = 1: each sub-style computes own virial[6] sum sub-style virial[6] to hybrid's virial[6] for global vflag = 2: call sub-style with adjusted vflag to prevent it calling virial_compute() hybrid calls virial_compute() on final accumulated f ------------------------------------------------------------------------- */ void PairHybrid::compute(int eflag, int vflag) { int i,j,m,n; // if no_virial_compute is set and global component of incoming vflag = 2 // reset vflag as if global component were 1 // necessary since one or more sub-styles cannot compute virial as F dot r if (no_virial_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4; if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = eflag_global = vflag_global = eflag_atom = vflag_atom = 0; // check if global component of incoming vflag = 2 // if so, reset vflag passed to substyle as if it were 0 // necessary so substyle will not invoke virial_compute() int vflag_substyle; if (vflag % 4 == 2) vflag_substyle = vflag/4 * 4; else vflag_substyle = vflag; for (m = 0; m < nstyles; m++) { styles[m]->compute(eflag,vflag_substyle); if (eflag_global) { eng_vdwl += styles[m]->eng_vdwl; eng_coul += styles[m]->eng_coul; } if (vflag_global) { for (n = 0; n < 6; n++) virial[n] += styles[m]->virial[n]; } if (eflag_atom) { n = atom->nlocal; if (force->newton_pair) n += atom->nghost; double *eatom_substyle = styles[m]->eatom; for (i = 0; i < n; i++) eatom[i] += eatom_substyle[i]; } if (vflag_atom) { n = atom->nlocal; if (force->newton_pair) n += atom->nghost; double **vatom_substyle = styles[m]->vatom; for (i = 0; i < n; i++) for (j = 0; j < 6; j++) vatom[i][j] += vatom_substyle[i][j]; } } if (vflag_fdotr) virial_compute(); } /* ---------------------------------------------------------------------- */ void PairHybrid::compute_inner() { for (int m = 0; m < nstyles; m++) if (styles[m]->respa_enable) styles[m]->compute_inner(); } /* ---------------------------------------------------------------------- */ void PairHybrid::compute_middle() { for (int m = 0; m < nstyles; m++) if (styles[m]->respa_enable) styles[m]->compute_middle(); } /* ---------------------------------------------------------------------- */ void PairHybrid::compute_outer(int eflag, int vflag) { for (int m = 0; m < nstyles; m++) if (styles[m]->respa_enable) styles[m]->compute_outer(eflag,vflag); } /* ---------------------------------------------------------------------- allocate all arrays ------------------------------------------------------------------------- */ void PairHybrid::allocate() { allocated = 1; int n = atom->ntypes; setflag = memory->create_2d_int_array(n+1,n+1,"pair:setflag"); for (int i = 1; i <= n; i++) for (int j = i; j <= n; j++) setflag[i][j] = 0; cutsq = 
memory->create_2d_double_array(n+1,n+1,"pair:cutsq"); nmap = memory->create_2d_int_array(n+1,n+1,"pair:nmap"); map = memory->create_3d_int_array(n+1,n+1,nstyles,"pair:map"); for (int i = 1; i <= n; i++) for (int j = i; j <= n; j++) nmap[i][j] = 0; } /* ---------------------------------------------------------------------- create one pair style for each arg in list ------------------------------------------------------------------------- */ void PairHybrid::settings(int narg, char **arg) { int i,m,istyle; if (narg < 1) error->all("Illegal pair_style command"); // delete old lists, since cannot just change settings if (nstyles) { for (m = 0; m < nstyles; m++) delete styles[m]; delete [] styles; for (m = 0; m < nstyles; m++) delete [] keywords[m]; delete [] keywords; } if (allocated) { memory->destroy_2d_int_array(setflag); memory->destroy_2d_double_array(cutsq); memory->destroy_2d_int_array(nmap); memory->destroy_3d_int_array(map); } allocated = 0; // count sub-styles by skipping numeric args // exception is 1st arg of style "table", which is non-numeric word // exception is 1st two args of style "lj/coul", which are non-numeric // exception is 1st two args of style "buck/coul", which are non-numeric - // exception is 1st arg of any "gpu" style, which is non-numeric // exception is 1st arg of reax/c style, which is non-numeric // need a better way to skip these exceptions nstyles = 0; i = 0; while (i < narg) { if (strcmp(arg[i],"table") == 0) i++; if (strcmp(arg[i],"lj/coul") == 0) i += 2; if (strcmp(arg[i],"buck/coul") == 0) i += 2; - if (strstr(arg[i],"gpu")) i++; if (strcmp(arg[i],"reax/c") == 0) i++; i++; while (i < narg && !isalpha(arg[i][0])) i++; nstyles++; } // allocate list of sub-styles styles = new Pair*[nstyles]; keywords = new char*[nstyles]; // allocate each sub-style and call its settings() with subset of args // define subset of args for a sub-style by skipping numeric args // exception is 1st arg of style "table", which is non-numeric // exception is 1st two args of style "lj/coul", which are non-numeric // exception is 1st two args of style "buck/coul", which are non-numeric - // exception is 1st arg of any "gpu" style, which is non-numeric // exception is 1st arg of reax/c style, which is non-numeric // need a better way to skip these exceptions nstyles = 0; i = 0; while (i < narg) { for (m = 0; m < nstyles; m++) if (strcmp(arg[i],keywords[m]) == 0) error->all("Pair style hybrid cannot use same pair style twice"); if (strcmp(arg[i],"hybrid") == 0) error->all("Pair style hybrid cannot have hybrid as an argument"); if (strcmp(arg[i],"none") == 0) error->all("Pair style hybrid cannot have none as an argument"); styles[nstyles] = force->new_pair(arg[i]); keywords[nstyles] = new char[strlen(arg[i])+1]; strcpy(keywords[nstyles],arg[i]); istyle = i; if (strcmp(arg[i],"table") == 0) i++; if (strcmp(arg[i],"lj/coul") == 0) i += 2; if (strcmp(arg[i],"buck/coul") == 0) i += 2; - if (strstr(arg[i],"gpu")) i++; if (strcmp(arg[i],"reax/c") == 0) i++; i++; while (i < narg && !isalpha(arg[i][0])) i++; styles[nstyles]->settings(i-istyle-1,&arg[istyle+1]); nstyles++; } // set comm_forward, comm_reverse to max of any sub-style for (m = 0; m < nstyles; m++) { if (styles[m]) comm_forward = MAX(comm_forward,styles[m]->comm_forward); if (styles[m]) comm_reverse = MAX(comm_reverse,styles[m]->comm_reverse); } // single_enable = 0 if any sub-style = 0 // respa_enable = 1 if any sub-style is set // no_virial_compute = 1 if any sub-style is set for (m = 0; m < nstyles; m++) if 
(styles[m]->single_enable == 0) single_enable = 0; for (m = 0; m < nstyles; m++) if (styles[m]->respa_enable) respa_enable = 1; for (m = 0; m < nstyles; m++) if (styles[m]->no_virial_compute) no_virial_compute = 1; } /* ---------------------------------------------------------------------- set coeffs for one or more type pairs ------------------------------------------------------------------------- */ void PairHybrid::coeff(int narg, char **arg) { if (narg < 3) error->all("Incorrect args for pair coefficients"); if (!allocated) allocate(); int ilo,ihi,jlo,jhi; force->bounds(arg[0],atom->ntypes,ilo,ihi); force->bounds(arg[1],atom->ntypes,jlo,jhi); // 3rd arg = pair sub-style name // allow for "none" as valid sub-style name int m; for (m = 0; m < nstyles; m++) if (strcmp(arg[2],keywords[m]) == 0) break; int none = 0; if (m == nstyles) { if (strcmp(arg[2],"none") == 0) none = 1; else error->all("Pair coeff for hybrid has invalid style"); } // move 1st/2nd args to 2nd/3rd args // just copy ptrs, since arg[] points into original input line arg[2] = arg[1]; arg[1] = arg[0]; // invoke sub-style coeff() starting with 1st arg if (!none) styles[m]->coeff(narg-1,&arg[1]); // if sub-style only allows one pair coeff call (with * * and type mapping) // then unset setflag/map assigned to that style before setting it below // in case pair coeff for this sub-style is being called for 2nd time if (!none && styles[m]->one_coeff) for (int i = 1; i <= atom->ntypes; i++) for (int j = i; j <= atom->ntypes; j++) if (nmap[i][j] && map[i][j][0] == m) { setflag[i][j] = 0; nmap[i][j] = 0; } // set setflag and which type pairs map to which sub-style // if sub-style is none: set hybrid setflag, wipe out map // else: set hybrid setflag & map only if substyle setflag is set // previous mappings are wiped out int count = 0; for (int i = ilo; i <= ihi; i++) { for (int j = MAX(jlo,i); j <= jhi; j++) { if (none) { setflag[i][j] = 1; nmap[i][j] = 0; count++; } else if (styles[m]->setflag[i][j]) { setflag[i][j] = 1; nmap[i][j] = 1; map[i][j][0] = m; count++; } } } if (count == 0) error->all("Incorrect args for pair coefficients"); } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairHybrid::init_style() { int i,m,itype,jtype,used,istyle,skip; // error if a sub-style is not used int ntypes = atom->ntypes; for (istyle = 0; istyle < nstyles; istyle++) { used = 0; for (itype = 1; itype <= ntypes; itype++) for (jtype = itype; jtype <= ntypes; jtype++) for (m = 0; m < nmap[itype][jtype]; m++) if (map[itype][jtype][m] == istyle) used = 1; if (used == 0) error->all("Pair hybrid sub-style is not used"); } // each sub-style makes its neighbor list request(s) for (istyle = 0; istyle < nstyles; istyle++) styles[istyle]->init_style(); // create skip lists for each pair neigh request // any kind of list can have its skip flag set at this stage for (i = 0; i < neighbor->nrequest; i++) { if (!neighbor->requests[i]->pair) continue; // istyle = associated sub-style for (istyle = 0; istyle < nstyles; istyle++) if (styles[istyle] == neighbor->requests[i]->requestor) break; // allocate iskip and ijskip // initialize so as to skip all pair types // set ijskip = 0 if type pair matches any entry in sub-style map // set ijskip = 0 if mixing will assign type pair to this sub-style // will occur if type pair is currently unassigned // and both I,I and J,J are assigned to single sub-style // and sub-style for both 
I,I and J,J match istyle // set iskip = 1 only if all ijskip for itype are 1 int *iskip = new int[ntypes+1]; int **ijskip = memory->create_2d_int_array(ntypes+1,ntypes+1, "pair_hybrid:ijskip"); for (itype = 1; itype <= ntypes; itype++) for (jtype = 1; jtype <= ntypes; jtype++) ijskip[itype][jtype] = 1; for (itype = 1; itype <= ntypes; itype++) for (jtype = itype; jtype <= ntypes; jtype++) { for (m = 0; m < nmap[itype][jtype]; m++) if (map[itype][jtype][m] == istyle) ijskip[itype][jtype] = ijskip[jtype][itype] = 0; if (nmap[itype][jtype] == 0 && nmap[itype][itype] == 1 && map[itype][itype][0] == istyle && nmap[jtype][jtype] == 1 && map[jtype][jtype][0] == istyle) ijskip[itype][jtype] = ijskip[jtype][itype] = 0; } for (itype = 1; itype <= ntypes; itype++) { iskip[itype] = 1; for (jtype = 1; jtype <= ntypes; jtype++) if (ijskip[itype][jtype] == 0) iskip[itype] = 0; } // if any skipping occurs // set request->skip and copy iskip and ijskip into request // else delete iskip and ijskip skip = 0; for (itype = 1; itype <= ntypes; itype++) for (jtype = 1; jtype <= ntypes; jtype++) if (ijskip[itype][jtype] == 1) skip = 1; if (skip) { neighbor->requests[i]->skip = 1; neighbor->requests[i]->iskip = iskip; neighbor->requests[i]->ijskip = ijskip; } else { delete [] iskip; memory->destroy_2d_int_array(ijskip); } } // combine sub-style neigh list requests and create new ones if needed modify_requests(); } /* ---------------------------------------------------------------------- init for one type pair i,j and corresponding j,i ------------------------------------------------------------------------- */ double PairHybrid::init_one(int i, int j) { // if I,J is not set explicitly: // perform mixing only if I,I sub-style = J,J sub-style // also require I,I and J,J are both assigned to single sub-style if (setflag[i][j] == 0) { if (nmap[i][i] != 1 || nmap[j][j] != 1 || map[i][i][0] != map[j][j][0]) error->one("All pair coeffs are not set"); nmap[i][j] = 1; map[i][j][0] = map[i][i][0]; } // call init/mixing for all sub-styles of I,J // set cutsq in sub-style just as pair::init_one() does // sum tail corrections for I,J // return max cutoff of all sub-styles assigned to I,J // if no sub-styles assigned to I,J (pair_coeff none), cutmax = 0.0 returned double cutmax = 0.0; if (tail_flag) etail_ij = ptail_ij = 0.0; nmap[j][i] = nmap[i][j]; for (int k = 0; k < nmap[i][j]; k++) { map[j][i][k] = map[i][j][k]; double cut = styles[map[i][j][k]]->init_one(i,j); styles[map[i][j][k]]->cutsq[i][j] = styles[map[i][j][k]]->cutsq[j][i] = cut*cut; if (tail_flag) { etail_ij += styles[map[i][j][k]]->etail_ij; ptail_ij += styles[map[i][j][k]]->ptail_ij; } cutmax = MAX(cutmax,cut); } return cutmax; } /* ---------------------------------------------------------------------- combine sub-style neigh list requests and create new ones if needed ------------------------------------------------------------------------- */ void PairHybrid::modify_requests() { int i,j; NeighRequest *irq,*jrq; // loop over pair requests only // if list is skip list and not copy, look for non-skip list of same kind // if one exists, point at that one // else make new non-skip request of same kind and point at that one // don't bother to set ID for new request, since pair hybrid ignores list // only exception is half_from_full: // ignore it, turn off skip, since it will derive from its skip parent // after possible new request creation, unset skip flag and otherlist // for these derived lists: granhistory, rRESPA inner/middle // this prevents neighbor from 
treating them as skip lists // copy list check is for pair style = hybrid/overlay // which invokes this routine for (i = 0; i < neighbor->nrequest; i++) { if (!neighbor->requests[i]->pair) continue; irq = neighbor->requests[i]; if (irq->skip == 0 || irq->copy) continue; if (irq->half_from_full) { irq->skip = 0; continue; } for (j = 0; j < neighbor->nrequest; j++) { if (!neighbor->requests[j]->pair) continue; jrq = neighbor->requests[j]; if (irq->same_kind(jrq) && jrq->skip == 0) break; } if (j < neighbor->nrequest) irq->otherlist = j; else { int newrequest = neighbor->request(this); neighbor->requests[newrequest]->copy_request(irq); irq->otherlist = newrequest; } if (irq->granhistory || irq->respainner || irq->respamiddle) { irq->skip = 0; irq->otherlist = -1; } } } /* ---------------------------------------------------------------------- proc 0 writes to restart file ------------------------------------------------------------------------- */ void PairHybrid::write_restart(FILE *fp) { fwrite(&nstyles,sizeof(int),1,fp); // each sub-style writes its settings, but no coeff info int n; for (int m = 0; m < nstyles; m++) { n = strlen(keywords[m]) + 1; fwrite(&n,sizeof(int),1,fp); fwrite(keywords[m],sizeof(char),n,fp); styles[m]->write_restart_settings(fp); } } /* ---------------------------------------------------------------------- proc 0 reads from restart file, bcasts ------------------------------------------------------------------------- */ void PairHybrid::read_restart(FILE *fp) { int me = comm->me; if (me == 0) fread(&nstyles,sizeof(int),1,fp); MPI_Bcast(&nstyles,1,MPI_INT,0,world); styles = new Pair*[nstyles]; keywords = new char*[nstyles]; // each sub-style is created via new_pair() // each reads its settings, but no coeff info int n; for (int m = 0; m < nstyles; m++) { if (me == 0) fread(&n,sizeof(int),1,fp); MPI_Bcast(&n,1,MPI_INT,0,world); keywords[m] = new char[n]; if (me == 0) fread(keywords[m],sizeof(char),n,fp); MPI_Bcast(keywords[m],n,MPI_CHAR,0,world); styles[m] = force->new_pair(keywords[m]); styles[m]->read_restart_settings(fp); } } /* ---------------------------------------------------------------------- call sub-style to compute single interaction error if sub-style does not support single() call since overlay could have multiple sub-styles, sum results explicitly ------------------------------------------------------------------------- */ double PairHybrid::single(int i, int j, int itype, int jtype, double rsq, double factor_coul, double factor_lj, double &fforce) { if (nmap[itype][jtype] == 0) error->one("Invoked pair single on pair style none"); double fone; fforce = 0.0; double esum = 0.0; for (int m = 0; m < nmap[itype][jtype]; m++) { if (rsq < styles[map[itype][jtype][m]]->cutsq[itype][jtype]) { if (styles[map[itype][jtype][m]]->single_enable == 0) error->all("Pair hybrid sub-style does not support single call"); esum += styles[map[itype][jtype][m]]-> single(i,j,itype,jtype,rsq,factor_coul,factor_lj,fone); fforce += fone; } } return esum; } /* ---------------------------------------------------------------------- modify parameters of the pair style call modify_params of PairHybrid also pass command args to each sub-style of hybrid ------------------------------------------------------------------------- */ void PairHybrid::modify_params(int narg, char **arg) { Pair::modify_params(narg,arg); for (int m = 0; m < nstyles; m++) styles[m]->modify_params(narg,arg); } /* ---------------------------------------------------------------------- memory usage of each sub-style 
------------------------------------------------------------------------- */ double PairHybrid::memory_usage() { double bytes = maxeatom * sizeof(double); bytes += maxvatom*6 * sizeof(double); for (int m = 0; m < nstyles; m++) bytes += styles[m]->memory_usage(); return bytes; } /* ---------------------------------------------------------------------- extract a ptr to a particular quantity stored by pair pass request thru to sub-styles return first non-NULL result except for cut_coul request for cut_coul, insure all non-NULL results are equal since required by Kspace ------------------------------------------------------------------------- */ void *PairHybrid::extract(char *str, int &dim) { void *cutptr = NULL; void *ptr; double cutvalue; for (int m = 0; m < nstyles; m++) { ptr = styles[m]->extract(str,dim); if (ptr && strcmp(str,"cut_coul") == 0) { double *p_newvalue = (double *) ptr; double newvalue = *p_newvalue; if (cutptr && newvalue != cutvalue) error->all("Coulomb cutoffs of pair hybrid sub-styles do not match"); cutptr = ptr; cutvalue = newvalue; } else if (ptr) return ptr; } if (strcmp(str,"cut_coul") == 0) return cutptr; return NULL; } /* ---------------------------------------------------------------------- */ void PairHybrid::reset_dt() { for (int m = 0; m < nstyles; m++) styles[m]->reset_dt(); } /* ---------------------------------------------------------------------- check if itype,jtype maps to sub-style ------------------------------------------------------------------------- */ int PairHybrid::check_ijtype(int itype, int jtype, char *substyle) { for (int m = 0; m < nmap[itype][jtype]; m++) if (strcmp(keywords[map[itype][jtype][m]],substyle) == 0) return 1; return 0; }
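
For reference, a minimal standalone model of the bookkeeping the new PairGPUAtom::add_fields() must keep consistent: one energy field is always stored, a second is added when charges are enabled, and _ev_fields is always the energy field count plus six virial components. The struct below is an illustration with invented names, not the library class; it mirrors only the flag/counter logic, and a true return stands in for the device-buffer reallocation the real method triggers.

#include <cstdio>

// Illustrative model of PairGPUAtom's field counters (hypothetical names).
struct FieldModel {
  bool charge, rot;
  int e_fields;                                 // 1 always; +1 with charges
  FieldModel() : charge(false), rot(false), e_fields(1) {}
  int ev_fields() const { return 6+e_fields; }  // energy fields + 6 virial
  // Returns true when device buffers would need reallocation.
  bool add_fields(bool c, bool r) {
    bool realloc=false;
    if (c && charge==false) { charge=true; e_fields++; realloc=true; }
    if (r && rot==false) { rot=true; realloc=true; }
    return realloc;
  }
};

int main() {
  FieldModel m;
  printf("initial: e_fields=%d ev_fields=%d\n",m.e_fields,m.ev_fields());
  bool need=m.add_fields(true,false);           // a charged style joins later
  printf("charged: e_fields=%d ev_fields=%d realloc=%d\n",
         m.e_fields,m.ev_fields(),(int)need);
  return 0;
}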
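The pointer arithmetic in energy_virial() assumes host_engv is laid out field-major: field f of atom i lives at engv[f*inum+i], so stepping a per-atom pointer by inum advances to the next field. A self-contained sketch of that traversal with plain doubles; the layout follows the code above, while the fill values are made up for display.

#include <cstdio>
#include <vector>

int main() {
  const int inum=3, e_fields=1, ev_fields=6+e_fields;
  std::vector<double> engv(ev_fields*inum);
  for (int f=0; f<ev_fields; f++)       // field-major fill
    for (int i=0; i<inum; i++)
      engv[f*inum+i]=100.0*f+i;         // recognizable pattern
  for (int i=0; i<inum; i++) {
    const double *ap=engv.data()+i;     // column for atom i
    double evdwl=*ap; ap+=inum;         // energy field
    double virial0=*ap;                 // first virial component
    printf("atom %d: evdwl=%g virial[0]=%g\n",i,evdwl,virial0);
  }
  return 0;
}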
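PairGPUDevice::init_device() spreads a node's MPI ranks across the requested GPU range: _procs_per_gpu is the ceiling of procs_per_node over the device count, and a rank's GPU is its node rank divided by that. A standalone check of the arithmetic; the counts here are arbitrary example values.

#include <cmath>
#include <cstdio>

int main() {
  const int procs_per_node=6, first_gpu=0, last_gpu=1;
  // Same expression as init_device(): ceil(procs_per_node/ngpu).
  const int procs_per_gpu=static_cast<int>(
    ceil(static_cast<double>(procs_per_node)/(last_gpu-first_gpu+1)));
  for (int node_rank=0; node_rank<procs_per_node; node_rank++)
    printf("node_rank %d -> GPU %d\n",node_rank,node_rank/procs_per_gpu);
  return 0;   // ranks 0-2 -> GPU 0, ranks 3-5 -> GPU 1
}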
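PairHybrid::compute() keeps the global virial request in vflag % 4 and the per-atom part in vflag / 4: 1 + vflag/4*4 rewrites the global component to 1 (so hybrid accumulates sub-style virials instead of F dot r), and vflag/4*4 zeroes it so a sub-style never calls virial_compute() itself. A quick standalone check of that arithmetic:

#include <cstdio>

int main() {
  for (int vflag=0; vflag<12; vflag++) {
    int hybrid=vflag, substyle=vflag;
    if (vflag%4==2) {
      hybrid=1+vflag/4*4;    // global component 2 -> 1 (no fdotr)
      substyle=vflag/4*4;    // global component 2 -> 0 for sub-styles
    }
    printf("vflag=%2d -> hybrid=%2d substyle=%2d\n",vflag,hybrid,substyle);
  }
  return 0;
}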