fix_intel.h
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Thu, Jul 3, 05:58

fix_intel.h
View Options

	/* -- c++ -- ----------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing author: W. Michael Brown (Intel)
	------------------------------------------------------------------------- */

	#ifdef FIX_CLASS

	FixStyle(INTEL,FixIntel)

	#else

	#ifndef LMP_FIX_INTEL_H
	#define LMP_FIX_INTEL_H

	#include "fix.h"
	#include "intel_buffers.h"
	#include "force.h"
	#include "pair.h"
	#include "error.h"
	#include "update.h"

	namespace LAMMPS_NS {

	class IntelData;
	template <class flt_t, class acc_t> class IntelBuffers;

	class FixIntel : public Fix {
	public:
	FixIntel(class LAMMPS , int, char *);
	virtual ~FixIntel();
	virtual int setmask();
	virtual void init();
	virtual void setup(int);
	void pair_init_check(const bool cdmessage=false);
	void bond_init_check();
	void kspace_init_check();

	void pre_reverse(int eflag = 0, int vflag = 0);

	// Get all forces, calculation results from coprocesser
	void sync_coprocessor();

	double memory_usage();

	typedef struct { double x,y,z; } lmp_ft;

	enum {PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE};

	inline int precision() { return _precision_mode; }
	inline IntelBuffers<float,float> * get_single_buffers()
	{ return _single_buffers; }
	inline IntelBuffers<float,double> * get_mixed_buffers()
	{ return _mixed_buffers; }
	inline IntelBuffers<double,double> * get_double_buffers()
	{ return _double_buffers; }

	inline int nbor_pack_width() const { return _nbor_pack_width; }
	inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }

	inline int need_zero(const int tid) {
	if (_need_reduce == 0 && tid > 0) return 1;
	return 0;
	}
	inline void set_reduce_flag() { _need_reduce = 1; }
	inline int lrt() {
	if (force->kspace_match("pppm/intel", 0)) return _lrt;
	else return 0;
	}

	protected:
	IntelBuffers<float,float> *_single_buffers;
	IntelBuffers<float,double> *_mixed_buffers;
	IntelBuffers<double,double> *_double_buffers;

	int _precision_mode, _nthreads, _nbor_pack_width;

	public:
	inline int* get_overflow_flag() { return _overflow_flag; }
	inline int* get_off_overflow_flag() { return _off_overflow_flag; }
	inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
	double *ev_in, const int offload,
	const int eatom = 0, const int vatom = 0,
	const int rflag = 0);
	inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
	double *ev_in, const int offload,
	const int eatom = 0, const int vatom = 0,
	const int rflag = 0);
	inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
	float *ev_in, const int offload,
	const int eatom = 0, const int vatom = 0,
	const int rflag = 0);
	inline void get_buffern(const int offload, int &nlocal, int &nall,
	int &minlocal);

	#ifdef _LMP_INTEL_OFFLOAD
	void post_force(int vflag);
	inline int coprocessor_number() { return _cop; }
	inline int full_host_list() { return _full_host_list; }
	void set_offload_affinity();
	inline double offload_balance() { return _offload_balance; }
	inline int offload_end_neighbor();
	inline int offload_end_pair();
	inline int host_start_neighbor()
	{ if (_offload_noghost) return 0; else return offload_end_neighbor(); }
	inline int host_start_pair()
	{ if (_offload_noghost) return 0; else return offload_end_pair(); }
	inline int offload_nlocal() { return _offload_nlocal; }
	inline int offload_nall() { return _offload_nall; }
	inline int offload_min_ghost() { return _offload_min_ghost; }
	inline int host_min_local() { return _host_min_local; }
	inline int host_min_ghost() { return _host_min_ghost; }
	inline int host_used_local() { return _host_used_local; }
	inline int host_used_ghost() { return _host_used_ghost; }
	inline int host_nall() { return _host_nall; }
	inline int separate_buffers() { return _separate_buffers; }
	inline int offload_noghost() { return _offload_noghost; }
	inline void set_offload_noghost(const int v)
	{ if (_offload_ghost < 0) _offload_noghost = v; }
	inline void set_neighbor_host_sizes();

	inline void zero_timers()
	{ memset(_timers, 0, sizeof(double) * NUM_ITIMERS); }
	inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); }
	inline double stop_watch(const int which);
	inline double * off_watch_pair() { return _stopwatch_offload_pair; }
	inline double * off_watch_neighbor() { return _stopwatch_offload_neighbor; }
	inline void balance_stamp();
	inline void acc_timers();
	#else
	inline int offload_end_neighbor() { return 0; }
	inline int offload_end_pair() { return 0; }
	inline int host_start_neighbor() { return 0; }
	inline int host_start_pair() { return 0; }
	inline void zero_timers() {}
	inline void start_watch(const int which) {}
	inline double stop_watch(const int which) { return 0.0; }
	double * off_watch_pair() { return NULL; }
	double * off_watch_neighbor() { return NULL; }
	inline void balance_stamp() {}
	inline void acc_timers() {}
	inline int separate_buffers() { return 0; }
	#endif

	protected:
	int _overflow_flag[5];
	_alignvar(int _off_overflow_flag[5],64);
	int _allow_separate_buffers, _offload_ghost, _lrt;

	IntelBuffers<float,float>::vec3_acc_t *_force_array_s;
	IntelBuffers<float,double>::vec3_acc_t *_force_array_m;
	IntelBuffers<double,double>::vec3_acc_t *_force_array_d;
	float *_ev_array_s;
	double *_ev_array_d;
	int _results_eatom, _results_vatom;
	int _need_reduce;

	#ifdef _LMP_INTEL_OFFLOAD
	double _balance_pair_time, _balance_other_time;
	int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
	int _host_min_local, _host_min_ghost, _host_nall;
	int _host_used_local, _host_used_ghost, _sync_mode;
	int _separate_buffers, _offload_noghost, _separate_coi;
	bool _setup_time_cleared, _timers_allocated;
	void output_timing_data();
	FILE *_tscreen;

	IntelBuffers<float,float>::vec3_acc_t *_off_force_array_s;
	IntelBuffers<float,double>::vec3_acc_t *_off_force_array_m;
	IntelBuffers<double,double>::vec3_acc_t *_off_force_array_d;
	float *_off_ev_array_s;
	double *_off_ev_array_d;
	int _off_results_eatom, _off_results_vatom;
	int _full_host_list, _cop, _ncops;

	int get_ppn(int &);
	int set_host_affinity(const int);
	#endif
	void check_neighbor_intel();

	double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
	double _timers[NUM_ITIMERS];
	double _stopwatch[NUM_ITIMERS];
	_alignvar(double _stopwatch_offload_neighbor[1],64);
	_alignvar(double _stopwatch_offload_pair[1],64);

	template <class ft>
	void reduce_results(ft * _noalias const f_in);

	template <class ft, class acc_t>
	inline void add_results(const ft * _noalias const f_in,
	const acc_t * _noalias const ev_global,
	const int eatom, const int vatom,
	const int offload);

	template <class ft, class acc_t>
	inline void add_oresults(const ft * _noalias const f_in,
	const acc_t * _noalias const ev_global,
	const int eatom, const int vatom,
	const int out_offset, const int nall);

	int _offload_affinity_balanced, _offload_threads, _offload_tpc;
	#ifdef _LMP_INTEL_OFFLOAD
	int _max_offload_threads, _offload_cores, _offload_affinity_set;
	int _im_real_space_task;
	MPI_Comm _real_space_comm;
	template <class ft, class acc_t>
	inline void add_off_results(const ft * _noalias const f_in,
	const acc_t * _noalias const ev_global);
	#endif
	};

	/* ---------------------------------------------------------------------- */

	void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
	int &minlocal) {
	#ifdef _LMP_INTEL_OFFLOAD
	if (_separate_buffers) {
	if (offload) {
	if (neighbor->ago != 0) {
	nlocal = _offload_nlocal;
	nall = _offload_nall;
	} else {
	nlocal = atom->nlocal;
	nall = nlocal + atom->nghost;
	}
	minlocal = 0;
	} else {
	nlocal = atom->nlocal;
	nall = _host_nall;
	minlocal = _host_min_local;
	}
	return;
	}
	if (_offload_noghost && offload)
	nall = atom->nlocal;
	else
	#endif
	nall = atom->nlocal + atom->nghost;
	nlocal = atom->nlocal;
	minlocal = 0;
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
	double *ev_in, const int offload,
	const int eatom, const int vatom,
	const int rflag) {
	#ifdef _LMP_INTEL_OFFLOAD
	if (offload) {
	_off_results_eatom = eatom;
	_off_results_vatom = vatom;
	_off_force_array_d = f_in;
	_off_ev_array_d = ev_in;
	return;
	}
	#endif

	_force_array_d = f_in;
	_ev_array_d = ev_in;
	_results_eatom = eatom;
	_results_vatom = vatom;
	#ifndef _LMP_INTEL_OFFLOAD
	if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
	#endif

	if (_overflow_flag[LMP_OVERFLOW])
	error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
	double *ev_in, const int offload,
	const int eatom, const int vatom,
	const int rflag) {
	#ifdef _LMP_INTEL_OFFLOAD
	if (offload) {
	_off_results_eatom = eatom;
	_off_results_vatom = vatom;
	_off_force_array_m = f_in;
	_off_ev_array_d = ev_in;
	return;
	}
	#endif

	_force_array_m = f_in;
	_ev_array_d = ev_in;
	_results_eatom = eatom;
	_results_vatom = vatom;
	#ifndef _LMP_INTEL_OFFLOAD
	if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
	#endif

	if (_overflow_flag[LMP_OVERFLOW])
	error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
	float *ev_in, const int offload,
	const int eatom, const int vatom,
	const int rflag) {
	#ifdef _LMP_INTEL_OFFLOAD
	if (offload) {
	_off_results_eatom = eatom;
	_off_results_vatom = vatom;
	_off_force_array_s = f_in;
	_off_ev_array_s = ev_in;
	return;
	}
	#endif

	_force_array_s = f_in;
	_ev_array_s = ev_in;
	_results_eatom = eatom;
	_results_vatom = vatom;
	#ifndef _LMP_INTEL_OFFLOAD
	if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
	#endif

	if (_overflow_flag[LMP_OVERFLOW])
	error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
	}

	/* ---------------------------------------------------------------------- */

	#ifdef _LMP_INTEL_OFFLOAD

	/* ---------------------------------------------------------------------- */

	int FixIntel::offload_end_neighbor() {
	if (_offload_balance < 0.0) {
	if (atom->nlocal < 2)
	error->one(FLERR,"Too few atoms for load balancing offload");
	double granularity = 1.0 / atom->nlocal;
	if (_balance_neighbor < granularity)
	_balance_neighbor = granularity + 1e-10;
	else if (_balance_neighbor > 1.0 - granularity)
	_balance_neighbor = 1.0 - granularity + 1e-10;
	}
	return _balance_neighbor * atom->nlocal;
	}

	int FixIntel::offload_end_pair() {
	if (neighbor->ago == 0) return _balance_neighbor * atom->nlocal;
	else return _balance_pair * atom->nlocal;
	}

	/* ---------------------------------------------------------------------- */

	double FixIntel::stop_watch(const int which) {
	double elapsed = MPI_Wtime() - _stopwatch[which];
	_timers[which] += elapsed;
	return elapsed;
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::balance_stamp() {
	if (_offload_balance < 0.0) {
	double ct = MPI_Wtime();
	_balance_other_time = ct;
	_balance_pair_time = ct - _stopwatch[TIME_HOST_PAIR];
	}
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::acc_timers() {
	_timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
	if (neighbor->ago == 0) {
	_timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
	if (_setup_time_cleared == false) {
	zero_timers();
	_setup_time_cleared = true;
	}
	}
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::set_neighbor_host_sizes() {
	_host_min_local = _overflow_flag[LMP_LOCAL_MIN];
	_host_min_ghost = _overflow_flag[LMP_GHOST_MIN];
	_host_used_local = atom->nlocal - _host_min_local;
	_host_used_ghost = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost;
	if (_host_used_ghost < 0) _host_used_ghost = 0;
	_host_nall = atom->nlocal + _host_used_ghost;
	}

	/* ---------------------------------------------------------------------- */

	#endif

	}

	#endif
	#endif

	/* ERROR/WARNING messages:

	E: The 'package intel' command is required for /intel styles

	Self-explanatory.

	W: Could not set host affinity for offload tasks

	When using offload to a coprocessor, the application will try to set affinity
	for host MPI tasks and OpenMP threads and will generate a warning if unable
	to do so successfully. In the unsuccessful case, you might wish to set
	affinity outside of the application and performance might suffer if
	hyperthreading is disable on the CPU.

	E: Neighbor list overflow, boost neigh_modify one

	Increase the value for neigh_modify one to allow for larger allocations for
	neighbor list builds. The value required can be different for the Intel
	package in order to support offload to a coprocessor.

	E: Bad matrix inversion in mldivide3

	This error should not occur unless the matrix is badly formed.

	E: Illegal package intel command

	The format for the package intel command is incorrect. Please see the
	documentation.

	E: fix intel has to operate on group 'all'

	Self explanatory.

	E: Illegal package intel mode requested

	The format for the package intel command is incorrect. Please see the
	documentation.

	E: Currently, neighbor style BIN must be used with Intel package.

	This is the only neighbor style that has been implemented for the Intel
	package.

	E: Currently, cannot use neigh_modify exclude with Intel package.

	This is a current restriction of the Intel package.

	W: Unknown Intel Compiler Version

	The compiler version used to build LAMMPS has not been tested with
	offload to a coprocessor.

	W: Unsupported Intel Compiler

	The compiler version used to build LAMMPS is not supported when using
	offload to a coprocessor. There could be performance or correctness
	issues. Please use 14.0.1.106 or 15.1.133 or later.

	E: Currently, cannot use more than one intel style with hybrid.

	Currently, hybrid pair styles can only use the intel suffix for one of the
	pair styles.

	E: Cannot yet use hybrid styles with Intel package.

	The hybrid pair style configuration is not yet supported by the Intel
	package. Support is limited to hybrid/overlay or a hybrid style that does
	not require a skip list.

	W: Leaving a core/node free can improve performance for offload

	When each CPU is fully subscribed with MPI tasks and OpenMP threads,
	context switching with threads used for offload can sometimes decrease
	performance. If you see this warning, try using fewer MPI tasks/OpenMP threads
	per node to leave a physical CPU core free on each node.

	E: MPI tasks per node must be multiple of offload_cards

	For offload to multiple coprocessors on a single node, the Intel package
	requires that each coprocessor is used by the same number of MPI tasks.

	W: More MPI tasks/OpenMP threads than available cores

	Using more MPI tasks/OpenMP threads than available cores will typically
	decrease performance.

	E: USER-INTEL package requires same setting for newton bond and non-bond.

	The newton setting must be the same for both pairwise and bonded forces.

	E: Intel styles for bond/angle/dihedral/improper require intel pair style."

	You cannot use the USER-INTEL package for bond calculations without a
	USER-INTEL supported pair style.

	E: Intel styles for kspace require intel pair style.

	You cannot use the USER-INTEL package for kspace calculations without a
	USER-INTEL supported pair style.

	E: Cannot currently get per-atom virials with intel package.

	The Intel package does not yet support per-atom virial calculation.

	E: Too few atoms for load balancing offload.

	When using offload to a coprocessor, each MPI task must have at least 2
	atoms throughout the simulation.

	*/

fix_intel.hNo OneTemporaryActions

File Metadata

fix_intel.hView Options

Event Timeline

fix_intel.h
No OneTemporary
Actions

fix_intel.h
View Options