fix_intel.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, Jul 19, 22:15

fix_intel.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	Contributing author: W. Michael Brown (Intel)
	Anupama Kurpad (Intel) - Host Affinitization
	------------------------------------------------------------------------- */

	#include "comm.h"
	#include "error.h"
	#include "force.h"
	#include "neighbor.h"
	#include "neigh_request.h"
	#include "pair.h"
	#include "pair_hybrid.h"
	#include "pair_hybrid_overlay.h"
	#include "timer.h"
	#include "universe.h"
	#include "update.h"
	#include "fix_intel.h"

	#include <string.h>
	#include <stdlib.h>
	#include <stdio.h>

	#ifdef _LMP_INTEL_OFFLOAD
	#ifndef INTEL_OFFLOAD_NOAFFINITY
	#include <unistd.h>
	#endif
	#endif

	#include "suffix.h"

	using namespace LAMMPS_NS;
	using namespace FixConst;

	#ifdef __INTEL_OFFLOAD
	#ifndef _LMP_INTEL_OFFLOAD
	#warning "Not building Intel package with Xeon Phi offload support."
	#endif
	#endif

	// Neighbor-list build styles. Only BIN is referenced in this file: setup()
	// rejects any neighbor->style other than BIN.
	// NOTE(review): presumably this must match the ordering used by the
	// Neighbor class itself -- confirm against neighbor.cpp.
	enum{NSQ,BIN,MULTI};

	/* ---------------------------------------------------------------------- */

	/* ----------------------------------------------------------------------
	   Construct the fix from the arguments of the "package intel" command.

	   arg[3] is the number of coprocessors (ncops); the remaining arguments
	   are optional keywords:
	     omp N, mode single|mixed|double, balance X, ghost yes|no, tpc N,
	     tptask N, no_affinity, lrt yes|no
	   plus the undocumented offload_affinity_balanced and buffers N.

	   Any malformed argument aborts via error->all(). With ncops < 1 the
	   offload balance is forced to 0.0 (host only). In offload builds the
	   coprocessor is queried for its core/thread counts and, unless
	   no_affinity was given, host affinity is set. Finally one IntelBuffers
	   object is allocated for the selected precision mode.
	------------------------------------------------------------------------- */
	FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
	{
	  if (narg < 4) error->all(FLERR,"Illegal package intel command");

	  int ncops = force->inumeric(FLERR,arg[3]);

	  _nbor_pack_width = 1;
	  _three_body_neighbor = 0;

	  _precision_mode = PREC_MODE_MIXED;
	  _offload_balance = -1.0;
	  _overflow_flag[LMP_OVERFLOW] = 0;
	  _off_overflow_flag[LMP_OVERFLOW] = 0;

	  _offload_affinity_balanced = 0;
	  _offload_threads = 0;
	  _offload_tpc = 4;

	  _force_array_s = 0;
	  _force_array_m = 0;
	  _force_array_d = 0;
	  _ev_array_s = 0;
	  _ev_array_d = 0;

	#ifdef _LMP_INTEL_OFFLOAD
	  if (ncops < 0) error->all(FLERR,"Illegal package intel command");
	  _offload_affinity_set = 0;
	  _off_force_array_s = 0;
	  _off_force_array_m = 0;
	  _off_force_array_d = 0;
	  _off_ev_array_s = 0;
	  _off_ev_array_d = 0;
	  _balance_fixed = 0.0;
	  _cop = 0;
	#endif

	  // optional keywords

	  int nomp = 0, no_affinity = 0;
	  _allow_separate_buffers = 1;
	  _offload_ghost = -1;
	  _lrt = 0;

	  int iarg = 4;
	  while (iarg < narg) {
	    if (strcmp(arg[iarg],"omp") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      nomp = force->inumeric(FLERR,arg[iarg+1]);
	      iarg += 2;
	    } else if (strcmp(arg[iarg],"mode") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      if (strcmp(arg[iarg+1],"single") == 0)
	        _precision_mode = PREC_MODE_SINGLE;
	      else if (strcmp(arg[iarg+1],"mixed") == 0)
	        _precision_mode = PREC_MODE_MIXED;
	      else if (strcmp(arg[iarg+1],"double") == 0)
	        _precision_mode = PREC_MODE_DOUBLE;
	      else error->all(FLERR,"Illegal package intel command");
	      iarg += 2;
	    } else if (strcmp(arg[iarg],"balance") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      _offload_balance = force->numeric(FLERR,arg[iarg+1]);
	      iarg += 2;
	    } else if (strcmp(arg[iarg], "ghost") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      if (strcmp(arg[iarg+1],"yes") == 0) _offload_ghost = 1;
	      else if (strcmp(arg[iarg+1],"no") == 0) _offload_ghost = 0;
	      else error->all(FLERR,"Illegal package intel command");
	      iarg += 2;
	    } else if (strcmp(arg[iarg], "tpc") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      _offload_tpc = atoi(arg[iarg+1]);
	      iarg += 2;
	    } else if (strcmp(arg[iarg],"tptask") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      _offload_threads = atoi(arg[iarg+1]);
	      iarg += 2;
	    } else if (strcmp(arg[iarg],"no_affinity") == 0) {
	      no_affinity = 1;
	      iarg++;
	    } else if (strcmp(arg[iarg], "lrt") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      if (strcmp(arg[iarg+1],"yes") == 0) _lrt = 1;
	      else if (strcmp(arg[iarg+1],"no") == 0) _lrt = 0;
	      else error->all(FLERR,"Illegal package intel command");
	      iarg += 2;
	    }

	    // undocumented options

	    else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
	      _offload_affinity_balanced = 1;
	      iarg++;
	    } else if (strcmp(arg[iarg],"buffers") == 0) {
	      if (iarg+2 > narg) error->all(FLERR,"Illegal package intel command");
	      _allow_separate_buffers = atoi(arg[iarg+1]);
	      iarg += 2;
	    } else error->all(FLERR,"Illegal package intel command");
	  }

	  // if ncops is zero, just run on the cpu
	  if (ncops < 1) {
	    ncops = -1;
	    _offload_balance = 0.0;
	  }

	  // if using LRT mode, create the integrate style
	  if (_lrt) {
	    char *str;
	    str = (char *) "verlet/lrt/intel";
	    update->create_integrate(1,&str,0);
	  }

	  // error check

	  if (_offload_balance > 1.0 || _offload_threads < 0 ||
	      _offload_tpc <= 0 || _offload_tpc > 4 || nomp < 0)
	    error->all(FLERR,"Illegal package intel command");

	#ifdef _LMP_INTEL_OFFLOAD
	  _ncops = ncops;
	  if (_offload_balance != 0.0) {
	    _real_space_comm = MPI_COMM_WORLD;
	    if (no_affinity == 0)
	      if (set_host_affinity(nomp) != 0)
	        error->all(FLERR,"Could not set host affinity for offload tasks");
	  }

	  // ask the coprocessor how many cores/threads it exposes
	  int max_offload_threads = 0, offload_cores = 0;
	  if (_offload_balance != 0.0) {
	    #pragma offload target(mic:_cop) mandatory \
	      out(max_offload_threads,offload_cores)
	    {
	      offload_cores = omp_get_num_procs();
	      omp_set_num_threads(offload_cores);
	      max_offload_threads = omp_get_max_threads();
	      #ifdef __AVX512F__
	      if ( (offload_cores / 4) % 2 == 1) {
	        offload_cores += 4;
	        max_offload_threads += 4;
	      }
	      #endif
	    }
	    _max_offload_threads = max_offload_threads;
	    _offload_cores = offload_cores;
	    if (_offload_threads == 0) _offload_threads = offload_cores;
	    if (_offload_cores > 244 && _offload_tpc > 2)
	      _offload_tpc = 2;
	  }
	#endif

	  // set OpenMP threads
	  // nomp is user setting, default = 0

	#if defined(_OPENMP)
	#if defined(__INTEL_COMPILER)
	  kmp_set_blocktime(0);
	#endif
	  if (nomp != 0) {
	    omp_set_num_threads(nomp);
	    comm->nthreads = nomp;
	  } else {
	    // no user setting: adopt the team size OpenMP uses by default
	    int nthreads;
	    #pragma omp parallel default(none) shared(nthreads)
	    nthreads = omp_get_num_threads();
	    comm->nthreads = nthreads;
	  }
	#endif

	  // set offload params

	#ifdef _LMP_INTEL_OFFLOAD
	  // a negative balance means it was not fixed by the user; start at 0.9
	  if (_offload_balance < 0.0) {
	    _balance_neighbor = 0.9;
	    _balance_pair = 0.9;
	  } else {
	    _balance_neighbor = _offload_balance;
	    _balance_pair = _offload_balance;
	  }

	  _tscreen = screen;
	  zero_timers();
	  _setup_time_cleared = false;
	  _timers_allocated = false;

	#else
	  _offload_balance = 0.0;
	#endif

	  // set precision

	  if (_precision_mode == PREC_MODE_SINGLE)
	    _single_buffers = new IntelBuffers<float,float>(lmp);
	  else if (_precision_mode == PREC_MODE_MIXED)
	    _mixed_buffers = new IntelBuffers<float,double>(lmp);
	  else
	    _double_buffers = new IntelBuffers<double,double>(lmp);
	}

	/* ---------------------------------------------------------------------- */

	// Destructor. In offload builds it first prints any accumulated timing
	// data and, if the device-side timer/overflow storage was set up
	// (_timers_allocated), asks coprocessor _cop to release it. It then
	// deletes the buffer object belonging to the active precision mode.
	FixIntel::~FixIntel()
	{
	#ifdef _LMP_INTEL_OFFLOAD
	output_timing_data();
	if (_timers_allocated) {
	double *time1 = off_watch_pair();
	double *time2 = off_watch_neighbor();
	int *overflow = get_off_overflow_flag();
	if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
	overflow != NULL) {
	// nocopy with alloc_if(0) free_if(1): nothing is transferred, the clause
	// only requests that the existing device allocations be freed.
	#pragma offload_transfer target(mic:_cop) \
	nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
	}
	}
	#endif

	// The constructor allocated exactly one of these, chosen by
	// _precision_mode, so the same test selects the one to delete.
	if (_precision_mode == PREC_MODE_SINGLE)
	delete _single_buffers;
	else if (_precision_mode == PREC_MODE_MIXED)
	delete _mixed_buffers;
	else
	delete _double_buffers;
	}

	/* ---------------------------------------------------------------------- */

	// Return the bitmask of fix hooks this fix uses: pre_reverse (run and
	// minimize) always, plus post_force (run and minimize) in offload builds.
	int FixIntel::setmask()
	{
	  int mask = 0;
	  mask |= PRE_REVERSE;
	  mask |= MIN_PRE_REVERSE;
	#ifdef _LMP_INTEL_OFFLOAD
	  mask |= POST_FORCE;
	  mask |= MIN_POST_FORCE;
	#endif
	  return mask;
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::init()
	{
	#ifdef _LMP_INTEL_OFFLOAD
	output_timing_data();
	_sync_mode = 0;
	if (offload_balance() != 0.0) {
	if (offload_noghost() \|\| force->newton_pair == 0)
	_sync_mode = 2;
	else
	_sync_mode = 1;
	if (update->whichflag == 2) _sync_mode = 1;
	}
	#endif

	int nstyles = 0;
	if (force->pair_match("hybrid", 1) != NULL) {
	PairHybrid hybrid = (PairHybrid ) force->pair;
	for (int i = 0; i < hybrid->nstyles; i++)
	if (strstr(hybrid->keywords[i], "/intel") != NULL)
	nstyles++;
	} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
	PairHybridOverlay hybrid = (PairHybridOverlay ) force->pair;
	for (int i = 0; i < hybrid->nstyles; i++)
	if (strstr(hybrid->keywords[i], "/intel") != NULL)
	nstyles++;
	else
	force->pair->no_virial_fdotr_compute = 1;
	}
	if (nstyles > 1)
	error->all(FLERR,
	"Currently, cannot use more than one intel style with hybrid.");

	check_neighbor_intel();
	int off_mode = 0;
	if (_offload_balance != 0.0) off_mode = 1;
	if (_precision_mode == PREC_MODE_SINGLE) {
	_single_buffers->zero_ev();
	_single_buffers->grow_ncache(off_mode,_nthreads);
	} else if (_precision_mode == PREC_MODE_MIXED) {
	_mixed_buffers->zero_ev();
	_mixed_buffers->grow_ncache(off_mode,_nthreads);
	} else {
	_double_buffers->zero_ev();
	_double_buffers->grow_ncache(off_mode,_nthreads);
	}

	_need_reduce = 0;
	}

	/* ---------------------------------------------------------------------- */

	// Pre-run sanity checks: the Intel package needs binned neighbor lists,
	// no neigh_modify exclusions and no per-atom virial. In offload builds the
	// post_force() hook is then invoked once with the given vflag.
	void FixIntel::setup(int vflag)
	{
	  const bool binned = (neighbor->style == BIN);
	  if (!binned) {
	    error->all(FLERR,
	               "Currently, neighbor style BIN must be used with Intel package.");
	  }

	  const int excluding = neighbor->exclude_setting();
	  if (excluding != 0) {
	    error->all(FLERR,
	               "Currently, cannot use neigh_modify exclude with Intel package.");
	  }

	  if (vflag_atom) {
	    error->all(FLERR,
	               "Cannot currently get per-atom virials with Intel package.");
	  }

	#ifdef _LMP_INTEL_OFFLOAD
	  post_force(vflag);
	#endif
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::setup_pre_reverse(int eflag, int vflag)
	{
	pre_reverse(eflag,vflag);
	}

	/* ---------------------------------------------------------------------- */

	// Initialization performed on behalf of an Intel pair style.
	// NOTE(review): presumably called from each /intel pair style's init --
	// the callers are not visible in this file.
	//
	// Resets _nbor_pack_width, configures offload state (offload builds),
	// records the OpenMP thread count, warns about unknown/unsupported
	// compilers when offloading, tells the active precision buffers whether
	// atom tags are needed, and prints a configuration banner on rank 0.
	//
	// cdmessage: when true, the banner also reports whether AVX512 CD
	//            optimizations were compiled in (LMP_USE_AVXCD).
	void FixIntel::pair_init_check(const bool cdmessage)
	{
	#ifdef INTEL_VMASK
	atom->sortfreq = 1;
	#endif

	_nbor_pack_width = 1;

	#ifdef _LMP_INTEL_OFFLOAD
	if (_offload_balance != 0.0) atom->sortfreq = 1;

	// Ghost atoms are kept off the coprocessor only when newton_pair is on
	// and the user explicitly asked for "ghost no".
	_offload_noghost = 0;
	if (force->newton_pair && _offload_ghost == 0)
	_offload_noghost = 1;

	set_offload_affinity();

	// One-time allocation of the timer and overflow-flag storage on the
	// coprocessor; the destructor frees it when _timers_allocated is set.
	if (!_timers_allocated) {
	double *time1 = off_watch_pair();
	double *time2 = off_watch_neighbor();
	int *overflow = get_off_overflow_flag();
	if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
	overflow != NULL) {
	#pragma offload_transfer target(mic:_cop) \
	nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
	in(overflow:length(5) alloc_if(1) free_if(0))
	}
	_timers_allocated = true;
	}
	#endif
	_nthreads = comm->nthreads;

	// Compiler sanity warnings, printed by rank 0 only when offloading.
	if (_offload_balance != 0.0 && comm->me == 0) {
	#ifndef __INTEL_COMPILER_BUILD_DATE
	error->warning(FLERR, "Unknown Intel Compiler Version\n");
	#else
	if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
	__INTEL_COMPILER_BUILD_DATE < 20141023)
	error->warning(FLERR, "Unsupported Intel Compiler.");
	#endif
	#if !defined(__INTEL_COMPILER)
	error->warning(FLERR, "Unsupported Intel Compiler.");
	#endif
	}

	// Tags are requested from the buffers for molecular systems.
	int need_tag = 0;
	if (atom->molecular) need_tag = 1;

	// Clear buffers used for pair style
	// kmode holds the precision name printed in the banner below.
	char kmode[80];
	if (_precision_mode == PREC_MODE_SINGLE) {
	strcpy(kmode, "single");
	get_single_buffers()->need_tag(need_tag);
	} else if (_precision_mode == PREC_MODE_MIXED) {
	strcpy(kmode, "mixed");
	get_mixed_buffers()->need_tag(need_tag);
	} else {
	strcpy(kmode, "double");
	get_double_buffers()->need_tag(need_tag);
	}

	#ifdef _LMP_INTEL_OFFLOAD
	// Second call: the affinity itself is set only once (guarded by
	// _offload_affinity_set inside), but _separate_buffers and the
	// communicator are re-evaluated.
	set_offload_affinity();
	#endif

	// Configuration banner (rank 0, only when a screen stream exists).
	if (comm->me == 0) {
	if (screen) {
	fprintf(screen,
	"----------------------------------------------------------\n");
	if (_offload_balance != 0.0) {
	fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
	_offload_tpc);
	fprintf(screen,"%d threads per task\n",_offload_threads);
	} else {
	fprintf(screen,"Using Intel Package without Coprocessor.\n");
	}
	fprintf(screen,"Precision: %s\n",kmode);
	if (cdmessage) {
	#ifdef LMP_USE_AVXCD
	fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
	#else
	fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
	#endif
	}
	fprintf(screen,
	"----------------------------------------------------------\n");
	}
	}
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::bond_init_check()
	{
	if (_offload_balance != 0.0 && atom->molecular &&
	force->newton_pair != force->newton_bond)
	error->all(FLERR,
	"USER-INTEL package requires same setting for newton bond and non-bond.");

	int intel_pair = 0;
	if (force->pair_match("/intel", 0) != NULL)
	intel_pair = 1;
	else if (force->pair_match("hybrid", 1) != NULL) {
	PairHybrid hybrid = (PairHybrid ) force->pair;
	for (int i = 0; i < hybrid->nstyles; i++)
	if (strstr(hybrid->keywords[i], "/intel") != NULL)
	intel_pair = 1;
	} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
	PairHybridOverlay hybrid = (PairHybridOverlay ) force->pair;
	for (int i = 0; i < hybrid->nstyles; i++)
	if (strstr(hybrid->keywords[i], "/intel") != NULL)
	intel_pair = 1;
	}

	if (intel_pair == 0)
	error->all(FLERR, "Intel styles for bond/angle/dihedral/improper "
	"require intel pair style.");
	}

	/* ---------------------------------------------------------------------- */

	void FixIntel::kspace_init_check()
	{
	int intel_pair = 0;
	if (force->pair_match("/intel", 0) != NULL)
	intel_pair = 1;
	else if (force->pair_match("hybrid", 1) != NULL) {
	PairHybrid hybrid = (PairHybrid ) force->pair;
	for (int i = 0; i < hybrid->nstyles; i++)
	if (strstr(hybrid->keywords[i], "/intel") != NULL)
	intel_pair = 1;
	} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
	PairHybridOverlay hybrid = (PairHybridOverlay ) force->pair;
	for (int i = 0; i < hybrid->nstyles; i++)
	if (strstr(hybrid->keywords[i], "/intel") != NULL)
	intel_pair = 1;
	}

	if (intel_pair == 0)
	error->all(FLERR, "Intel styles for kspace require intel pair style.");
	}

	/* ---------------------------------------------------------------------- */

	// Walk every neighbor-list request. Any request with the skip flag set is
	// a fatal error. In offload builds, a non-intel request while offloading
	// forces a full host list and re-enables ghost offload.
	void FixIntel::check_neighbor_intel()
	{
	#ifdef _LMP_INTEL_OFFLOAD
	  _full_host_list = 0;
	  const bool offloading = (_offload_balance != 0.0);
	#endif

	  const int nrequest = neighbor->nrequest;
	  int ireq = 0;
	  while (ireq < nrequest) {
	#ifdef _LMP_INTEL_OFFLOAD
	    if (offloading && neighbor->requests[ireq]->intel == 0) {
	      _full_host_list = 1;
	      _offload_noghost = 0;
	    }
	#endif
	    if (neighbor->requests[ireq]->skip) {
	      error->all(FLERR, "Cannot yet use hybrid styles with Intel package.");
	    }
	    ++ireq;
	  }
	}

	/* ---------------------------------------------------------------------- */

	// Fold host-side results held in the Intel force buffers into the
	// regular LAMMPS arrays (via add_results) before reverse communication.
	//
	// At most one pending buffer is processed per call, tested in the order
	// mixed, double, single. If _need_reduce is set, the per-thread copies
	// are first summed by reduce_results(). The buffer pointer is cleared
	// afterwards so the same data is not added twice.
	// In offload builds with _sync_mode == 1 the coprocessor results are
	// collected here as well. eflag and vflag are not used.
	void FixIntel::pre_reverse(int eflag, int vflag)
	{
	if (_force_array_m != 0) {
	if (_need_reduce) {
	reduce_results(&_force_array_m[0].x);
	_need_reduce = 0;
	}
	add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
	_force_array_m = 0;
	} else if (_force_array_d != 0) {
	if (_need_reduce) {
	reduce_results(&_force_array_d[0].x);
	_need_reduce = 0;
	}
	add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
	_force_array_d = 0;
	} else if (_force_array_s != 0) {
	if (_need_reduce) {
	reduce_results(&_force_array_s[0].x);
	_need_reduce = 0;
	}
	add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
	_force_array_s = 0;
	}

	#ifdef _LMP_INTEL_OFFLOAD
	if (_sync_mode == 1) sync_coprocessor();
	#endif
	}

	/* ---------------------------------------------------------------------- */

	// Sum the per-thread copies of the force buffer into the first copy.
	//
	// f_scalar points at the first scalar of the first copy; successive
	// copies start f_stride*4 scalars apart, with f_stride obtained from
	// IP_PRE_get_stride. The number of scalars summed is 4 per atom over
	// nlocal (+ nghost when newton_pair is on).
	template <class acc_t>
	void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
	{
	int o_range, f_stride;
	if (force->newton_pair)
	o_range = atom->nlocal + atom->nghost;
	else
	o_range = atom->nlocal;
	IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);

	// Convert the atom count and stride into scalar counts.
	o_range *= 4;
	const int f_stride4 = f_stride * 4;

	// Few threads: a single serial pass with the copies summed explicitly.
	if (_nthreads <= INTEL_HTHREADS) {
	acc_t *f_scalar2 = f_scalar + f_stride4;
	if (_nthreads == 4) {
	acc_t *f_scalar3 = f_scalar2 + f_stride4;
	acc_t *f_scalar4 = f_scalar3 + f_stride4;
	_use_simd_pragma("vector aligned")
	_use_simd_pragma("simd")
	for (int n = 0; n < o_range; n++)
	f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
	} else if (_nthreads == 2) {
	_use_simd_pragma("vector aligned")
	_use_simd_pragma("simd")
	for (int n = 0; n < o_range; n++)
	f_scalar[n] += f_scalar2[n];
	} else {
	// Every other thread count in this branch is handled as three copies.
	// NOTE(review): with _nthreads == 1 this would read copies 2 and 3;
	// presumably _need_reduce is never set in that case -- confirm.
	acc_t *f_scalar3 = f_scalar2 + f_stride4;
	_use_simd_pragma("vector aligned")
	_use_simd_pragma("simd")
	for (int n = 0; n < o_range; n++)
	f_scalar[n] += f_scalar2[n] + f_scalar3[n];
	}
	} else {
	// Many threads: each thread takes the index range [iifrom, iito) given
	// by IP_PRE_omp_range_id_align and accumulates that range from every
	// other copy in turn.
	#if defined(_OPENMP)
	#pragma omp parallel
	#endif
	{
	int iifrom, iito, tid;
	IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
	sizeof(acc_t));

	acc_t *f_scalar2 = f_scalar + f_stride4;
	for (int t = 1; t < _nthreads; t++) {
	_use_simd_pragma("vector aligned")
	_use_simd_pragma("simd")
	for (int n = iifrom; n < iito; n++)
	f_scalar[n] += f_scalar2[n];
	f_scalar2 += f_stride4;
	}
	}
	}
	}

	/* ---------------------------------------------------------------------- */

	// Collect pending coprocessor results (offload builds only). The first
	// non-null offload force buffer, tested in the order mixed, double,
	// single, is handed to add_off_results() and then cleared.
	void FixIntel::sync_coprocessor()
	{
	#ifdef _LMP_INTEL_OFFLOAD
	  // nothing was offloaded, so there is nothing to collect
	  if (_offload_balance == 0.0) return;

	  if (_off_force_array_m != 0) {
	    add_off_results(_off_force_array_m, _off_ev_array_d);
	    _off_force_array_m = 0;
	    return;
	  }
	  if (_off_force_array_d != 0) {
	    add_off_results(_off_force_array_d, _off_ev_array_d);
	    _off_force_array_d = 0;
	    return;
	  }
	  if (_off_force_array_s != 0) {
	    add_off_results(_off_force_array_s, _off_ev_array_s);
	    _off_force_array_s = 0;
	  }
	#endif
	}

	/* ---------------------------------------------------------------------- */

	// Add one Intel force buffer into the LAMMPS arrays by selecting the
	// output offset and atom count and delegating to add_oresults(). The
	// whole call is timed under TIME_PACK.
	//
	// f_in      : force (and packed energy/torque) results to add
	// ev_global : global energy/virial accumulators passed to add_oresults()
	// eatom     : non-zero if per-atom energy should be accumulated
	// vatom     : forwarded unchanged to add_oresults()
	// offload   : non-zero if f_in holds coprocessor results, zero for host
	template <class ft, class acc_t>
	void FixIntel::add_results(const ft * _noalias const f_in,
	const acc_t * _noalias const ev_global,
	const int eatom, const int vatom,
	const int offload) {
	start_watch(TIME_PACK);
	int f_length;
	#ifdef _LMP_INTEL_OFFLOAD
	// Separate host/offload buffers: local and ghost results are stored in
	// two packed segments, so they are added with two calls when newton_pair
	// is on. The ghost call passes a null ev_global so the global terms are
	// added only once.
	if (_separate_buffers) {
	if (offload) {
	if (force->newton_pair) {
	add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
	const acc_t * _noalias const enull = 0;
	int offset = _offload_nlocal;
	// With torque each atom occupies two entries in f_in.
	if (atom->torque) offset *= 2;
	add_oresults(f_in + offset, enull, eatom, vatom,
	_offload_min_ghost, _offload_nghost);
	} else
	add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
	} else {
	if (force->newton_pair) {
	add_oresults(f_in, ev_global, eatom, vatom,
	_host_min_local, _host_used_local);
	const acc_t * _noalias const enull = 0;
	int offset = _host_used_local;
	if (atom->torque) offset *= 2;
	add_oresults(f_in + offset, enull, eatom,
	vatom, _host_min_ghost, _host_used_ghost);
	} else {
	int start = host_start_pair();
	add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
	}
	}
	stop_watch(TIME_PACK);
	return;
	}
	// Shared buffer: a single contiguous range [start, start + f_length).
	int start;
	if (offload) {
	start = 0;
	if (force->newton_pair) {
	if (_offload_noghost == 0)
	f_length = atom->nlocal + atom->nghost;
	else
	f_length = atom->nlocal;
	} else
	f_length = offload_end_pair();
	} else {
	if (force->newton_pair) {
	start = 0;
	f_length = atom->nlocal + atom->nghost;
	} else {
	start = host_start_pair();
	f_length = atom->nlocal - start;
	}
	}
	add_oresults(f_in, ev_global, eatom, vatom, start, f_length);
	#else
	// Host-only build: all local atoms, plus ghosts when newton_pair is on.
	if (force->newton_pair)
	f_length = atom->nlocal + atom->nghost;
	else
	f_length = atom->nlocal;
	add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
	#endif
	stop_watch(TIME_PACK);
	}

	/* ---------------------------------------------------------------------- */

	// Accumulate nall entries of f_in into atom->f starting at atom index
	// out_offset; also into atom->torque when torque is present and into the
	// pair style's eatom array when eatom is non-zero. If ev_global is not
	// null, its eight values are added to the pair style's eng_vdwl,
	// eng_coul and virial[0..5].
	//
	// When torque is present f_in holds two entries per atom (force, then
	// torque) and f_in[1].w is read as an error flag before any data is
	// added. vatom is not used in this function.
	template <class ft, class acc_t>
	void FixIntel::add_oresults(const ft * _noalias const f_in,
	const acc_t * _noalias const ev_global,
	const int eatom, const int vatom,
	const int out_offset, const int nall) {
	lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
	if (atom->torque) {
	// The else below pairs with the inner if: flag value 1 reports the
	// mldivide3 failure, any other non-zero value the sphere message.
	if (f_in[1].w)
	if (f_in[1].w == 1)
	error->all(FLERR,"Bad matrix inversion in mldivide3");
	else
	error->all(FLERR,
	"Sphere particles not yet supported for gayberne/intel");
	}

	// Thread the copy only when more than INTEL_HTHREADS threads are in use.
	int packthreads;
	if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads;
	else packthreads = 1;
	#if defined(_OPENMP)
	#pragma omp parallel if(packthreads > 1)
	#endif
	{
	#if defined(_OPENMP)
	const int tid = omp_get_thread_num();
	#else
	const int tid = 0;
	#endif
	// [ifrom, ito) is this thread's share of the nall atoms.
	int ifrom, ito;
	IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t));
	if (atom->torque) {
	// ii indexes f_in, which advances two entries per atom.
	int ii = ifrom * 2;
	lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
	out_offset;
	if (eatom) {
	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma ivdep
	#endif
	for (int i = ifrom; i < ito; i++) {
	f[i].x += f_in[ii].x;
	f[i].y += f_in[ii].y;
	f[i].z += f_in[ii].z;
	// the w component of the force entry carries the per-atom energy
	lmp_eatom[i] += f_in[ii].w;
	tor[i].x += f_in[ii+1].x;
	tor[i].y += f_in[ii+1].y;
	tor[i].z += f_in[ii+1].z;
	ii += 2;
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma ivdep
	#endif
	for (int i = ifrom; i < ito; i++) {
	f[i].x += f_in[ii].x;
	f[i].y += f_in[ii].y;
	f[i].z += f_in[ii].z;
	tor[i].x += f_in[ii+1].x;
	tor[i].y += f_in[ii+1].y;
	tor[i].z += f_in[ii+1].z;
	ii += 2;
	}
	}
	} else {
	// No torque: one f_in entry per atom.
	if (eatom) {
	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma ivdep
	#endif
	for (int i = ifrom; i < ito; i++) {
	f[i].x += f_in[i].x;
	f[i].y += f_in[i].y;
	f[i].z += f_in[i].z;
	lmp_eatom[i] += f_in[i].w;
	}
	} else {
	#if defined(LMP_SIMD_COMPILER)
	#pragma vector aligned
	#pragma ivdep
	#endif
	for (int i = ifrom; i < ito; i++) {
	f[i].x += f_in[i].x;
	f[i].y += f_in[i].y;
	f[i].z += f_in[i].z;
	}
	}
	}
	}

	// Global terms: [0] vdwl energy, [1] coulomb energy, [2..7] virial.
	if (ev_global != NULL) {
	force->pair->eng_vdwl += ev_global[0];
	force->pair->eng_coul += ev_global[1];
	force->pair->virial[0] += ev_global[2];
	force->pair->virial[1] += ev_global[3];
	force->pair->virial[2] += ev_global[4];
	force->pair->virial[3] += ev_global[5];
	force->pair->virial[4] += ev_global[6];
	force->pair->virial[5] += ev_global[7];
	}
	}

	/* ---------------------------------------------------------------------- */

	// Report the memory used by the buffer object of the active precision
	// mode, as computed by that buffer for _nthreads threads.
	double FixIntel::memory_usage()
	{
	  if (_precision_mode == PREC_MODE_SINGLE)
	    return _single_buffers->memory_usage(_nthreads);

	  if (_precision_mode == PREC_MODE_MIXED)
	    return _mixed_buffers->memory_usage(_nthreads);

	  return _double_buffers->memory_usage(_nthreads);
	}

	/* ---------------------------------------------------------------------- */

	#ifdef _LMP_INTEL_OFFLOAD

	/* ---------------------------------------------------------------------- */

	// post_force hook: collect coprocessor results here only in sync mode 2.
	// vflag is not used.
	void FixIntel::post_force(int vflag)
	{
	  if (_sync_mode != 2) return;
	  sync_coprocessor();
	}

	/* ---------------------------------------------------------------------- */

	template <class ft, class acc_t>
	void FixIntel::add_off_results(const ft * _noalias const f_in,
	const acc_t * _noalias const ev_global) {
	if (_offload_balance < 0.0)
	_balance_other_time = MPI_Wtime() - _balance_other_time;

	start_watch(TIME_OFFLOAD_WAIT);
	#ifdef _LMP_INTEL_OFFLOAD
	if (neighbor->ago == 0) {
	#pragma offload_wait target(mic:_cop) wait(atom->tag,f_in)
	} else {
	#pragma offload_wait target(mic:_cop) wait(f_in)
	}
	#endif
	double wait_time = stop_watch(TIME_OFFLOAD_WAIT);

	int nlocal = atom->nlocal;
	if (neighbor->ago == 0) {
	if (_off_overflow_flag[LMP_OVERFLOW])
	error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
	_offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
	_offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
	_offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
	_offload_min_ghost;
	if (_offload_nghost < 0) _offload_nghost = 0;
	_offload_nall = _offload_nlocal + _offload_nghost;
	_offload_nlocal;
	}

	if (atom->torque)
	if (f_in[1].w < 0.0)
	error->all(FLERR, "Bad matrix inversion in mldivide3");
	add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);

	// Load balance?
	if (_offload_balance < 0.0) {
	if (neighbor->ago == 0)
	_balance_pair = _balance_neighbor;
	double mic_time;
	mic_time = *_stopwatch_offload_pair;
	if (_balance_pair_time + _balance_other_time < mic_time) {
	double ft = _balance_pair_time + _balance_other_time + wait_time -
	mic_time;
	_balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
	INTEL_LB_MEAN_WEIGHT * ft;
	}

	double ctps = _balance_pair_time / (1.0-_balance_pair);
	double otps = mic_time / _balance_pair;
	double new_balance = (ctps + _balance_other_time - _balance_fixed) /
	(otps + ctps);
	_balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
	INTEL_LB_MEAN_WEIGHT * new_balance;
	}

	#ifdef TIME_BALANCE
	start_watch(TIME_IMBALANCE);
	MPI_Barrier(_real_space_comm);
	stop_watch(TIME_IMBALANCE);
	#endif
	acc_timers();
	}

	/* ---------------------------------------------------------------------- */

	// Print the averaged offload timing table and reset the timers.
	//
	// Does nothing on ranks that are not real-space tasks or before offload
	// affinity has been set. Timers are summed over _real_space_comm and
	// divided by its size; output happens only if the total is positive,
	// and only rank 0 prints (to _tscreen, when it is not null). A warning
	// is issued when offload latency exceeds 7% of the larger of host and
	// coprocessor time and no separate COI cores were reserved.
	void FixIntel::output_timing_data() {
	  if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;

	  double timer_total = 0.0;
	  int size, rank;
	  double timers[NUM_ITIMERS];
	  MPI_Comm_size(_real_space_comm, &size);
	  MPI_Comm_rank(_real_space_comm, &rank);
	  MPI_Allreduce(&_timers, &timers, NUM_ITIMERS, MPI_DOUBLE, MPI_SUM,
	                _real_space_comm);
	  for (int i=0; i < NUM_ITIMERS; i++) {
	    timers[i] /= size;
	    timer_total += timers[i];
	  }
	#ifdef TIME_BALANCE
	  double timers_min[NUM_ITIMERS], timers_max[NUM_ITIMERS];
	  MPI_Allreduce(&_timers, &timers_max, NUM_ITIMERS, MPI_DOUBLE, MPI_MAX,
	                _real_space_comm);
	  MPI_Allreduce(&_timers, &timers_min, NUM_ITIMERS, MPI_DOUBLE, MPI_MIN,
	                _real_space_comm);
	#endif

	  if (timer_total > 0.0) {
	    // average the pair/neighbor balance fractions onto rank 0
	    double balance_out[2], balance_in[2];
	    balance_out[0] = _balance_pair;
	    balance_out[1] = _balance_neighbor;
	    MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
	               0, _real_space_comm);
	    balance_in[0] /= size;
	    balance_in[1] /= size;

	    if (rank == 0 && _tscreen) {
	      fprintf(_tscreen, "\n------------------------------------------------\n");
	      fprintf(_tscreen, " Offload Timing Data\n");
	      fprintf(_tscreen, "------------------------------------------------\n");
	      fprintf(_tscreen, " Data Pack/Cast Seconds %f\n",
	              timers[TIME_PACK]);
	      if (_offload_balance != 0.0) {
	        fprintf(_tscreen, " Host Neighbor Seconds %f\n",
	                timers[TIME_HOST_NEIGHBOR]);
	        fprintf(_tscreen, " Host Pair Seconds %f\n",
	                timers[TIME_HOST_PAIR]);
	        fprintf(_tscreen, " Offload Neighbor Seconds %f\n",
	                timers[TIME_OFFLOAD_NEIGHBOR]);
	        fprintf(_tscreen, " Offload Pair Seconds %f\n",
	                timers[TIME_OFFLOAD_PAIR]);
	        fprintf(_tscreen, " Offload Wait Seconds %f\n",
	                timers[TIME_OFFLOAD_WAIT]);
	        fprintf(_tscreen, " Offload Latency Seconds %f\n",
	                timers[TIME_OFFLOAD_LATENCY]);
	        fprintf(_tscreen, " Offload Neighbor Balance %f\n",
	                balance_in[1]);
	        fprintf(_tscreen, " Offload Pair Balance %f\n",
	                balance_in[0]);
	        fprintf(_tscreen, " Offload Ghost Atoms ");
	        if (_offload_noghost) fprintf(_tscreen,"No\n");
	        else fprintf(_tscreen,"Yes\n");
	#ifdef TIME_BALANCE
	        fprintf(_tscreen, " Offload Imbalance Seconds %f\n",
	                timers[TIME_IMBALANCE]);
	        fprintf(_tscreen, " Offload Min/Max Seconds ");
	        for (int i = 0; i < NUM_ITIMERS; i++)
	          fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
	        fprintf(_tscreen, "\n");
	#endif
	        double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
	          timers[TIME_OFFLOAD_WAIT];
	        double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
	          timers[TIME_OFFLOAD_PAIR];
	        double tt = MAX(ht,ct);
	        if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
	          error->warning(FLERR,
	                         "Leaving a core free can improve performance for offload");
	      }
	      fprintf(_tscreen, "------------------------------------------------\n");
	    }
	    zero_timers();
	    _setup_time_cleared = false;
	  }
	}

	/* ---------------------------------------------------------------------- */

	// Count the MPI ranks in _real_space_comm that report the same processor
	// name as this rank.
	//
	// node_rank : (output) number of same-name ranks whose rank index is
	//             lower than this rank's
	// returns   : number of ranks sharing this processor name, including
	//             this rank
	int FixIntel::get_ppn(int &node_rank) {
	  int nprocs;
	  int rank;
	  MPI_Comm_size(_real_space_comm, &nprocs);
	  MPI_Comm_rank(_real_space_comm, &rank);

	  int name_length;
	  char node_name[MPI_MAX_PROCESSOR_NAME];
	  MPI_Get_processor_name(node_name,&name_length);
	  node_name[name_length] = '\0';

	  // gather every rank's name into fixed-width slots
	  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
	  MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
	                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
	  int ppn = 0;
	  node_rank = 0;
	  for (int i = 0; i < nprocs; i++) {
	    if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
	      ppn++;
	      if (i < rank)
	        node_rank++;
	    }
	  }

	  // the gathered names are only needed for the count above
	  delete [] node_names;

	  return ppn;
	}

	/* ---------------------------------------------------------------------- */

	// Decide whether separate host/offload buffers are used, select the
	// real-space communicator, and (once only) assign this rank to a
	// coprocessor and pin its offload threads there.
	//
	// Returns early for ranks outside the first world under verlet/split,
	// when nothing is offloaded (in which case _cop is set to -1), or when
	// the affinity was already set by an earlier call. Aborts if the number
	// of MPI tasks per node is not a multiple of the number of coprocessors.
	void FixIntel::set_offload_affinity()
	{
	  _separate_buffers = 0;
	  if (_allow_separate_buffers)
	    if (_offload_balance != 0.0 && _offload_balance < 1.0)
	      _separate_buffers = 1;

	  _im_real_space_task = 1;
	  if (strncmp(update->integrate_style,"verlet/split",12) == 0) {
	    _real_space_comm = world;
	    if (universe->iworld != 0) {
	      _im_real_space_task = 0;
	      return;
	    }
	  } else
	    _real_space_comm = universe->uworld;

	  if (_offload_balance == 0.0) _cop = -1;
	  if (_offload_balance == 0.0 || _offload_affinity_set == 1)
	    return;

	  _offload_affinity_set = 1;
	  int node_rank;
	  int ppn = get_ppn(node_rank);

	  // split the ranks of this node evenly over the coprocessors; from here
	  // on ppn and node_rank are relative to the assigned coprocessor
	  if (ppn % _ncops != 0)
	    error->all(FLERR, "MPI tasks per node must be multiple of offload_cards");
	  ppn = ppn / _ncops;
	  _cop = node_rank / ppn;
	  node_rank = node_rank % ppn;

	  // cap threads per task by the share of hardware threads and by the
	  // maximum reported by the coprocessor
	  int max_threads_per_task = _offload_cores / 4 * _offload_tpc / ppn;
	  if (_offload_threads > max_threads_per_task)
	    _offload_threads = max_threads_per_task;
	  if (_offload_threads > _max_offload_threads)
	    _offload_threads = _max_offload_threads;

	  // local copies so the values can be named in the offload in() clause
	  int offload_threads = _offload_threads;
	  int offload_tpc = _offload_tpc;
	  int offload_affinity_balanced = _offload_affinity_balanced;
	  int offload_cores = _offload_cores;
	  #pragma offload target(mic:_cop) mandatory \
	    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced, \
	       offload_cores)
	  {
	    omp_set_num_threads(offload_threads);
	    #pragma omp parallel
	    {
	      int tnum = omp_get_thread_num();
	      kmp_affinity_mask_t mask;
	      kmp_create_affinity_mask(&mask);
	      int proc = offload_threads * node_rank + tnum;
	      #ifdef __AVX512F__
	      proc = (proc / offload_tpc) + (proc % offload_tpc) *
	        ((offload_cores) / 4);
	      proc += 68;
	      #else
	      if (offload_affinity_balanced)
	        proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
	      else
	        proc += (proc / 4) * (4 - offload_tpc) + 1;
	      #endif
	      kmp_set_affinity_mask_proc(proc, &mask);
	      if (kmp_set_affinity(&mask) != 0)
	        printf("Could not set affinity on rank %d thread %d to %d\n",
	               node_rank, tnum, proc);
	    }
	  }

	  if (_precision_mode == PREC_MODE_SINGLE)
	    _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
	  else if (_precision_mode == PREC_MODE_MIXED)
	    _mixed_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
	  else
	    _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
	}

	/* ---------------------------------------------------------------------- */

	// Pin this MPI task, and the threads spawned by the offload runtime
	// (COI), to disjoint sets of host logical cores.
	//
	// nomp    : user-requested OpenMP thread count; 0 means fall back to the
	//           OMP_NUM_THREADS environment variable (or 1 if unset)
	// returns : 0 on success, when the node is oversubscribed (after a
	//           warning), or when built with INTEL_OFFLOAD_NOAFFINITY;
	//           -1 if the core list cannot be obtained or is implausible,
	//           or if setting an affinity fails
	//
	// The core list comes from "lscpu -p" and the thread ids from "ps -Lp",
	// both read through popen().
	int FixIntel::set_host_affinity(const int nomp)
	{
	#ifndef INTEL_OFFLOAD_NOAFFINITY
	  _separate_coi = 1;
	  int rank = comm->me;
	  int node_rank;
	  int ppn = get_ppn(node_rank);

	  // Get a sorted list of logical cores
	  int proc_list[INTEL_MAX_HOST_CORE_COUNT];
	  int ncores;
	  FILE *p;
	  char cmd[512];
	  char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
	  sprintf(cmd, "lscpu -p | grep -v '#' |"
	          "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
	  p = popen(cmd, "r");
	  if (p == NULL) return -1;
	  ncores = 0;
	  while(fgets(readbuf, 512, p)) {
	    // refuse to write past the end of proc_list on very large hosts
	    if (ncores >= INTEL_MAX_HOST_CORE_COUNT) {
	      pclose(p);
	      return -1;
	    }
	    proc_list[ncores] = atoi(readbuf);
	    ncores++;
	  }
	  pclose(p);

	  // Sanity checks for core list
	  if (ncores < 2) return -1;
	  int nzero = 0;
	  for (int i = 0; i < ncores; i++) {
	    if (proc_list[i] == 0) nzero++;
	    if (proc_list[i] < 0 || proc_list[i] >= ncores) return -1;
	  }
	  // more than one zero means some lines failed to parse as a core id
	  if (nzero > 1) return -1;

	  // Determine the OpenMP/MPI configuration
	  char *estring;
	  int nthreads = nomp;
	  if (nthreads == 0) {
	    estring = getenv("OMP_NUM_THREADS");
	    if (estring != NULL) {
	      nthreads = atoi(estring);
	      if (nthreads < 2) nthreads = 1;
	    } else
	      nthreads = 1;
	  }

	  // Determine how many logical cores for COI and MPI tasks
	  int coi_cores = 0, mpi_cores;
	  int subscription = nthreads * ppn;
	  if (subscription > ncores) {
	    if (rank == 0)
	      error->warning(FLERR,
	                     "More MPI tasks/OpenMP threads than available cores");
	    return 0;
	  }
	  if (subscription == ncores)
	    _separate_coi = 0;

	  // reserve leftover cores for COI only when more than half are in use
	  if (subscription > ncores / 2) {
	    coi_cores = ncores - subscription;
	    if (coi_cores > INTEL_MAX_COI_CORES) coi_cores = INTEL_MAX_COI_CORES;
	  }
	  mpi_cores = (ncores - coi_cores) / ppn;

	  // Get ids of all LWPs that COI spawned and affinitize
	  int lwp = 0, plwp = 0, nlwp = 0, mlwp = 0, fail = 0;
	  cpu_set_t cpuset;
	  pid_t pid = getpid();
	  if (coi_cores) {
	    sprintf(cmd, "ps -Lp %d -o lwp | awk ' (NR > 2) {print}'", pid);
	    p = popen(cmd, "r");
	    if (p == NULL) return -1;

	    // threads that exist before the offload below get this task's cores
	    while(fgets(readbuf, 512, p)) {
	      lwp = atoi(readbuf);
	      int first = coi_cores + node_rank * mpi_cores;
	      CPU_ZERO(&cpuset);
	      for (int i = first; i < first + mpi_cores; i++)
	        CPU_SET(proc_list[i], &cpuset);
	      if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
	        fail = 1;
	        break;
	      }
	      plwp++;
	    }
	    pclose(p);

	    // Do async offload to create COI threads
	    int sig1, sig2;
	    float *buf1;
	    int pragma_size = 1024;
	    buf1 = (float *) malloc(sizeof(float)*pragma_size);

	    #pragma offload target (mic:0) mandatory \
	      in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \
	      signal(&sig1)
	    { buf1[0] = 0.0; }
	    #pragma offload_wait target(mic:0) wait(&sig1)

	    #pragma offload target (mic:0) mandatory \
	      out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \
	      signal(&sig2)
	    { buf1[0] = 1.0; }
	    #pragma offload_wait target(mic:0) wait(&sig2)
	    free(buf1);

	    p = popen(cmd, "r");
	    if (p == NULL) return -1;

	    // skip the first plwp threads (handled above); the remaining ones
	    // appeared during the offload and are pinned to the COI cores
	    while(fgets(readbuf, 512, p)) {
	      lwp = atoi(readbuf);
	      nlwp++;
	      if (nlwp <= plwp) continue;

	      CPU_ZERO(&cpuset);
	      for(int i=0; i<coi_cores; i++)
	        CPU_SET(proc_list[i], &cpuset);

	      if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
	        fail = 1;
	        break;
	      }
	    }
	    pclose(p);
	    nlwp -= plwp;

	    // Get stats on the number of LWPs per process
	    MPI_Reduce(&nlwp, &mlwp, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
	  }

	  if (screen && rank == 0) {
	    if (coi_cores)
	      fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
	              mlwp, coi_cores);
	    fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
	  }
	  if (fail) return -1;

	  // Affinitize MPI Ranks
	  CPU_ZERO(&cpuset);
	  int first = coi_cores + node_rank * mpi_cores;
	  for (int i = first; i < first+mpi_cores; i++)
	    CPU_SET(proc_list[i], &cpuset);
	  if (sched_setaffinity(pid, sizeof(cpu_set_t), &cpuset))
	    return -1;

	#endif
	  return 0;
	}

	#endif

fix_intel.cppNo OneTemporaryActions

File Metadata

fix_intel.cppView Options

Event Timeline

fix_intel.cpp
No OneTemporary
Actions

fix_intel.cpp
View Options