File Metadata

Created: Mon, Sep 2, 09:31

lal_balance.h
View Options

	/***************************************************************************
	balance.h
	-------------------
	W. Michael Brown (ORNL)

	Class for host-device load balancing

	__________________________________________________________________________
	This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
	__________________________________________________________________________

	begin :
	email : brownw@ornl.gov
	***************************************************************************/

	#ifndef LAL_BALANCE_H
	#define LAL_BALANCE_H

	#include "lal_device.h"
	#include <math.h>

	/// Balance::get_gpu_count() flags a step for timing whenever the timestep
	/// counter is a multiple of this value (in addition to the early samples)
	#define _HD_BALANCE_EVERY 25
	/// NOTE(review): not referenced anywhere in this header; presumably a
	/// weighting factor for the split update -- confirm before relying on it
	#define _HD_BALANCE_WEIGHT 0.5
	/// Multiplier applied in Balance::balance() to the computed device fraction
	/// before that fraction is clamped to the range [0,1]
	#define _HD_BALANCE_GAP 1.10

	namespace LAMMPS_AL {

	/// Host/device load balancer
	template<class numtyp, class acctyp>
	class Balance {
	public:
	inline Balance() : _init_done(false), _measure_this_step(false) {}
	inline ~Balance() { clear(); }

	/// Clear any old data and setup for new LAMMPS run
	inline void init(Device<numtyp, acctyp> *gpu, const int gpu_nbor,
	const double split);

	/// Clear all host and device data
	inline void clear() {
	if (_init_done) {
	_device_time.clear();
	_measure_this_step=false;
	_init_done=false;
	}
	}

	/// Return the timestep since initialization
	inline int timestep() { return _timestep; }

	/// Get a count of the number of particles host will handle for initial alloc
	inline int first_host_count(const int nlocal, const double gpu_split,
	const int gpu_nbor) const {
	int host_nlocal=0;
	if (gpu_nbor>0 && gpu_split!=1.0) {
	if (gpu_split>0)
	host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
	else
	host_nlocal=static_cast<int>(ceil(0.05*nlocal));
	}
	return host_nlocal;
	}

	/// Return the number of particles the device will handle this timestep
	inline int get_gpu_count(const int ago, const int inum_full);

	/// Return the average fraction of particles handled by device on all procs
	inline double all_avg_split() {
	if (_load_balance) {
	double _all_avg_split=0.0;
	MPI_Reduce(&_avg_split,&_all_avg_split,1,MPI_DOUBLE,MPI_SUM,0,
	_device->replica());
	_all_avg_split/=_device->replica_size();
	return _all_avg_split/_avg_count;
	} else
	return _actual_split;
	}

	/// If CPU neighboring, allow the device fraction to increase on 2nd timestep
	inline int ago_first(int ago) const
	{ if (_avg_count==1 && _actual_split<_desired_split) ago=0; return ago; }

	/// Start the timer for asynchronous device execution
	inline void start_timer() {
	if (_measure_this_step) {
	_device->gpu->sync();
	_device->gpu_barrier();
	_device->start_host_timer();
	_device_time.start();
	_device->gpu->sync();
	_device->gpu_barrier();
	}
	}

	/// Stop the timer for asynchronous device execution
	inline void stop_timer() { if (_measure_this_step) { _device_time.stop(); } }

	/// Calculate the new host/device split based on the cpu and device times
	/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
	(and first 10) **/
	inline void balance(const double cpu_time);

	/// Calls balance() and then get_gpu_count()
	inline int balance(const int ago,const int inum_full,const double cpu_time) {
	balance(cpu_time);
	return get_gpu_count(ago,inum_full);
	}

	private:
	Device<numtyp,acctyp> *_device;
	UCL_Timer _device_time;
	bool _init_done;
	int _gpu_nbor;

	bool _load_balance;
	double _actual_split, _avg_split, _desired_split, _max_split;
	int _avg_count;

	bool _measure_this_step;
	int _inum, _inum_full, _timestep;
	};

	/// Shorthand for the fully qualified template class name, used by the
	/// out-of-class member function definitions that follow
	#define BalanceT Balance<numtyp,acctyp>

	/// Reset the balancer and prepare it for a new run
	/** Any previous state is released with clear(), the device timer is bound
	  * to the device held by gpu, and all split bookkeeping is reset.
	  * \param gpu      device wrapper; dereferenced here, so it must be valid
	  * \param gpu_nbor neighboring mode flag, stored for later use in balance()
	  * \param split    device fraction to use; a negative value enables dynamic
	  *                 load balancing starting from a device fraction of 0.90 **/
	template <class numtyp, class acctyp>
	void BalanceT::init(Device<numtyp, acctyp> *gpu,
	                    const int gpu_nbor, const double split) {
	  clear();
	  _gpu_nbor=gpu_nbor;
	  _init_done=true;

	  _device=gpu;
	  _device_time.init(*gpu->gpu);

	  if (split<0.0) {
	    _load_balance=true;
	    _desired_split=0.90;
	  } else {
	    _load_balance=false;
	    _desired_split=split;
	  }
	  _actual_split=_desired_split;
	  // balance() reads _max_split when _gpu_nbor==0, but it is otherwise only
	  // assigned by get_gpu_count() on an ago==0 step. Seed it here so that a
	  // stale value from a previous run (or an indeterminate one) is never used.
	  _max_split=_desired_split;
	  _avg_split=0.0;
	  _avg_count=0;
	  _timestep=0;
	}

	/// Return the number of particles the device will handle this timestep
	/** Also decides whether this step is timed for load balancing and advances
	  * the internal timestep counter.
	  * \param ago       when 0, the split in use is reset to the desired split
	  *                  (presumably the number of steps since the last
	  *                  neighbor build -- confirm against the caller)
	  * \param inum_full total particle count to divide between host and device
	  * \return floor(_actual_split*inum_full), raised to 1 if that is 0 **/
	template <class numtyp, class acctyp>
	int BalanceT::get_gpu_count(const int ago, const int inum_full) {
	  _measure_this_step=false;
	  if (_load_balance) {
	    // Time the step while fewer than 11 samples have been accumulated, and
	    // afterwards whenever the timestep is a multiple of _HD_BALANCE_EVERY
	    if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
	      _measure_this_step=true;
	      _inum_full=inum_full;
	    }
	    if (ago==0) {
	      _actual_split=_desired_split;
	      _max_split=_desired_split;
	    }
	  }
	  _inum=static_cast<int>(floor(_actual_split*inum_full));
	  // The device is always handed at least one particle
	  if (_inum==0) _inum++;
	  _timestep++;
	  return _inum;
	}

	template <class numtyp, class acctyp>
	void BalanceT::balance(const double cpu_time) {
	if (_measure_this_step) {
	_measure_this_step=false;
	double gpu_time=_device_time.seconds();

	double max_gpu_time;
	MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
	_device->gpu_comm());

	if (_inum_full==_inum) {
	_desired_split=1.0;
	return;
	}

	double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
	double cpu_other_time=_device->host_time()-cpu_time;
	int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
	cpu_time_per_atom);

	double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
	_desired_split=split*_HD_BALANCE_GAP;
	if (_desired_split>1.0)
	_desired_split=1.0;
	if (_desired_split<0.0)
	_desired_split=0.0;

	if (_gpu_nbor==0) {
	if (_desired_split<_max_split)
	_actual_split=_desired_split;
	else
	_actual_split=_max_split;
	}
	}
	_avg_split+=_desired_split;
	_avg_count++;
	}

	}

	#endif

lal_balance.h
No OneTemporary
Actions

File Metadata

lal_balance.h
View Options

Event Timeline

lal_balance.hNo OneTemporaryActions

File Metadata

lal_balance.hView Options

Event Timeline

lal_balance.h
No OneTemporary
Actions

lal_balance.h
View Options