fix_rx_kokkos.cpp
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Jun 5, 19:05

fix_rx_kokkos.cpp
View Options

	/* ----------------------------------------------------------------------
	LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
	http://lammps.sandia.gov, Sandia National Laboratories
	Steve Plimpton, sjplimp@sandia.gov

	Copyright (2003) Sandia Corporation. Under the terms of Contract
	DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
	certain rights in this software. This software is distributed under
	the GNU General Public License.

	See the README file in the top-level LAMMPS directory.
	------------------------------------------------------------------------- */

	#include <stdio.h>
	#include <string.h>
	#include "fix_rx_kokkos.h"
	#include "atom_masks.h"
	#include "atom_kokkos.h"
	#include "force.h"
	#include "memory.h"
	#include "update.h"
	#include "respa.h"
	#include "modify.h"
	#include "neighbor.h"
	#include "neigh_list_kokkos.h"
	#include "neigh_request.h"
	#include "error.h"
	#include "math_special_kokkos.h"

	#include <float.h> // DBL_EPSILON

	using namespace LAMMPS_NS;
	using namespace FixConst;
	using namespace MathSpecialKokkos;

	#ifdef DBL_EPSILON
	#define MY_EPSILON (10.0*DBL_EPSILON)
	#else
	#define MY_EPSILON (10.0*2.220446049250313e-16)
	#endif

	#define SparseKinetics_enableIntegralReactions (true)
	#define SparseKinetics_invalidIndex (-1)

	// From fix_rx.cpp ... this should be lifted into fix_rx.h or fix_rx_kokkos.h?
	enum{NONE,HARMONIC};
	enum{LUCY};

	namespace /* anonymous */
	{

	typedef double TimerType;
	TimerType getTimeStamp(void) { return MPI_Wtime(); }
	double getElapsedTime( const TimerType &t0, const TimerType &t1) { return t1-t0; }

	} // end namespace

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	FixRxKokkos<DeviceType>::FixRxKokkos(LAMMPS lmp, int narg, char *arg) :
	FixRX(lmp, narg, arg),
	pairDPDEKK(NULL),
	update_kinetics_data(true)
	{
	kokkosable = 1;
	atomKK = (AtomKokkos *) atom;
	execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
	datamask_read = EMPTY_MASK;
	datamask_modify = EMPTY_MASK;

	k_error_flag = DAT::tdual_int_scalar("FixRxKokkos::k_error_flag");

	//printf("Inside FixRxKokkos::FixRxKokkos\n");
	}

	template <typename DeviceType>
	FixRxKokkos<DeviceType>::~FixRxKokkos()
	{
	//printf("Inside FixRxKokkos::~FixRxKokkos copymode= %d\n", copymode);
	if (copymode) return;

	if (localTempFlag)
	memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);

	memory->destroy_kokkos(k_sumWeights, sumWeights);
	//memory->destroy_kokkos(k_sumWeights);

	//delete [] scratchSpace;
	memory->destroy_kokkos(d_scratchSpace);

	memory->destroy_kokkos(k_cutsq);
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::post_constructor()
	{
	// Run the parents and then reset one value.
	FixRX::post_constructor();

	// Need a copy of this
	this->my_restartFlag = modify->fix[modify->nfix-1]->restart_reset;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::init()
	{
	//printf("Inside FixRxKokkos::init\n");

	// Call the parent's version.
	//FixRX::init();

	pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
	if (pairDPDE == NULL)
	pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);

	if (pairDPDE == NULL)
	error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix rx");

	pairDPDEKK = dynamic_cast<decltype(pairDPDEKK)>(pairDPDE);
	if (pairDPDEKK == NULL)
	error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix rx/kk");

	bool eos_flag = false;
	for (int i = 0; i < modify->nfix; i++)
	if (strncmp(modify->fix[i]->style,"eos/table/rx",3) == 0) eos_flag = true;
	if(!eos_flag) error->all(FLERR,"fix rx requires fix eos/table/rx to be specified");

	if (update_kinetics_data)
	create_kinetics_data();

	// From FixRX::init()
	// need a half neighbor list
	// built whenever re-neighboring occurs

	int irequest = neighbor->request(this,instance_me);
	neighbor->requests[irequest]->pair = 0;
	neighbor->requests[irequest]->fix = 1;

	// Update the neighbor data for Kokkos.
	int neighflag = lmp->kokkos->neighflag;

	neighbor->requests[irequest]->
	kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
	!Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
	neighbor->requests[irequest]->
	kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;

	if (neighflag == FULL) {
	neighbor->requests[irequest]->full = 1;
	neighbor->requests[irequest]->half = 0;
	} else { //if (neighflag == HALF \|\| neighflag == HALFTHREAD)
	neighbor->requests[irequest]->full = 0;
	neighbor->requests[irequest]->half = 1;
	}
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::init_list(int, class NeighList* ptr)
	{
	//printf("Inside FixRxKokkos::init_list\n");
	this->list = ptr;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::rk4(const double t_stop, double y, double rwork, void* v_params) const
	{
	double *k1 = rwork;
	double *k2 = k1 + nspecies;
	double *k3 = k2 + nspecies;
	double *k4 = k3 + nspecies;
	double *yp = k4 + nspecies;

	const int numSteps = minSteps;

	const double h = t_stop / double(numSteps);

	// Run the requested steps with h.
	for (int step = 0; step < numSteps; step++)
	{
	// k1
	rhs(0.0,y,k1,v_params);

	// k2
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	yp[ispecies] = y[ispecies] + 0.5hk1[ispecies];

	rhs(0.0,yp,k2,v_params);

	// k3
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	yp[ispecies] = y[ispecies] + 0.5hk2[ispecies];

	rhs(0.0,yp,k3,v_params);

	// k4
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	yp[ispecies] = y[ispecies] + h*k3[ispecies];

	rhs(0.0,yp,k4,v_params);

	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);

	} // end for (int step...

	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData) const
	{
	VectorType k1( rwork );
	VectorType k2( &k1[nspecies] );
	VectorType k3( &k2[nspecies] );
	VectorType k4( &k3[nspecies] );
	VectorType yp( &k4[nspecies] );

	const int numSteps = minSteps;

	const double h = t_stop / double(numSteps);

	// Run the requested steps with h.
	for (int step = 0; step < numSteps; step++)
	{
	// k1
	k_rhs(0.0,y,k1, userData);

	// k2
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	yp[ispecies] = y[ispecies] + 0.5hk1[ispecies];

	k_rhs(0.0,yp,k2, userData);

	// k3
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	yp[ispecies] = y[ispecies] + 0.5hk2[ispecies];

	k_rhs(0.0,yp,k3, userData);

	// k4
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	yp[ispecies] = y[ispecies] + h*k3[ispecies];

	k_rhs(0.0,yp,k4, userData);

	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);

	} // end for (int step...

	}

	/* ---------------------------------------------------------------------- */

	// f1 = dt*f(t,x)
	// f2 = dtf(t+ c20dt,x + c21*f1)
	// f3 = dtf(t+ c30dt,x + c31f1 + c32f2)
	// f4 = dtf(t+ c40dt,x + c41f1 + c42f2 + c43*f3)
	// f5 = dtf(t+dt,x + c51f1 + c52f2 + c53f3 + c54*f4)
	// f6 = dtf(t+ c60dt,x + c61f1 + c62f2 + c63f3 + c64f4 + c65*f5)
	//
	// fifth-order runge-kutta integration
	// x5 = x + b1f1 + b3f3 + b4f4 + b5f5 + b6*f6
	// fourth-order runge-kutta integration
	// x = x + a1f1 + a3f3 + a4f4 + a5f5

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, VectorType& y, VectorType& y_out, VectorType& rwk, UserDataType& userData) const
	{
	const double c21=0.25;
	const double c31=0.09375;
	const double c32=0.28125;
	const double c41=0.87938097405553;
	const double c42=-3.2771961766045;
	const double c43=3.3208921256258;
	const double c51=2.0324074074074;
	const double c52=-8.0;
	const double c53=7.1734892787524;
	const double c54=-0.20589668615984;
	const double c61=-0.2962962962963;
	const double c62=2.0;
	const double c63=-1.3816764132554;
	const double c64=0.45297270955166;
	const double c65=-0.275;
	const double a1=0.11574074074074;
	const double a3=0.54892787524366;
	const double a4=0.5353313840156;
	const double a5=-0.2;
	const double b1=0.11851851851852;
	const double b3=0.51898635477583;
	const double b4=0.50613149034201;
	const double b5=-0.18;
	const double b6=0.036363636363636;

	// local dependent variables (5 total)
	VectorType& f1 = rwk;
	VectorType f2( &rwk[ neq] );
	VectorType f3( &rwk[2*neq] );
	VectorType f4( &rwk[3*neq] );
	VectorType f5( &rwk[4*neq] );
	VectorType f6( &rwk[5*neq] );

	// scratch for the intermediate solution.
	VectorType& ytmp = y_out;

	// 1)
	k_rhs (0.0, y, f1, userData);

	for (int k = 0; k < neq; k++){
	f1[k] *= h;
	ytmp[k] = y[k] + c21 * f1[k];
	}

	// 2)
	k_rhs(0.0, ytmp, f2, userData);

	for (int k = 0; k < neq; k++){
	f2[k] *= h;
	ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
	}

	// 3)
	k_rhs(0.0, ytmp, f3, userData);

	for (int k = 0; k < neq; k++) {
	f3[k] *= h;
	ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
	}

	// 4)
	k_rhs(0.0, ytmp, f4, userData);

	for (int k = 0; k < neq; k++) {
	f4[k] *= h;
	ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
	}

	// 5)
	k_rhs(0.0, ytmp, f5, userData);

	for (int k = 0; k < neq; k++) {
	f5[k] *= h;
	ytmp[k] = y[k] + c61f1[k] + c62f2[k] + c63f3[k] + c64f4[k] + c65*f5[k];
	}

	// 6)
	k_rhs(0.0, ytmp, f6, userData);

	for (int k = 0; k < neq; k++)
	{
	//const double f6 = h * ydot[k];
	f6[k] *= h;

	// 5th-order solution.
	const double r5 = b1f1[k] + b3f3[k] + b4f4[k] + b5f5[k] + b6*f6[k];

	// 4th-order solution.
	const double r4 = a1f1[k] + a3f3[k] + a4f4[k] + a5f5[k];

	// Truncation error: difference between 4th and 5th-order solutions.
	rwk[k] = fabs(r5 - r4);

	// Update solution.
	//y_out[k] = y[k] + r5; // Local extrapolation
	y_out[k] = y[k] + r4;
	}

	return;
	}

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	int FixRxKokkos<DeviceType>::k_rkf45_h0
	(const int neq, const double t, const double t_stop,
	const double hmin, const double hmax,
	double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const
	{
	// Set lower and upper bounds on h0, and take geometric mean as first trial value.
	// Exit with this value if the bounds cross each other.

	// Adjust upper bound based on ydot ...
	double hg = sqrt(hmin*hmax);

	//if (hmax < hmin)
	//{
	// h0 = hg;
	// return;
	//}

	// Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1

	VectorType& ydot = rwk;
	VectorType y1 ( &ydot[ neq] );
	VectorType ydot1 ( &ydot[2*neq] );

	const int max_iters = 10;
	bool hnew_is_ok = false;
	double hnew = hg;
	int iter = 0;

	// compute ydot at t=t0
	k_rhs (t, y, ydot, userData);

	while(1)
	{
	// Estimate y'' with finite-difference ...

	for (int k = 0; k < neq; k++)
	y1[k] = y[k] + hg * ydot[k];

	// compute y' at t1
	k_rhs (t + hg, y1, ydot1, userData);

	// Compute WRMS norm of y''
	double yddnrm = 0.0;
	for (int k = 0; k < neq; k++){
	double ydd = (ydot1[k] - ydot[k]) / hg;
	double wterr = ydd / (relTol * fabs( y[k] ) + absTol);
	yddnrm += wterr * wterr;
	}

	yddnrm = sqrt( yddnrm / double(neq) );

	//std::cout << "iter " << _iter << " hg " << hg << " y'' " << yddnrm << std::endl;
	//std::cout << "ydot " << ydot[neq-1] << std::endl;

	// should we accept this?
	if (hnew_is_ok \|\| iter == max_iters){
	hnew = hg;
	//if (iter == max_iters)
	// fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
	break;
	}

	// Get the new value of h ...
	hnew = (yddnrmhmaxhmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);

	// test the stopping conditions.
	double hrat = hnew / hg;

	// Accept this value ... the bias factor should bring it within range.
	if ( (hrat > 0.5) && (hrat < 2.0) )
	hnew_is_ok = true;

	// If y'' is still bad after a few iterations, just accept h and give up.
	if ( (iter > 1) && hrat > 2.0 ) {
	hnew = hg;
	hnew_is_ok = true;
	}

	//printf("iter=%d, yddnrw=%e, hnew=%e, hmin=%e, hmax=%e\n", iter, yddnrm, hnew, hmin, hmax);

	hg = hnew;
	iter ++;
	}

	// bound and bias estimate
	h0 = hnew * 0.5;
	h0 = fmax(h0, hmin);
	h0 = fmin(h0, hmax);
	//printf("h0=%e, hmin=%e, hmax=%e\n", h0, hmin, hmax);

	return (iter + 1);
	}

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData, CounterType& counter) const
	{
	// Rounding coefficient.
	const double uround = DBL_EPSILON;

	// Adaption limit (shrink or grow)
	const double adaption_limit = 4.0;

	// Safety factor on the adaption. very specific but not necessary .. 0.9 is common.
	const double hsafe = 0.840896415;

	// Time rounding factor.
	const double tround = t_stop * uround;

	// Counters for diagnostics.
	int nst = 0; // # of steps (accepted)
	int nit = 0; // # of iterations total
	int nfe = 0; // # of RHS evaluations

	// Min/Max step-size limits.
	const double h_min = 100.0 * tround;
	const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;

	// Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
	double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;

	double t = 0.0;

	if (h < h_min){
	//fprintf(stderr,"hin not implemented yet\n");
	//exit(-1);
	nfe = k_rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, userData);
	}

	//printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);

	// Integrate until we reach the end time.
	while (fabs(t - t_stop) > tround)
	{
	VectorType& yout = rwork;
	VectorType eout ( &yout[neq] );

	// Take a trial step.
	k_rkf45_step (neq, h, y, yout, eout, userData);

	// Estimate the solution error.
	// ... weighted 2-norm of the error.
	double err2 = 0.0;
	for (int k = 0; k < neq; k++){
	const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
	err2 += wterr * wterr;
	}

	double err = fmax( uround, sqrt( err2 / double(nspecies) ));

	// Accept the solution?
	if (err <= 1.0 \|\| h <= h_min){
	t += h;
	nst++;

	for (int k = 0; k < neq; k++)
	y[k] = yout[k];
	}

	// Adjust h for the next step.
	double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );

	// Limit the adaption.
	hfac = fmax( hfac, 1.0 / adaption_limit );
	hfac = fmin( hfac, adaption_limit );

	// Apply the adaption factor...
	h *= hfac;

	// Limit h.
	h = fmin( h, h_max );
	h = fmax( h, h_min );

	// Stretch h if we're within 5% ... and we didn't just fail.
	if (err <= 1.0 && (t + 1.05*h) > t_stop)
	h = t_stop - t;

	// And don't overshoot the end.
	if (t + h > t_stop)
	h = t_stop - t;

	nit++;
	nfe += 6;

	if (maxIters && nit > maxIters){
	//fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
	counter.nFails ++;
	break;
	// We should set an error here so that the solution is not used!
	}

	} // end while

	counter.nSteps += nst;
	counter.nIters += nit;
	counter.nFuncs += nfe;

	//printf("id= %d nst= %d nit= %d\n", id, nst, nit);
	}
	/* ---------------------------------------------------------------------- */

	// f1 = dt*f(t,x)
	// f2 = dtf(t+ c20dt,x + c21*f1)
	// f3 = dtf(t+ c30dt,x + c31f1 + c32f2)
	// f4 = dtf(t+ c40dt,x + c41f1 + c42f2 + c43*f3)
	// f5 = dtf(t+dt,x + c51f1 + c52f2 + c53f3 + c54*f4)
	// f6 = dtf(t+ c60dt,x + c61f1 + c62f2 + c63f3 + c64f4 + c65*f5)
	//
	// fifth-order runge-kutta integration
	// x5 = x + b1f1 + b3f3 + b4f4 + b5f5 + b6*f6
	// fourth-order runge-kutta integration
	// x = x + a1f1 + a3f3 + a4f4 + a5f5

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::rkf45_step (const int neq, const double h, double y[], double y_out[], double rwk[], void* v_param) const
	{
	const double c21=0.25;
	const double c31=0.09375;
	const double c32=0.28125;
	const double c41=0.87938097405553;
	const double c42=-3.2771961766045;
	const double c43=3.3208921256258;
	const double c51=2.0324074074074;
	const double c52=-8.0;
	const double c53=7.1734892787524;
	const double c54=-0.20589668615984;
	const double c61=-0.2962962962963;
	const double c62=2.0;
	const double c63=-1.3816764132554;
	const double c64=0.45297270955166;
	const double c65=-0.275;
	const double a1=0.11574074074074;
	const double a3=0.54892787524366;
	const double a4=0.5353313840156;
	const double a5=-0.2;
	const double b1=0.11851851851852;
	const double b3=0.51898635477583;
	const double b4=0.50613149034201;
	const double b5=-0.18;
	const double b6=0.036363636363636;

	// local dependent variables (5 total)
	double* f1 = &rwk[ 0];
	double* f2 = &rwk[ neq];
	double* f3 = &rwk[2*neq];
	double* f4 = &rwk[3*neq];
	double* f5 = &rwk[4*neq];
	double* f6 = &rwk[5*neq];

	// scratch for the intermediate solution.
	//double* ytmp = &rwk[6*neq];
	double* ytmp = y_out;

	// 1)
	rhs (0.0, y, f1, v_param);

	for (int k = 0; k < neq; k++){
	f1[k] *= h;
	ytmp[k] = y[k] + c21 * f1[k];
	}

	// 2)
	rhs(0.0, ytmp, f2, v_param);

	for (int k = 0; k < neq; k++){
	f2[k] *= h;
	ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
	}

	// 3)
	rhs(0.0, ytmp, f3, v_param);

	for (int k = 0; k < neq; k++) {
	f3[k] *= h;
	ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
	}

	// 4)
	rhs(0.0, ytmp, f4, v_param);

	for (int k = 0; k < neq; k++) {
	f4[k] *= h;
	ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
	}

	// 5)
	rhs(0.0, ytmp, f5, v_param);

	for (int k = 0; k < neq; k++) {
	f5[k] *= h;
	ytmp[k] = y[k] + c61f1[k] + c62f2[k] + c63f3[k] + c64f4[k] + c65*f5[k];
	}

	// 6)
	rhs(0.0, ytmp, f6, v_param);

	for (int k = 0; k < neq; k++)
	{
	//const double f6 = h * ydot[k];
	f6[k] *= h;

	// 5th-order solution.
	const double r5 = b1f1[k] + b3f3[k] + b4f4[k] + b5f5[k] + b6*f6[k];

	// 4th-order solution.
	const double r4 = a1f1[k] + a3f3[k] + a4f4[k] + a5f5[k];

	// Truncation error: difference between 4th and 5th-order solutions.
	rwk[k] = fabs(r5 - r4);

	// Update solution.
	//y_out[k] = y[k] + r5; // Local extrapolation
	y_out[k] = y[k] + r4;
	}

	return;
	}

	template <typename DeviceType>
	int FixRxKokkos<DeviceType>::rkf45_h0
	(const int neq, const double t, const double t_stop,
	const double hmin, const double hmax,
	double& h0, double y[], double rwk[], void* v_params) const
	{
	// Set lower and upper bounds on h0, and take geometric mean as first trial value.
	// Exit with this value if the bounds cross each other.

	// Adjust upper bound based on ydot ...
	double hg = sqrt(hmin*hmax);

	//if (hmax < hmin)
	//{
	// h0 = hg;
	// return;
	//}

	// Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1

	double *ydot = rwk;
	double *y1 = ydot + neq;
	double *ydot1 = y1 + neq;

	const int max_iters = 10;
	bool hnew_is_ok = false;
	double hnew = hg;
	int iter = 0;

	// compute ydot at t=t0
	rhs (t, y, ydot, v_params);

	while(1)
	{
	// Estimate y'' with finite-difference ...

	for (int k = 0; k < neq; k++)
	y1[k] = y[k] + hg * ydot[k];

	// compute y' at t1
	rhs (t + hg, y1, ydot1, v_params);

	// Compute WRMS norm of y''
	double yddnrm = 0.0;
	for (int k = 0; k < neq; k++){
	double ydd = (ydot1[k] - ydot[k]) / hg;
	double wterr = ydd / (relTol * fabs( y[k] ) + absTol);
	yddnrm += wterr * wterr;
	}

	yddnrm = sqrt( yddnrm / double(neq) );

	//std::cout << "iter " << _iter << " hg " << hg << " y'' " << yddnrm << std::endl;
	//std::cout << "ydot " << ydot[neq-1] << std::endl;

	// should we accept this?
	if (hnew_is_ok \|\| iter == max_iters){
	hnew = hg;
	if (iter == max_iters)
	fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
	break;
	}

	// Get the new value of h ...
	hnew = (yddnrmhmaxhmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);

	// test the stopping conditions.
	double hrat = hnew / hg;

	// Accept this value ... the bias factor should bring it within range.
	if ( (hrat > 0.5) && (hrat < 2.0) )
	hnew_is_ok = true;

	// If y'' is still bad after a few iterations, just accept h and give up.
	if ( (iter > 1) && hrat > 2.0 ) {
	hnew = hg;
	hnew_is_ok = true;
	}

	//printf("iter=%d, yddnrw=%e, hnew=%e, hmin=%e, hmax=%e\n", iter, yddnrm, hnew, hmin, hmax);

	hg = hnew;
	iter ++;
	}

	// bound and bias estimate
	h0 = hnew * 0.5;
	h0 = fmax(h0, hmin);
	h0 = fmin(h0, hmax);
	//printf("h0=%e, hmin=%e, hmax=%e\n", h0, hmin, hmax);

	return (iter + 1);
	}

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::rkf45(const int neq, const double t_stop, double y, double rwork, void *v_param, CounterType& counter) const
	{
	// Rounding coefficient.
	const double uround = DBL_EPSILON;

	// Adaption limit (shrink or grow)
	const double adaption_limit = 4.0;

	// Safety factor on the adaption. very specific but not necessary .. 0.9 is common.
	const double hsafe = 0.840896415;

	// Time rounding factor.
	const double tround = t_stop * uround;

	// Counters for diagnostics.
	int nst = 0; // # of steps (accepted)
	int nit = 0; // # of iterations total
	int nfe = 0; // # of RHS evaluations

	// Min/Max step-size limits.
	const double h_min = 100.0 * tround;
	const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;

	// Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
	double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;

	double t = 0.0;

	if (h < h_min){
	//fprintf(stderr,"hin not implemented yet\n");
	//exit(-1);
	nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, v_param);
	}

	//printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);

	// Integrate until we reach the end time.
	while (fabs(t - t_stop) > tround){
	double *yout = rwork;
	double *eout = yout + neq;

	// Take a trial step.
	rkf45_step (neq, h, y, yout, eout, v_param);

	// Estimate the solution error.
	// ... weighted 2-norm of the error.
	double err2 = 0.0;
	for (int k = 0; k < neq; k++){
	const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
	err2 += wterr * wterr;
	}

	double err = fmax( uround, sqrt( err2 / double(nspecies) ));

	// Accept the solution?
	if (err <= 1.0 \|\| h <= h_min){
	t += h;
	nst++;

	for (int k = 0; k < neq; k++)
	y[k] = yout[k];
	}

	// Adjust h for the next step.
	double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );

	// Limit the adaption.
	hfac = fmax( hfac, 1.0 / adaption_limit );
	hfac = fmin( hfac, adaption_limit );

	// Apply the adaption factor...
	h *= hfac;

	// Limit h.
	h = fmin( h, h_max );
	h = fmax( h, h_min );

	// Stretch h if we're within 5% ... and we didn't just fail.
	if (err <= 1.0 && (t + 1.05*h) > t_stop)
	h = t_stop - t;

	// And don't overshoot the end.
	if (t + h > t_stop)
	h = t_stop - t;

	nit++;
	nfe += 6;

	if (maxIters && nit > maxIters){
	//fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
	counter.nFails ++;
	break;
	// We should set an error here so that the solution is not used!
	}

	} // end while

	counter.nSteps += nst;
	counter.nIters += nit;
	counter.nFuncs += nfe;

	//printf("id= %d nst= %d nit= %d\n", id, nst, nit);
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	int FixRxKokkos<DeviceType>::rhs(double t, const double y, double dydt, void *params) const
	{
	// Use the sparse format instead.
	if (useSparseKinetics)
	return this->rhs_sparse( t, y, dydt, params);
	else
	return this->rhs_dense ( t, y, dydt, params);
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	int FixRxKokkos<DeviceType>::rhs_dense(double t, const double y, double dydt, void *params) const
	{
	UserRHSData userData = (UserRHSData ) params;

	double *rxnRateLaw = userData->rxnRateLaw;
	double *kFor = userData->kFor;

	//const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
	//const int nspecies = atom->nspecies_dpd;

	for(int ispecies=0; ispecies<nspecies; ispecies++)
	dydt[ispecies] = 0.0;

	// Construct the reaction rate laws
	for(int jrxn=0; jrxn<nreactions; jrxn++){
	double rxnRateLawForward = kFor[jrxn];

	for(int ispecies=0; ispecies<nspecies; ispecies++){
	const double concentration = y[ispecies]/VDPD;
	rxnRateLawForward *= pow( concentration, d_kineticsData.stoichReactants(jrxn,ispecies) );
	//rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
	}
	rxnRateLaw[jrxn] = rxnRateLawForward;
	}

	// Construct the reaction rates for each species
	for(int ispecies=0; ispecies<nspecies; ispecies++)
	for(int jrxn=0; jrxn<nreactions; jrxn++)
	{
	dydt[ispecies] += d_kineticsData.stoich(jrxn,ispecies) VDPDrxnRateLaw[jrxn];
	//dydt[ispecies] += stoich[jrxn][ispecies]VDPDrxnRateLaw[jrxn];
	}

	return 0;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double y, double dydt, void *v_params) const
	{
	UserRHSData userData = (UserRHSData ) v_params;

	//const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;

	#define kFor (userData->kFor)
	#define kRev (NULL)
	#define rxnRateLaw (userData->rxnRateLaw)
	#define conc (dydt)
	#define maxReactants (this->sparseKinetics_maxReactants)
	#define maxSpecies (this->sparseKinetics_maxSpecies)
	#define nuk (this->d_kineticsData.nuk)
	#define nu (this->d_kineticsData.nu)
	#define inu (this->d_kineticsData.inu)
	#define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
	&& this->d_kineticsData.isIntegral(idx) )

	for (int k = 0; k < nspecies; ++k)
	conc[k] = y[k] / VDPD;

	// Construct the reaction rate laws
	for (int i = 0; i < nreactions; ++i)
	{
	double rxnRateLawForward;
	if (isIntegral(i)){
	rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) );
	for (int kk = 1; kk < maxReactants; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	rxnRateLawForward *= powint( conc[k], inu(i,kk) );
	}
	} else {
	rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) );
	for (int kk = 1; kk < maxReactants; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	rxnRateLawForward *= pow( conc[k], nu(i,kk) );
	}
	}

	rxnRateLaw[i] = rxnRateLawForward;
	}

	// Construct the reaction rates for each species from the
	// Stoichiometric matrix and ROP vector.
	for (int k = 0; k < nspecies; ++k)
	dydt[k] = 0.0;

	for (int i = 0; i < nreactions; ++i){
	// Reactants ...
	dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
	for (int kk = 1; kk < maxReactants; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	dydt[k] -= nu(i,kk) * rxnRateLaw[i];
	}

	// Products ...
	dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
	for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	dydt[k] += nu(i,kk) * rxnRateLaw[i];
	}
	}

	// Add in the volume factor to convert to the proper units.
	for (int k = 0; k < nspecies; ++k)
	dydt[k] *= VDPD;

	#undef kFor
	#undef kRev
	#undef rxnRateLaw
	#undef conc
	#undef maxReactants
	#undef maxSpecies
	#undef nuk
	#undef nu
	#undef inu
	#undef isIntegral
	//#undef invalidIndex

	return 0;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	int FixRxKokkos<DeviceType>::k_rhs(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
	{
	// Use the sparse format instead.
	if (useSparseKinetics)
	return this->k_rhs_sparse( t, y, dydt, userData);
	else
	return this->k_rhs_dense ( t, y, dydt, userData);
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	int FixRxKokkos<DeviceType>::k_rhs_dense(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
	{
	#define rxnRateLaw (userData.rxnRateLaw)
	#define kFor (userData.kFor )

	//const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
	//const int nspecies = atom->nspecies_dpd;

	for(int ispecies=0; ispecies<nspecies; ispecies++)
	dydt[ispecies] = 0.0;

	// Construct the reaction rate laws
	for(int jrxn=0; jrxn<nreactions; jrxn++){
	double rxnRateLawForward = kFor[jrxn];

	for(int ispecies=0; ispecies<nspecies; ispecies++){
	const double concentration = y[ispecies]/VDPD;
	rxnRateLawForward *= pow( concentration, d_kineticsData.stoichReactants(jrxn,ispecies) );
	//rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
	}
	rxnRateLaw[jrxn] = rxnRateLawForward;
	}

	// Construct the reaction rates for each species
	for(int ispecies=0; ispecies<nspecies; ispecies++)
	for(int jrxn=0; jrxn<nreactions; jrxn++)
	{
	dydt[ispecies] += d_kineticsData.stoich(jrxn,ispecies) VDPDrxnRateLaw[jrxn];
	//dydt[ispecies] += stoich[jrxn][ispecies]VDPDrxnRateLaw[jrxn];
	}

	#undef rxnRateLaw
	#undef kFor

	return 0;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <typename VectorType, typename UserDataType>
	int FixRxKokkos<DeviceType>::k_rhs_sparse(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
	{
	#define kFor (userData.kFor)
	#define kRev (NULL)
	#define rxnRateLaw (userData.rxnRateLaw)
	#define conc (dydt)
	#define maxReactants (this->sparseKinetics_maxReactants)
	#define maxSpecies (this->sparseKinetics_maxSpecies)
	#define nuk (this->d_kineticsData.nuk)
	#define nu (this->d_kineticsData.nu)
	#define inu (this->d_kineticsData.inu)
	#define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
	&& this->d_kineticsData.isIntegral(idx) )

	for (int k = 0; k < nspecies; ++k)
	conc[k] = y[k] / VDPD;

	// Construct the reaction rate laws
	for (int i = 0; i < nreactions; ++i)
	{
	double rxnRateLawForward;
	if (isIntegral(i)){
	rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) );
	for (int kk = 1; kk < maxReactants; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	rxnRateLawForward *= powint( conc[k], inu(i,kk) );
	}
	} else {
	rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) );
	for (int kk = 1; kk < maxReactants; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	rxnRateLawForward *= pow( conc[k], nu(i,kk) );
	}
	}

	rxnRateLaw[i] = rxnRateLawForward;
	}

	// Construct the reaction rates for each species from the
	// Stoichiometric matrix and ROP vector.
	for (int k = 0; k < nspecies; ++k)
	dydt[k] = 0.0;

	for (int i = 0; i < nreactions; ++i){
	// Reactants ...
	dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
	for (int kk = 1; kk < maxReactants; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	dydt[k] -= nu(i,kk) * rxnRateLaw[i];
	}

	// Products ...
	dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
	for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
	const int k = nuk(i,kk);
	if (k == SparseKinetics_invalidIndex) break;
	//if (k != SparseKinetics_invalidIndex)
	dydt[k] += nu(i,kk) * rxnRateLaw[i];
	}
	}

	// Add in the volume factor to convert to the proper units.
	for (int k = 0; k < nspecies; ++k)
	dydt[k] *= VDPD;

	#undef kFor
	#undef kRev
	#undef rxnRateLaw
	#undef conc
	#undef maxReactants
	#undef maxSpecies
	#undef nuk
	#undef nu
	#undef inu
	#undef isIntegral
	//#undef invalidIndex

	return 0;
	}

	/* ---------------------------------------------------------------------- */

	/*template <typename DeviceType>
	template <typename SolverType>
	KOKKOS_INLINE_FUNCTION
	void FixRxKokkos<DeviceType>::operator()(SolverType, const int &i) const
	{
	if (atom->mask[i] & groupbit)
	{
	double rwork = new double[8nspecies];

	UserRHSData userData;
	userData.kFor = new double[nreactions];
	userData.rxnRateLaw = new double[nreactions];

	int ode_counter[4] = { 0 };

	const double theta = (localTempFlag) ? dpdThetaLocal[i] : atom->dpdTheta[i];

	//Compute the reaction rate constants
	for (int irxn = 0; irxn < nreactions; irxn++)
	{
	if (SolverType::setToZero)
	userData.kFor[irxn] = 0.0;
	else
	userData.kFor[irxn] = Arr[irxn]pow(theta,nArr[irxn])exp(-Ea[irxn]/force->boltz/theta);
	}

	if (odeIntegrationFlag == ODE_LAMMPS_RK4)
	rk4(i, rwork, &userData);
	else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
	rkf45(i, rwork, &userData, ode_counter);

	delete [] rwork;
	delete [] userData.kFor;
	delete [] userData.rxnRateLaw;
	}
	} */

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::create_kinetics_data(void)
	{
	//printf("Inside FixRxKokkos::create_kinetics_data\n");

	memory->create_kokkos( d_kineticsData.Arr, h_kineticsData.Arr, nreactions, "KineticsType::Arr");
	memory->create_kokkos( d_kineticsData.nArr, h_kineticsData.nArr, nreactions, "KineticsType::nArr");
	memory->create_kokkos( d_kineticsData.Ea, h_kineticsData.Ea, nreactions, "KineticsType::Ea");

	for (int i = 0; i < nreactions; ++i)
	{
	h_kineticsData.Arr[i] = Arr[i];
	h_kineticsData.nArr[i] = nArr[i];
	h_kineticsData.Ea[i] = Ea[i];
	}

	Kokkos::deep_copy( d_kineticsData.Arr, h_kineticsData.Arr );
	Kokkos::deep_copy( d_kineticsData.nArr, h_kineticsData.nArr );
	Kokkos::deep_copy( d_kineticsData.Ea, h_kineticsData.Ea );

	if (useSparseKinetics)
	{

	memory->create_kokkos( d_kineticsData.nu , h_kineticsData.nu , nreactions, sparseKinetics_maxSpecies, "KineticsType::nu");
	memory->create_kokkos( d_kineticsData.nuk, h_kineticsData.nuk, nreactions, sparseKinetics_maxSpecies, "KineticsType::nuk");

	for (int i = 0; i < nreactions; ++i)
	for (int k = 0; k < sparseKinetics_maxSpecies; ++k)
	{
	h_kineticsData.nu (i,k) = sparseKinetics_nu [i][k];
	h_kineticsData.nuk(i,k) = sparseKinetics_nuk[i][k];
	}

	Kokkos::deep_copy( d_kineticsData.nu, h_kineticsData.nu );
	Kokkos::deep_copy( d_kineticsData.nuk, h_kineticsData.nuk );

	if (SparseKinetics_enableIntegralReactions)
	{
	memory->create_kokkos( d_kineticsData.inu, h_kineticsData.inu, nreactions, sparseKinetics_maxSpecies, "KineticsType::inu");
	memory->create_kokkos( d_kineticsData.isIntegral, h_kineticsData.isIntegral, nreactions, "KineticsType::isIntegral");

	for (int i = 0; i < nreactions; ++i)
	{
	h_kineticsData.isIntegral(i) = sparseKinetics_isIntegralReaction[i];

	for (int k = 0; k < sparseKinetics_maxSpecies; ++k)
	h_kineticsData.inu(i,k) = sparseKinetics_inu[i][k];
	}

	Kokkos::deep_copy( d_kineticsData.inu, h_kineticsData.inu );
	Kokkos::deep_copy( d_kineticsData.isIntegral, h_kineticsData.isIntegral );
	}
	}

	//else
	//{

	// Dense option
	memory->create_kokkos( d_kineticsData.stoich, h_kineticsData.stoich, nreactions, nspecies, "KineticsType::stoich");
	memory->create_kokkos( d_kineticsData.stoichReactants, h_kineticsData.stoichReactants, nreactions, nspecies, "KineticsType::stoichReactants");
	memory->create_kokkos( d_kineticsData.stoichProducts, h_kineticsData.stoichProducts, nreactions, nspecies, "KineticsType::stoichProducts");

	for (int i = 0; i < nreactions; ++i)
	for (int k = 0; k < nspecies; ++k)
	{
	h_kineticsData.stoich(i,k) = stoich[i][k];
	h_kineticsData.stoichReactants(i,k) = stoichReactants[i][k];
	h_kineticsData.stoichProducts(i,k) = stoichProducts[i][k];
	}

	Kokkos::deep_copy( d_kineticsData.stoich, h_kineticsData.stoich );
	Kokkos::deep_copy( d_kineticsData.stoichReactants, h_kineticsData.stoichReactants );
	Kokkos::deep_copy( d_kineticsData.stoichProducts, h_kineticsData.stoichProducts );

	//}

	update_kinetics_data = false;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
	{
	//printf("Inside FixRxKokkos<DeviceType>::setup_pre_force restartFlag= %d\n", my_restartFlag);

	if (my_restartFlag)
	my_restartFlag = 0;
	else
	this->solve_reactions( vflag, false );
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::pre_force(int vflag)
	{
	//printf("Inside FixRxKokkos<DeviceType>::pre_force localTempFlag= %d\n", localTempFlag);

	this->solve_reactions( vflag, true );
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	KOKKOS_INLINE_FUNCTION
	void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_zeroCounterViews, const int& i) const
	{
	d_diagnosticCounterPerODEnSteps(i) = 0;
	d_diagnosticCounterPerODEnFuncs(i) = 0;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <bool ZERO_RATES>
	KOKKOS_INLINE_FUNCTION
	void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES>, const int& i, CounterType& counter) const
	{
	if (d_mask(i) & groupbit)
	{
	StridedArrayType<double,1> y( d_scratchSpace.ptr_on_device() + scratchSpaceSize * i );
	StridedArrayType<double,1> rwork( &y[nspecies] );

	UserRHSDataKokkos<1> userData;
	userData.kFor.m_data = &( rwork[7*nspecies] );
	userData.rxnRateLaw.m_data = &( userData.kFor[ nreactions ] );

	CounterType counter_i;

	const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);

	//Compute the reaction rate constants
	for (int irxn = 0; irxn < nreactions; irxn++)
	{
	if (ZERO_RATES)
	userData.kFor[irxn] = 0.0;
	else
	{
	userData.kFor[irxn] = d_kineticsData.Arr(irxn) *
	pow(theta, d_kineticsData.nArr(irxn)) *
	exp(-d_kineticsData.Ea(irxn) / boltz / theta);
	}
	}

	// Update ConcOld and initialize the ODE solution vector y[].
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	{
	const double tmp = d_dvector(ispecies, i);
	d_dvector(ispecies+nspecies, i) = tmp;
	y[ispecies] = tmp;
	}

	// Solver the ODE system.
	if (odeIntegrationFlag == ODE_LAMMPS_RK4)
	{
	k_rk4(t_stop, y, rwork, userData);
	}
	else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
	{
	k_rkf45(nspecies, t_stop, y, rwork, userData, counter_i);

	if (diagnosticFrequency == 1)
	{
	d_diagnosticCounterPerODEnSteps(i) = counter_i.nSteps;
	d_diagnosticCounterPerODEnFuncs(i) = counter_i.nFuncs;
	}
	}

	// Store the solution back in dvector.
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	{
	if (y[ispecies] < -1.0e-10)
	{
	//error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
	k_error_flag.d_view() = 2;
	// This should be an atomic update.
	}
	else if (y[ispecies] < MY_EPSILON)
	y[ispecies] = 0.0;

	d_dvector(ispecies,i) = y[ispecies];
	}

	// Update the iteration statistics counter. Is this unique for each iteration?
	counter += counter_i;

	} // if
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreForce)
	{
	//printf("Inside FixRxKokkos<DeviceType>::solve_reactions localTempFlag= %d isPreForce= %s\n", localTempFlag, isPreForce ? "True" : "false");

	copymode = 1;

	if (update_kinetics_data)
	create_kinetics_data();

	TimerType timer_start = getTimeStamp();

	//const int nlocal = atom->nlocal;
	this->nlocal = atom->nlocal;
	const int nghost = atom->nghost;
	const int newton_pair = force->newton_pair;

	// Set the forward rates to zero if acting as setup_pre_force.
	const bool setRatesToZero = (isPreForce == false);

	if (localTempFlag)
	{
	const int count = nlocal + (newton_pair ? nghost : 0);

	if (count > k_dpdThetaLocal.d_view.dimension_0()) {
	memory->destroy_kokkos (k_dpdThetaLocal, dpdThetaLocal);
	memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
	this->d_dpdThetaLocal = k_dpdThetaLocal.d_view;
	this->h_dpdThetaLocal = k_dpdThetaLocal.h_view;
	}

	const int neighflag = lmp->kokkos->neighflag;

	#define _template_switch(_wtflag, _localTempFlag) { \
	if (neighflag == HALF) \
	if (newton_pair) \
	computeLocalTemperature<_wtflag, _localTempFlag, true , HALF> (); \
	else \
	computeLocalTemperature<_wtflag, _localTempFlag, false, HALF> (); \
	else if (neighflag == HALFTHREAD) \
	if (newton_pair) \
	computeLocalTemperature<_wtflag, _localTempFlag, true , HALFTHREAD> (); \
	else \
	computeLocalTemperature<_wtflag, _localTempFlag, false, HALFTHREAD> (); \
	else if (neighflag == FULL) \
	if (newton_pair) \
	computeLocalTemperature<_wtflag, _localTempFlag, true , FULL> (); \
	else \
	computeLocalTemperature<_wtflag, _localTempFlag, false, FULL> (); \
	}

	// Are there is no other options than wtFlag = (0)LUCY and localTempFlag = NONE : HARMONIC?
	if (localTempFlag == HARMONIC) {
	_template_switch(LUCY, HARMONIC)
	}
	else {
	_template_switch(LUCY, NONE)
	}
	#undef _template_switch
	}

	TimerType timer_localTemperature = getTimeStamp();

	// Total counters from the ODE solvers.
	CounterType TotalCounters;

	// Set data needed in the operators.
	// ...

	// Local references to the atomKK objects.
	//typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
	//typename ArrayTypes<DeviceType>::t_float_2d d_dvector = atomKK->k_dvector.view<DeviceType>();
	//typename ArrayTypes<DeviceType>::t_int_1d d_mask = atomKK->k_mask.view<DeviceType>();
	this->d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
	this->d_dvector = atomKK->k_dvector.view<DeviceType>();
	this->d_mask = atomKK->k_mask.view<DeviceType>();

	// Get up-to-date data.
	atomKK->sync( execution_space, MASK_MASK \| DVECTOR_MASK \| DPDTHETA_MASK );

	// Set some constants outside of the parallel_for
	//const double boltz = force->boltz;
	//const double t_stop = update->dt; // DPD time-step and integration length.
	this->boltz = force->boltz;
	this->t_stop = update->dt; // DPD time-step and integration length.

	// Average DPD volume. Used in the RHS function.
	this->VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;

	if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
	{
	memory->create_kokkos (k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps, nlocal, "FixRxKokkos::diagnosticCounterPerODEnSteps");
	memory->create_kokkos (k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs, nlocal, "FixRxKokkos::diagnosticCounterPerODEnFuncs");

	d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.d_view;
	d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.d_view;

	Kokkos::parallel_for ( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroCounterViews>(0,nlocal), *this);
	//Kokkos::parallel_for ( nlocal,
	// LAMMPS_LAMBDA(const int i)
	// {
	// d_diagnosticCounterPerODEnSteps(i) = 0;
	// d_diagnosticCounterPerODEnFuncs(i) = 0;
	// }
	// );
	}

	// Error flag for any failures.
	//DAT::tdual_int_scalar k_error_flag("pair:error_flag");

	// Initialize and sync the device flag.
	k_error_flag.h_view() = 0;
	k_error_flag.template modify<LMPHostType>();
	k_error_flag.template sync<DeviceType>();

	// Create scratch array space.
	//const size_t scratchSpaceSize = (8nspecies + 2nreactions);
	this->scratchSpaceSize = (8nspecies + 2nreactions);
	//double scratchSpace = new double[ scratchSpaceSize nlocal ];

	//typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace("d_scratchSpace", scratchSpaceSize * nlocal);
	if (nlocal*scratchSpaceSize > d_scratchSpace.dimension_0()) {
	memory->destroy_kokkos (d_scratchSpace);
	memory->create_kokkos (d_scratchSpace, nlocal*scratchSpaceSize, "FixRxKokkos::d_scratchSpace");
	}

	#if 0
	Kokkos::parallel_reduce( nlocal, LAMMPS_LAMBDA(int i, CounterType &counter)
	{
	if (d_mask(i) & groupbit)
	{
	//double y = new double[8nspecies];
	//double *rwork = y + nspecies;

	//StridedArrayType<double,1> _y( y );
	//StridedArrayType<double,1> _rwork( rwork );

	StridedArrayType<double,1> y( d_scratchSpace.ptr_on_device() + scratchSpaceSize * i );
	StridedArrayType<double,1> rwork( &y[nspecies] );

	//UserRHSData userData;
	//userData.kFor = new double[nreactions];
	//userData.rxnRateLaw = new double[nreactions];

	//UserRHSDataKokkos<1> userDataKokkos;
	//userDataKokkos.kFor.m_data = userData.kFor;
	//userDataKokkos.rxnRateLaw.m_data = userData.rxnRateLaw;

	UserRHSDataKokkos<1> userData;
	userData.kFor.m_data = &( rwork[7*nspecies] );
	userData.rxnRateLaw.m_data = &( userData.kFor[ nreactions ] );

	CounterType counter_i;

	const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);

	//Compute the reaction rate constants
	for (int irxn = 0; irxn < nreactions; irxn++)
	{
	if (setRatesToZero)
	userData.kFor[irxn] = 0.0;
	else
	{
	userData.kFor[irxn] = d_kineticsData.Arr(irxn) *
	pow(theta, d_kineticsData.nArr(irxn)) *
	exp(-d_kineticsData.Ea(irxn) / boltz / theta);
	}
	}

	// Update ConcOld and initialize the ODE solution vector y[].
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	{
	const double tmp = d_dvector(ispecies, i);
	d_dvector(ispecies+nspecies, i) = tmp;
	y[ispecies] = tmp;
	}

	// Solver the ODE system.
	if (odeIntegrationFlag == ODE_LAMMPS_RK4)
	{
	k_rk4(t_stop, y, rwork, userData);
	}
	else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
	{
	k_rkf45(nspecies, t_stop, y, rwork, userData, counter_i);

	if (diagnosticFrequency == 1)
	{
	d_diagnosticCounterPerODEnSteps(i) = counter_i.nSteps;
	d_diagnosticCounterPerODEnFuncs(i) = counter_i.nFuncs;
	}
	}

	// Store the solution back in dvector.
	for (int ispecies = 0; ispecies < nspecies; ispecies++)
	{
	if (y[ispecies] < -1.0e-10)
	{
	//error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
	k_error_flag.d_view() = 2;
	// This should be an atomic update.
	}
	else if (y[ispecies] < MY_EPSILON)
	y[ispecies] = 0.0;

	d_dvector(ispecies,i) = y[ispecies];
	}

	//delete [] y;
	//delete [] userData.kFor;
	//delete [] userData.rxnRateLaw;

	// Update the iteration statistics counter. Is this unique for each iteration?
	counter += counter_i;

	} // if
	} // parallel_for lambda-body

	, TotalCounters // reduction value for all iterations.
	);
	#else
	if (setRatesToZero)
	Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<true > >(0,nlocal), *this, TotalCounters);
	else
	Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<false> >(0,nlocal), *this, TotalCounters);
	#endif

	TimerType timer_ODE = getTimeStamp();

	// Check the error flag for any failures.
	k_error_flag.template modify<DeviceType>();
	k_error_flag.template sync<LMPHostType>();
	if (k_error_flag.h_view() == 2)
	error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");

	// Signal that dvector has been modified on this execution space.
	atomKK->modified( execution_space, DVECTOR_MASK );

	// Communicate the updated species data to all nodes
	atomKK->sync ( Host, DVECTOR_MASK );

	comm->forward_comm_fix(this);

	atomKK->modified ( Host, DVECTOR_MASK );

	TimerType timer_stop = getTimeStamp();

	double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);

	//printf("me= %d kokkos total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
	// getElapsedTime(timer_start, timer_stop),
	// getElapsedTime(timer_start, timer_localTemperature),
	// getElapsedTime(timer_localTemperature, timer_ODE),
	// getElapsedTime(timer_ODE, timer_stop), nlocal, TotalCounters.nFuncs, TotalCounters.nSteps);

	// Warn the user if a failure was detected in the ODE solver.
	if (TotalCounters.nFails > 0){
	char sbuf[128];
	sprintf(sbuf,"in FixRX::pre_force, ODE solver failed for %d atoms.", TotalCounters.nFails);
	error->warning(FLERR, sbuf);
	}

	// Compute and report ODE diagnostics, if requested.
	if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0)
	{
	// Update the counters.
	diagnosticCounter[StepSum] += TotalCounters.nSteps;
	diagnosticCounter[FuncSum] += TotalCounters.nFuncs;
	diagnosticCounter[TimeSum] += time_ODE;
	diagnosticCounter[AtomSum] += nlocal;
	diagnosticCounter[numDiagnosticCounters-1] ++;

	if ( (diagnosticFrequency > 0 &&
	((update->ntimestep - update->firststep) % diagnosticFrequency) == 0) \|\|
	(diagnosticFrequency < 0 && update->ntimestep == update->laststep) )
	this->odeDiagnostics();
	}

	copymode = 0;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::odeDiagnostics(void)
	{
	TimerType timer_start = getTimeStamp();

	// Compute:
	// 1) Average # of ODE integrator steps and RHS evaluations per atom globally.
	// 2) RMS # of ...
	// 3) Average # of ODE steps and RHS evaluations per MPI task.
	// 4) RMS # of ODE steps and RHS evaluations per MPI task.
	// 5) MAX # of ODE steps and RHS evaluations per MPI task.
	//
	// ... 1,2 are for ODE control diagnostics.
	// ... 3-5 are for load balancing diagnostics.
	//
	// To do this, we'll need to
	// a) Allreduce (sum) the sum of nSteps / nFuncs. Dividing by atom->natoms
	// gives the avg # of steps/funcs per atom globally.
	// b) Reduce (sum) to root the sum of squares of the differences.
	// i) Sum_i (steps_i - avg_steps_global)^2
	// ii) Sum_i (funcs_i - avg_funcs_global)^2
	// iii) (avg_steps_local - avg_steps_global)^2
	// iv) (avg_funcs_local - avg_funcs_global)^2

	const int numCounters = numDiagnosticCounters-1;

	// # of time-steps for averaging.
	const int nTimes = this->diagnosticCounter[numDiagnosticCounters-1];

	// # of ODE's per time-step (on average).
	//const int nODEs = this->diagnosticCounter[AtomSum] / nTimes;

	// Sum up the sums from each task.
	double sums[numCounters];
	double my_vals[numCounters];
	double max_per_proc[numCounters];
	double min_per_proc[numCounters];

	// Compute counters per dpd time-step.
	for (int i = 0; i < numCounters; ++i){
	my_vals[i] = this->diagnosticCounter[i] / nTimes;
	//printf("my sum[%d] = %f %d\n", i, my_vals[i], comm->me);
	}

	MPI_Allreduce (my_vals, sums, numCounters, MPI_DOUBLE, MPI_SUM, world);

	MPI_Reduce (my_vals, max_per_proc, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
	MPI_Reduce (my_vals, min_per_proc, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);

	const double nODEs = sums[numCounters-1];

	double avg_per_atom[numCounters], avg_per_proc[numCounters];

	// Averages per-ODE and per-proc per time-step.
	for (int i = 0; i < numCounters; ++i){
	avg_per_atom[i] = sums[i] / nODEs;
	avg_per_proc[i] = sums[i] / comm->nprocs;
	}

	// Sum up the differences from each task.
	double sum_sq[2*numCounters];
	double my_sum_sq[2*numCounters];
	for (int i = 0; i < numCounters; ++i){
	double diff_i = my_vals[i] - avg_per_proc[i];
	my_sum_sq[i] = diff_i * diff_i;
	}

	double max_per_ODE[numCounters], min_per_ODE[numCounters];

	// Process the per-ODE RMS of the # of steps/funcs
	if (diagnosticFrequency == 1)
	{
	h_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.h_view;
	h_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.h_view;

	Kokkos::deep_copy( h_diagnosticCounterPerODEnSteps, d_diagnosticCounterPerODEnSteps );
	Kokkos::deep_copy( h_diagnosticCounterPerODEnFuncs, d_diagnosticCounterPerODEnFuncs );

	double my_max[numCounters], my_min[numCounters];

	//const int nlocal = atom->nlocal;
	nlocal = atom->nlocal;
	HAT::t_int_1d h_mask = atomKK->k_mask.h_view;

	for (int i = 0; i < numCounters; ++i)
	{
	my_sum_sq[i+numCounters] = 0;
	my_max[i] = 0;
	my_min[i] = DBL_MAX;
	}

	for (int j = 0; j < nlocal; ++j)
	if (h_mask(j) & groupbit)
	{
	int nSteps = h_diagnosticCounterPerODEnSteps(j);
	double diff_nSteps = double( nSteps ) - avg_per_atom[StepSum];
	my_sum_sq[StepSum+numCounters] += diff_nSteps*diff_nSteps;
	my_max[StepSum] = std::max( my_max[StepSum], (double)nSteps );
	my_min[StepSum] = std::min( my_min[StepSum], (double)nSteps );

	int nFuncs = h_diagnosticCounterPerODEnFuncs(j);
	double diff_nFuncs = double( nFuncs ) - avg_per_atom[FuncSum];
	my_sum_sq[FuncSum+numCounters] += diff_nFuncs*diff_nFuncs;

	my_max[FuncSum] = std::max( my_max[FuncSum], (double)nFuncs );
	my_min[FuncSum] = std::min( my_min[FuncSum], (double)nFuncs );
	}

	memory->destroy_kokkos( k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps );
	memory->destroy_kokkos( k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs );

	MPI_Reduce (my_sum_sq, sum_sq, 2*numCounters, MPI_DOUBLE, MPI_SUM, 0, world);

	MPI_Reduce (my_max, max_per_ODE, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
	MPI_Reduce (my_min, min_per_ODE, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
	}
	else
	MPI_Reduce (my_sum_sq, sum_sq, numCounters, MPI_DOUBLE, MPI_SUM, 0, world);

	TimerType timer_stop = getTimeStamp();
	double time_local = getElapsedTime( timer_start, timer_stop );

	if (comm->me == 0){
	char smesg[128];

	#define print_mesg(smesg) {\
	if (screen) fprintf(screen,"%s\n", smesg); \
	if (logfile) fprintf(logfile,"%s\n", smesg); }

	sprintf(smesg, "FixRX::ODE Diagnostics: # of iters \|# of rhs evals\| run-time (sec) \| # atoms");
	print_mesg(smesg);

	sprintf(smesg, " AVG per ODE : %-12.5g \| %-12.5g \| %-12.5g", avg_per_atom[0], avg_per_atom[1], avg_per_atom[2]);
	print_mesg(smesg);

	// only valid for single time-step!
	if (diagnosticFrequency == 1){
	double rms_per_ODE[numCounters];
	for (int i = 0; i < numCounters; ++i)
	rms_per_ODE[i] = sqrt( sum_sq[i+numCounters] / nODEs );

	sprintf(smesg, " RMS per ODE : %-12.5g \| %-12.5g ", rms_per_ODE[0], rms_per_ODE[1]);
	print_mesg(smesg);

	sprintf(smesg, " MAX per ODE : %-12.5g \| %-12.5g ", max_per_ODE[0], max_per_ODE[1]);
	print_mesg(smesg);

	sprintf(smesg, " MIN per ODE : %-12.5g \| %-12.5g ", min_per_ODE[0], min_per_ODE[1]);
	print_mesg(smesg);
	}

	sprintf(smesg, " AVG per Proc : %-12.5g \| %-12.5g \| %-12.5g \| %-12.5g", avg_per_proc[StepSum], avg_per_proc[FuncSum], avg_per_proc[TimeSum], avg_per_proc[AtomSum]);
	print_mesg(smesg);

	if (comm->nprocs > 1){
	double rms_per_proc[numCounters];
	for (int i = 0; i < numCounters; ++i)
	rms_per_proc[i] = sqrt( sum_sq[i] / comm->nprocs );

	sprintf(smesg, " RMS per Proc : %-12.5g \| %-12.5g \| %-12.5g \| %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2], rms_per_proc[AtomSum]);
	print_mesg(smesg);

	sprintf(smesg, " MAX per Proc : %-12.5g \| %-12.5g \| %-12.5g \| %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2], max_per_proc[AtomSum]);
	print_mesg(smesg);

	sprintf(smesg, " MIN per Proc : %-12.5g \| %-12.5g \| %-12.5g \| %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2], min_per_proc[AtomSum]);
	print_mesg(smesg);
	}

	sprintf(smesg, " AVG'd over %d time-steps", nTimes);
	print_mesg(smesg);
	sprintf(smesg, " AVG'ing took %g sec", time_local);
	print_mesg(smesg);

	#undef print_mesg

	}

	// Reset the counters.
	for (int i = 0; i < numDiagnosticCounters; ++i)
	diagnosticCounter[i] = 0;

	return;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	KOKKOS_INLINE_FUNCTION
	void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_zeroTemperatureViews, const int& i) const
	{
	d_sumWeights(i) = 0.0;
	d_dpdThetaLocal(i) = 0.0;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
	KOKKOS_INLINE_FUNCTION
	void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_firstPairOperator<WT_FLAG,NEWTON_PAIR,NEIGHFLAG>, const int& ii) const
	{
	// Create an atomic view of sumWeights and dpdThetaLocal. Only needed
	// for Half/thread scenarios.
	typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, typename DAT::t_efloat_1d::device_type, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;

	AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
	AtomicViewType a_sumWeights = d_sumWeights;

	// Local scalar accumulators.
	double i_dpdThetaLocal = 0.0;
	double i_sumWeights = 0.0;

	const int i = d_ilist(ii);

	const double xtmp = d_x(i,0);
	const double ytmp = d_x(i,1);
	const double ztmp = d_x(i,2);
	const int itype = d_type(i);

	const int jnum = d_numneigh(i);

	for (int jj = 0; jj < jnum; jj++)
	{
	const int j = (d_neighbors(i,jj) & NEIGHMASK);
	const int jtype = d_type(j);

	const double delx = xtmp - d_x(j,0);
	const double dely = ytmp - d_x(j,1);
	const double delz = ztmp - d_x(j,2);
	const double rsq = delxdelx + delydely + delz*delz;

	const double cutsq_ij = d_cutsq(itype,jtype);

	if (rsq < cutsq_ij)
	{
	const double rcut = sqrt( cutsq_ij );
	double rij = sqrt(rsq);
	double ratio = rij/rcut;

	double wij = 0.0;

	// Lucy's Weight Function
	if (WT_FLAG == LUCY)
	{
	wij = (1.0+3.0ratio) (1.0-ratio)(1.0-ratio)(1.0-ratio);
	i_dpdThetaLocal += wij / d_dpdTheta(j);
	if ((NEIGHFLAG==HALF \|\| NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR \|\| j < nlocal))
	a_dpdThetaLocal(j) += wij / d_dpdTheta(i);
	}

	i_sumWeights += wij;
	if ((NEIGHFLAG==HALF \|\| NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR \|\| j < nlocal))
	a_sumWeights(j) += wij;
	}
	}

	// Update, don't assign, the array value (because another iteration may have hit it).
	a_dpdThetaLocal(i) += i_dpdThetaLocal;
	a_sumWeights(i) += i_sumWeights;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <int WT_FLAG, int LOCAL_TEMP_FLAG>
	KOKKOS_INLINE_FUNCTION
	void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_2ndPairOperator<WT_FLAG,LOCAL_TEMP_FLAG>, const int& i) const
	{
	double wij = 0.0;

	// Lucy Weight Function
	if (WT_FLAG == LUCY)
	{
	wij = 1.0;
	d_dpdThetaLocal(i) += wij / d_dpdTheta(i);
	}
	d_sumWeights(i) += wij;

	// Normalized local temperature
	d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);

	if (LOCAL_TEMP_FLAG == HARMONIC)
	d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
	void FixRxKokkos<DeviceType>::computeLocalTemperature()
	{
	//typename ArrayTypes<DeviceType>::t_x_array_randomread d_x = atomKK->k_x.view<DeviceType>();
	//typename ArrayTypes<DeviceType>::t_int_1d_randomread d_type = atomKK->k_type.view<DeviceType>();
	//typename ArrayTypes<DeviceType>::t_efloat_1d d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
	d_x = atomKK->k_x.view<DeviceType>();
	d_type = atomKK->k_type.view<DeviceType>();
	d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();

	atomKK->sync(execution_space, X_MASK \| TYPE_MASK \| DPDTHETA_MASK );

	//const int nlocal = atom->nlocal;
	nlocal = atom->nlocal;
	const int nghost = atom->nghost;

	//printf("Inside FixRxKokkos::computeLocalTemperature: %d %d %d %d %d %d %d\n", WT_FLAG, LOCAL_TEMP_FLAG, NEWTON_PAIR, (int)lmp->kokkos->neighflag, NEIGHFLAG, nlocal, nghost);

	// Pull from pairDPDE. The pairDPDEKK objects are protected so recreate here for now.
	//pairDPDEKK->k_cutsq.template sync<DeviceType>();
	//typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq = pairDPDEKK->k_cutsq.template view<DeviceType();

	//!< Copies pulled from pairDPDE for local use since pairDPDEKK's objects are protected.
	//typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
	//typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
	//double **h_cutsq;

	{
	const int ntypes = atom->ntypes;

	//memory->create_kokkos (k_cutsq, h_cutsq, ntypes+1, ntypes+1, "pair:cutsq");
	if (ntypes+1 > k_cutsq.dimension_0()) {
	memory->destroy_kokkos (k_cutsq);
	memory->create_kokkos (k_cutsq, ntypes+1, ntypes+1, "FixRxKokkos::k_cutsq");
	d_cutsq = k_cutsq.template view<DeviceType>();
	}

	for (int i = 1; i <= ntypes; ++i)
	for (int j = i; j <= ntypes; ++j)
	{
	k_cutsq.h_view(i,j) = pairDPDE->cutsq[i][j];
	k_cutsq.h_view(j,i) = k_cutsq.h_view(i,j);
	}

	k_cutsq.template modify<LMPHostType>();
	k_cutsq.template sync<DeviceType>();
	}

	// Initialize the local temperature weight array
	int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);

	//memory->create_kokkos (k_sumWeights, sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
	if (sumWeightsCt > k_sumWeights.d_view.dimension_0()) {
	memory->destroy_kokkos(k_sumWeights, sumWeights);
	memory->create_kokkos (k_sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
	d_sumWeights = k_sumWeights.d_view;
	h_sumWeights = k_sumWeights.h_view;
	}

	// Initialize the accumulator to zero ...
	//Kokkos::parallel_for (sumWeightsCt,
	// LAMMPS_LAMBDA(const int i)
	// {
	// d_sumWeights(i) = 0.0;
	// }
	// );

	Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroTemperatureViews>(0, sumWeightsCt), *this);

	// Local list views. (This isn't working!)
	NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
	if (not(list->kokkos))
	error->one(FLERR,"list is not a Kokkos list\n");

	//typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors = k_list->d_neighbors;
	//typename ArrayTypes<DeviceType>::t_int_1d d_ilist = k_list->d_ilist;
	//typename ArrayTypes<DeviceType>::t_int_1d d_numneigh = k_list->d_numneigh;
	d_neighbors = k_list->d_neighbors;
	d_ilist = k_list->d_ilist;
	d_numneigh = k_list->d_numneigh;

	const int inum = list->inum;

	// loop over neighbors of my atoms
	#if 0
	Kokkos::parallel_for ( inum,
	LAMMPS_LAMBDA(const int ii)
	{
	// Create an atomic view of sumWeights and dpdThetaLocal. Only needed
	// for Half/thread scenarios.
	//typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
	typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, typename DAT::t_efloat_1d::device_type, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;

	AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
	AtomicViewType a_sumWeights = d_sumWeights;

	// Local scalar accumulators.
	double i_dpdThetaLocal = 0.0;
	double i_sumWeights = 0.0;

	const int i = d_ilist(ii);

	const double xtmp = d_x(i,0);
	const double ytmp = d_x(i,1);
	const double ztmp = d_x(i,2);
	const int itype = d_type(i);

	const int jnum = d_numneigh(i);

	for (int jj = 0; jj < jnum; jj++)
	{
	const int j = (d_neighbors(i,jj) & NEIGHMASK);
	const int jtype = d_type(j);

	const double delx = xtmp - d_x(j,0);
	const double dely = ytmp - d_x(j,1);
	const double delz = ztmp - d_x(j,2);
	const double rsq = delxdelx + delydely + delz*delz;

	const double cutsq_ij = d_cutsq(itype,jtype);

	if (rsq < cutsq_ij)
	{
	const double rcut = sqrt( cutsq_ij );
	double rij = sqrt(rsq);
	double ratio = rij/rcut;

	double wij = 0.0;

	// Lucy's Weight Function
	if (WT_FLAG == LUCY)
	{
	wij = (1.0+3.0ratio) (1.0-ratio)(1.0-ratio)(1.0-ratio);
	i_dpdThetaLocal += wij / d_dpdTheta(j);
	if (NEWTON_PAIR \|\| j < nlocal)
	a_dpdThetaLocal(j) += wij / d_dpdTheta(i);
	}

	i_sumWeights += wij;
	if (NEWTON_PAIR \|\| j < nlocal)
	a_sumWeights(j) += wij;
	}
	}

	// Update, don't assign, the array value (because another iteration may have hit it).
	a_dpdThetaLocal(i) += i_dpdThetaLocal;
	a_sumWeights(i) += i_sumWeights;
	}
	);
	#else
	Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_firstPairOperator<WT_FLAG, NEWTON_PAIR, NEIGHFLAG> >(0, inum), *this);
	#endif

	// Signal that dpdThetaLocal and sumWeights have been modified.
	k_dpdThetaLocal.template modify<DeviceType>();
	k_sumWeights. template modify<DeviceType>();

	// Communicate the sum dpdTheta and the weights on the host.
	if (NEWTON_PAIR) comm->reverse_comm_fix(this);

	// Update the device view in case they got changed.
	k_dpdThetaLocal.template sync<DeviceType>();
	k_sumWeights. template sync<DeviceType>();

	// self-interaction for local temperature
	#if 0
	Kokkos::parallel_for ( nlocal,
	LAMMPS_LAMBDA(const int i)
	{
	double wij = 0.0;

	// Lucy Weight Function
	if (WT_FLAG == LUCY)
	{
	wij = 1.0;
	d_dpdThetaLocal(i) += wij / d_dpdTheta(i);
	}
	d_sumWeights(i) += wij;

	// Normalized local temperature
	d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);

	if (LOCAL_TEMP_FLAG == HARMONIC)
	d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
	}
	);
	#else
	Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_2ndPairOperator<WT_FLAG, LOCAL_TEMP_FLAG> >(0, nlocal), *this);
	#endif

	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	int FixRxKokkos<DeviceType>::pack_forward_comm(int n, int list, double buf, int pbc_flag, int *pbc)
	{
	//printf("inside FixRxKokkos::pack_forward_comm %d\n", comm->me);

	HAT::t_float_2d h_dvector = atomKK->k_dvector.h_view;

	int m = 0;
	for (int ii = 0; ii < n; ii++) {
	const int jj = list[ii];
	for(int ispecies = 0; ispecies < nspecies; ispecies++){
	buf[m++] = h_dvector(ispecies,jj);
	buf[m++] = h_dvector(ispecies+nspecies,jj);
	}
	}

	//printf("done with FixRxKokkos::pack_forward_comm %d\n", comm->me);

	return m;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
	{
	//printf("inside FixRxKokkos::unpack_forward_comm %d\n", comm->me);

	HAT::t_float_2d h_dvector = atomKK->k_dvector.h_view;

	const int last = first + n ;
	int m = 0;
	for (int ii = first; ii < last; ii++){
	for (int ispecies = 0; ispecies < nspecies; ispecies++){
	h_dvector(ispecies,ii) = buf[m++];
	h_dvector(ispecies+nspecies,ii) = buf[m++];
	}
	}

	//printf("done with FixRxKokkos::unpack_forward_comm %d\n", comm->me);
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	int FixRxKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
	{
	//printf("inside FixRxKokkos::pack_reverse_comm %d %d %d\n", comm->me, first, n);
	// Sync the host view.
	k_dpdThetaLocal.template sync<LMPHostType>();
	k_sumWeights. template sync<LMPHostType>();

	const int last = first + n;
	int m = 0;
	for (int i = first; i < last; ++i)
	{
	buf[m++] = h_dpdThetaLocal(i);
	buf[m++] = h_sumWeights(i);
	}
	//printf("done with FixRxKokkos::pack_reverse_comm %d\n", comm->me);

	return m;
	}

	/* ---------------------------------------------------------------------- */

	template <typename DeviceType>
	void FixRxKokkos<DeviceType>::unpack_reverse_comm(int n, int list, double buf)
	{
	// printf("inside FixRxKokkos::unpack_reverse_comm %d\n", comm->me);
	int m = 0;
	for (int i = 0; i < n; i++) {
	const int j = list[i];

	h_dpdThetaLocal(j) += buf[m++];
	h_sumWeights(j) += buf[m++];
	}

	// Signal that the host view has been modified.
	k_dpdThetaLocal.template modify<LMPHostType>();
	k_sumWeights. template modify<LMPHostType>();

	// printf("done with FixRxKokkos::unpack_reverse_comm %d\n", comm->me);
	}

	namespace LAMMPS_NS {
	template class FixRxKokkos<LMPDeviceType>;
	#ifdef KOKKOS_HAVE_CUDA
	template class FixRxKokkos<LMPHostType>;
	#endif
	}

fix_rx_kokkos.cppNo OneTemporaryActions

File Metadata

fix_rx_kokkos.cppView Options

Event Timeline

fix_rx_kokkos.cpp
No OneTemporary
Actions

fix_rx_kokkos.cpp
View Options