diff --git a/src/KOKKOS/pair_reax_c_kokkos.h b/src/KOKKOS/pair_reax_c_kokkos.h
index a6192ed12..8c07ee2a0 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.h
+++ b/src/KOKKOS/pair_reax_c_kokkos.h
@@ -1,430 +1,430 @@
 /* -*- c++ -*- ----------------------------------------------------------
 
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef PAIR_CLASS
 
 PairStyle(reax/c/kk,PairReaxCKokkos<LMPDeviceType>)
 PairStyle(reax/c/kk/device,PairReaxCKokkos<LMPDeviceType>)
 PairStyle(reax/c/kk/host,PairReaxCKokkos<LMPHostType>)
 
 #else
 
 #ifndef LMP_PAIR_REAXC_KOKKOS_H
 #define LMP_PAIR_REAXC_KOKKOS_H
 
-#include "stdio.h"
+#include <stdio.h>
 #include "pair_kokkos.h"
 #include "pair_reax_c.h"
 #include "neigh_list_kokkos.h"
 #include "reaxc_types.h"
 
 #define C_ele 332.06371
 #define SMALL 0.0001
 #define KCALpMOL_to_EV 23.02
 #define HB_THRESHOLD   1e-2  // 0.01
 #define MAX_BONDS      30
 
 #define SQR(x)        ((x)*(x))
 
 namespace LAMMPS_NS {
 
 typedef Kokkos::DualView<LR_data*,Kokkos::LayoutRight,LMPDeviceType> tdual_LR_data_1d;
 typedef typename tdual_LR_data_1d::t_dev t_LR_data_1d;
 
 typedef Kokkos::DualView<cubic_spline_coef*,Kokkos::LayoutRight,LMPDeviceType> tdual_cubic_spline_coef_1d;
 typedef typename tdual_cubic_spline_coef_1d::t_dev t_cubic_spline_coef_1d;
 
 // Lookup-table record: a handful of scalar grid parameters plus Kokkos
 // views (the t_dev side of the DualView typedefs declared just above).
 // PairReaxCKokkos keeps these records in a 2-D DualView (k_LR / d_LR).
 // NOTE(review): the per-field notes below are inferred from the field
 // names only -- confirm against the code that fills the table.
 struct LR_lookup_table_kk
 {
   // presumably the lower/upper bound of the tabulated range -- TODO confirm
   double xmin, xmax;
   // presumably the number of table entries -- TODO confirm
   int n;
   // presumably the grid spacing and its reciprocal -- TODO confirm
   double dx, inv_dx;
   double a;
   double m;
   double c;
 
   // view of LR_data entries
   t_LR_data_1d d_y;
   // views of cubic_spline_coef entries
   t_cubic_spline_coef_1d d_H;
   t_cubic_spline_coef_1d d_vdW, d_CEvd;
   t_cubic_spline_coef_1d d_ele, d_CEclmb;
 };
 
 // Empty tag types.  Each one appears as the first parameter of a
 // PairReaxCKokkos::operator() overload declared below, so the tag type
 // selects which overload is invoked.  The NEIGHFLAG / EVFLAG integers are
 // compile-time template parameters carried by the tag.
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputePolar{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeLJCoulomb{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeTabulatedLJCoulomb{};
 
 struct PairReaxBuildListsFull{};
 
 template<int NEIGHFLAG>
 struct PairReaxBuildListsHalf{};
 
 template<int NEIGHFLAG>
 struct PairReaxBuildListsHalf_LessAtomics{};
 
 struct PairReaxZero{};
 
 struct PairReaxZeroEAtom{};
 
 struct PairReaxZeroVAtom{};
 
 struct PairReaxBondOrder1{};
 
 struct PairReaxBondOrder1_LessAtomics{};
 
 struct PairReaxBondOrder2{};
 
 struct PairReaxBondOrder3{};
 
 template<int NEIGHFLAG>
 struct PairReaxUpdateBond{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeBond1{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeBond2{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeMulti1{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeMulti2{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeAngular{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeTorsion{};
 
 template<int NEIGHFLAG, int EVFLAG>
 struct PairReaxComputeHydrogen{};
 
 // Kokkos variant of the reax/c pair style.  Derives from PairReaxC and is
 // templated on the Kokkos device type; the PairStyle() lines at the top of
 // this header register it for LMPDeviceType and LMPHostType.  Only
 // declarations live here -- the member function bodies are defined
 // outside this header.
 template<class DeviceType>
 class PairReaxCKokkos : public PairReaxC {
  public:
   enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD};
   enum {COUL_FLAG=1};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
   typedef EV_FLOAT_REAX value_type;
 
   PairReaxCKokkos(class LAMMPS *);
   virtual ~PairReaxCKokkos();
 
   void ev_setup(int, int);
   void compute(int, int);
   void *extract(const char *, int &);
   void init_style();
   double memory_usage();
 
   // Kernel entry points, selected by the empty tag type passed as the
   // first argument.  The PairReaxCompute* tags (except
   // PairReaxComputeMulti1) are declared both with and without an
   // EV_FLOAT_REAX& accumulator; every other tag only takes the index.
   // NOTE(review): presumably dispatched through Kokkos parallel_reduce and
   // parallel_for respectively -- confirm in the implementation file.
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputePolar<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputePolar<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeLJCoulomb<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeLJCoulomb<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeTabulatedLJCoulomb<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeTabulatedLJCoulomb<NEIGHFLAG,EVFLAG>, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBuildListsFull, const int&) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBuildListsHalf<NEIGHFLAG>, const int&) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBuildListsHalf_LessAtomics<NEIGHFLAG>, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxZero, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxZeroEAtom, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxZeroVAtom, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBondOrder1, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBondOrder1_LessAtomics, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBondOrder2, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBondOrder3, const int&) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxUpdateBond<NEIGHFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeBond1<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeBond1<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeBond2<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeBond2<NEIGHFLAG,EVFLAG>, const int&) const;
 
   // PairReaxComputeMulti1 is the one Compute* tag declared without an
   // accumulator overload.
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeMulti1<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeMulti2<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeMulti2<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeAngular<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeAngular<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeTorsion<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeTorsion<NEIGHFLAG,EVFLAG>, const int&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeHydrogen<NEIGHFLAG,EVFLAG>, const int&, EV_FLOAT_REAX&) const;
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxComputeHydrogen<NEIGHFLAG,EVFLAG>, const int&) const;
 
   // Parameter records used as element types of the k_params_* DualViews
   // declared in the protected section.  In each record both constructors
   // set every field to zero; the int argument of the second constructor
   // is not used.
 
   // Element type of k_params_sing (1-D view).
   struct params_sing{
     KOKKOS_INLINE_FUNCTION
     params_sing(){mass=0;chi=0;eta=0;r_s=0;r_pi=0;r_pi2=0;valency=0;valency_val=0;valency_e=0;valency_boc=0;nlp_opt=0;
       p_lp2=0;p_ovun2=0;p_ovun5=0;p_val3=0;p_val5=0;p_hbond=0;};
     KOKKOS_INLINE_FUNCTION
     params_sing(int i){mass=0;chi=0;eta=0;r_s=0;r_pi=0;r_pi2=0;valency=0;valency_val=0;valency_e=0;valency_boc=0;nlp_opt=0;
       p_lp2=0;p_ovun2=0;p_ovun5=0;p_val3=0;p_val5=0;p_hbond=0;};
     F_FLOAT mass,chi,eta,r_s,r_pi,r_pi2,valency,valency_val,valency_e,valency_boc,nlp_opt,
       p_lp2,p_ovun2,p_ovun5, p_val3, p_val5, p_hbond;
   };
 
   // Element type of k_params_twbp (2-D view).  The commas between some of
   // the assignments in the constructors are comma operators; every listed
   // field is still assigned zero.
   struct params_twbp{
     KOKKOS_INLINE_FUNCTION
     params_twbp(){gamma=0;gamma_w=0;alpha=0;r_vdw=0;epsilon=0;acore=0;ecore=0;rcore=0;lgre=0;lgcij=0;
       r_s=0;r_pi=0;r_pi2=0;p_bo1=0;p_bo2=0;p_bo3=0;p_bo4=0;p_bo5=0;p_bo6=0;ovc=0;v13cor=0;
       p_boc3=0;p_boc4=0;p_boc5=0;p_be1=0,p_be2=0,De_s=0,De_p=0;De_pp=0;
           p_ovun1=0;};
     KOKKOS_INLINE_FUNCTION
     params_twbp(int i){gamma=0;gamma_w=0;alpha=0;r_vdw=0;epsilon=0;acore=0;ecore=0;rcore=0;lgre=0;lgcij=0;
       r_s=0;r_pi=0;r_pi2=0;p_bo1=0;p_bo2=0;p_bo3=0;p_bo4=0;p_bo5=0;p_bo6=0;ovc=0;v13cor=0;
       p_boc3=0;p_boc4=0;p_boc5=0;p_be1=0,p_be2=0,De_s=0,De_p=0;De_pp=0;
           p_ovun1=0;};
     F_FLOAT gamma,gamma_w,alpha,r_vdw,epsilon,acore,ecore,rcore,lgre,lgcij,
       r_s,r_pi,r_pi2,p_bo1,p_bo2,p_bo3,p_bo4,p_bo5,p_bo6,ovc,v13cor,
       p_boc3,p_boc4,p_boc5,p_be1,p_be2,De_s,De_p,De_pp,
       p_ovun1;
   };
 
   // Element type of k_params_thbp (3-D view).
   struct params_thbp{
     KOKKOS_INLINE_FUNCTION
     params_thbp(){cnt=0;theta_00=0;p_val1=0;p_val2=0;p_val4=0;p_val7=0;p_pen1=0;p_coa1=0;};
     KOKKOS_INLINE_FUNCTION
     params_thbp(int i){cnt=0;theta_00=0;p_val1=0;p_val2=0;p_val4=0;p_val7=0;p_pen1=0;p_coa1=0;};
     F_FLOAT cnt, theta_00, p_val1, p_val2, p_val4, p_val7, p_pen1, p_coa1;
   };
 
   // Element type of k_params_fbp (4-D view).
   struct params_fbp{
     KOKKOS_INLINE_FUNCTION
     params_fbp(){p_tor1=0;p_cot1=0;V1=0;V2=0;V3=0;};
     KOKKOS_INLINE_FUNCTION
     params_fbp(int i){p_tor1=0;p_cot1=0;V1=0;V2=0;V3=0;};
     F_FLOAT p_tor1, p_cot1, V1, V2, V3;
   };
 
   // Element type of k_params_hbp (3-D view).
   struct params_hbp{
     KOKKOS_INLINE_FUNCTION
     params_hbp(){p_hb1=0;p_hb2=0;p_hb3=0;r0_hb=0;};
     KOKKOS_INLINE_FUNCTION
     params_hbp(int i){p_hb1=0;p_hb2=0;p_hb3=0;r0_hb=0;};
     F_FLOAT p_hb1, p_hb2, p_hb3, r0_hb;
   };
 
   // Tally helpers.  Each takes the EV_FLOAT_REAX accumulator by reference
   // followed by atom indices and energy / force / displacement arguments.
   // NOTE(review): presumably these accumulate energy and virial
   // contributions -- confirm in the implementation file.
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void ev_tally(EV_FLOAT_REAX &ev, const int &i, const int &j, const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
                   const F_FLOAT &dely, const F_FLOAT &delz) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void e_tally(EV_FLOAT_REAX &ev, const int &i, const int &j, const F_FLOAT &epair) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void e_tally_single(EV_FLOAT_REAX &ev, const int &i, const F_FLOAT &epair) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void v_tally(EV_FLOAT_REAX &ev, const int &i, F_FLOAT *fi, F_FLOAT *drij) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void v_tally3(EV_FLOAT_REAX &ev, const int &i, const int &j, const int &k,
     F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drij, F_FLOAT *drik) const;
 
   KOKKOS_INLINE_FUNCTION
   void v_tally3_atom(EV_FLOAT_REAX &ev, const int &i, const int &j, const int &k,
     F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drji, F_FLOAT *drjk) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
   void v_tally4(EV_FLOAT_REAX &ev, const int &i, const int &j, const int &k, const int &l,
     F_FLOAT *fi, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *dril, F_FLOAT *drjl, F_FLOAT *drkl) const;
 
  protected:
   void cleanup_copy();
   void allocate();
   void allocate_array();
   void setup();
   void init_md();
   int Init_Lookup_Tables();
   void Deallocate_Lookup_Tables();
   void LR_vdW_Coulomb( int i, int j, double r_ij, LR_data *lr );
 
   // Parameter tables: each k_params_* DualView is paired with a
   // t_dev_const view of the same element type and rank.
   typedef Kokkos::DualView<int*,DeviceType> tdual_int_1d;
   Kokkos::DualView<params_sing*,typename DeviceType::array_layout,DeviceType> k_params_sing;
   typename Kokkos::DualView<params_sing*,typename DeviceType::array_layout,DeviceType>::t_dev_const paramssing;
 
   typedef Kokkos::DualView<int**,DeviceType> tdual_int_2d;
   Kokkos::DualView<params_twbp**,typename DeviceType::array_layout,DeviceType> k_params_twbp;
   typename Kokkos::DualView<params_twbp**,typename DeviceType::array_layout,DeviceType>::t_dev_const paramstwbp;
 
   typedef Kokkos::DualView<int***,DeviceType> tdual_int_3d;
   Kokkos::DualView<params_thbp***,typename DeviceType::array_layout,DeviceType> k_params_thbp;
   typename Kokkos::DualView<params_thbp***,typename DeviceType::array_layout,DeviceType>::t_dev_const paramsthbp;
   Kokkos::DualView<params_hbp***,typename DeviceType::array_layout,DeviceType> k_params_hbp;
   typename Kokkos::DualView<params_hbp***,typename DeviceType::array_layout,DeviceType>::t_dev_const paramshbp;
 
   typedef Kokkos::DualView<int****,DeviceType> tdual_int_4d;
   Kokkos::DualView<params_fbp****,typename DeviceType::array_layout,DeviceType> k_params_fbp;
   typename Kokkos::DualView<params_fbp****,typename DeviceType::array_layout,DeviceType>::t_dev_const paramsfbp;
 
   // NOTE(review): presumably views of the per-atom position, force, type,
   // tag and charge arrays -- confirm where they are assigned.
   typename AT::t_x_array_randomread x;
   typename AT::t_f_array f;
   typename AT::t_int_1d_randomread type;
   typename AT::t_tagint_1d tag;
   typename AT::t_float_1d_randomread q;
 
   DAT::tdual_efloat_1d k_eatom;
   typename AT::t_efloat_1d v_eatom;
 
   DAT::tdual_virial_array k_vatom;
   DAT::t_virial_array d_vatom;
   typename AT::t_virial_array v_vatom;
   HAT::t_virial_array h_vatom;
 
   DAT::tdual_float_1d k_tap;
   DAT::t_float_1d d_tap;
   HAT::t_float_1d h_tap;
 
   // Work arrays: 1-D float views followed by 2-D (t_ffloat_2d_dl) views.
   typename AT::t_float_1d d_bo_rij, d_hb_rsq, d_Deltap, d_Deltap_boc, d_total_bo;
   typename AT::t_float_1d d_Delta, d_Delta_boc, d_Delta_lp, d_dDelta_lp, d_Delta_lp_temp, d_CdDelta;
   typename AT::t_ffloat_2d_dl d_BO, d_BO_s, d_BO_pi, d_BO_pi2, d_dBOp;
   typename AT::t_ffloat_2d_dl d_dln_BOp_pix, d_dln_BOp_piy, d_dln_BOp_piz;
   typename AT::t_ffloat_2d_dl d_dln_BOp_pi2x, d_dln_BOp_pi2y, d_dln_BOp_pi2z;
   typename AT::t_ffloat_2d_dl d_C1dbo, d_C2dbo, d_C3dbo;
   typename AT::t_ffloat_2d_dl d_C1dbopi, d_C2dbopi, d_C3dbopi, d_C4dbopi;
   typename AT::t_ffloat_2d_dl d_C1dbopi2, d_C2dbopi2, d_C3dbopi2, d_C4dbopi2;
   typename AT::t_ffloat_2d_dl d_Cdbo, d_Cdbopi, d_Cdbopi2, d_dDeltap_self;
 
   typedef Kokkos::DualView<F_FLOAT**[7],typename DeviceType::array_layout,DeviceType> tdual_ffloat_2d_n7;
   typedef typename tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread;
   typedef typename tdual_ffloat_2d_n7::t_host t_host_ffloat_2d_n7;
 
   // NOTE(review): presumably the neighbor-list views -- confirm where set.
   typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread d_ilist;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread d_numneigh;
 
   typename AT::t_int_1d d_bo_first, d_bo_num, d_bo_list, d_hb_first, d_hb_num, d_hb_list;
 
   DAT::tdual_int_scalar k_resize_bo, k_resize_hb;
   typename AT::t_int_scalar d_resize_bo, d_resize_hb;
 
   typename AT::t_ffloat_2d_dl d_sum_ovun;
   typename AT::t_ffloat_2d_dl d_dBOpx, d_dBOpy, d_dBOpz;
 
   class AtomKokkos *atomKK;
 
   int neighflag,newton_pair, maxnumneigh, maxhb, maxbo;
   int nlocal,nall,eflag,vflag;
   F_FLOAT cut_nbsq, cut_hbsq, cut_bosq, bo_cut, thb_cut, thb_cutsq;
 
   int vdwflag, lgflag;
   F_FLOAT gp[39], p_boc1, p_boc2;
 
   friend void pair_virial_fdotr_compute<PairReaxCKokkos>(PairReaxCKokkos*);
 
   int bocnt,hbcnt;
 
   // 2-D table of LR_lookup_table_kk records (DualView plus its t_dev view).
   typedef Kokkos::DualView<LR_lookup_table_kk**,LMPDeviceType::array_layout,DeviceType> tdual_LR_lookup_table_kk_2d;
   typedef typename tdual_LR_lookup_table_kk_2d::t_dev t_LR_lookup_table_kk_2d;
 
   tdual_LR_lookup_table_kk_2d k_LR;
   t_LR_lookup_table_kk_2d d_LR;
 };
 
 }
 
 #endif
 #endif
 
 /* ERROR/WARNING messages:
 
 */
diff --git a/src/USER-DPD/compute_dpd_atom.cpp b/src/USER-DPD/compute_dpd_atom.cpp
index 10f5d8203..ffac09c48 100644
--- a/src/USER-DPD/compute_dpd_atom.cpp
+++ b/src/USER-DPD/compute_dpd_atom.cpp
@@ -1,109 +1,109 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: James Larentzos (U.S. Army Research Laboratory)
 ------------------------------------------------------------------------- */
 
-#include "math.h"
+#include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include "compute_dpd_atom.h"
 #include "atom.h"
 #include "update.h"
 #include "modify.h"
 #include "domain.h"
 #include "group.h"
 #include "memory.h"
 #include "error.h"
 #include "comm.h"
 
 #include <vector>
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 ComputeDpdAtom::ComputeDpdAtom(LAMMPS *lmp, int narg, char **arg) :
   Compute(lmp, narg, arg)
 {
   // exactly three arguments are accepted
   if (narg != 3) {
     error->all(FLERR,"Illegal compute dpd/atom command");
   }
 
   // per-atom output with four columns (filled by compute_peratom)
   peratom_flag = 1;
   size_peratom_cols = 4;
 
   // no output storage yet; compute_peratom creates it on demand
   dpdAtom = NULL;
   nmax = 0;
 
   if (atom->dpd_flag != 1) {
     error->all(FLERR,"compute dpd requires atom_style with internal temperature and energies (e.g. dpd)");
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Releases the per-atom output array created in compute_peratom().
 ComputeDpdAtom::~ComputeDpdAtom()
 {
   memory->destroy(dpdAtom);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeDpdAtom::init()
 {
   // count the defined computes whose style is "dpd/atom"
   int ndpd = 0;
   const int ncompute = modify->ncompute;
   for (int ic = 0; ic < ncompute; ++ic) {
     if (strcmp(modify->compute[ic]->style,"dpd/atom") != 0) continue;
     ++ndpd;
   }
 
   // issue the warning only where comm->me is 0
   if (comm->me == 0 && ndpd > 1) {
     error->warning(FLERR,"More than one compute dpd/atom command");
   }
 }
 
 /* ----------------------------------------------------------------------
    copy uCond, uMech, uChem and dpdTheta of each local atom in the group
    into the four columns of the per-atom output array;
    rows of local atoms outside the group are set to zero
 ------------------------------------------------------------------------- */
 
 void ComputeDpdAtom::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
 
   double *uCond = atom->uCond;
   double *uMech = atom->uMech;
   double *uChem = atom->uChem;
   double *dpdTheta = atom->dpdTheta;
   int nlocal = atom->nlocal;
   int *mask = atom->mask;
 
   // grow the output array when the local atom count exceeds its capacity
   if (nlocal > nmax) {
     memory->destroy(dpdAtom);
     nmax = atom->nmax;
     memory->create(dpdAtom,nmax,size_peratom_cols,"dpd/atom:dpdAtom");
     array_atom = dpdAtom;
   }
 
   for (int i = 0; i < nlocal; i++){
     if (mask[i] & groupbit){
       dpdAtom[i][0] =  uCond[i];
       dpdAtom[i][1] =  uMech[i];
       dpdAtom[i][2] =  uChem[i];
       dpdAtom[i][3] =  dpdTheta[i];
     } else {
       // without this the row would keep whatever the buffer held before
       // (never written after creation, or left over from an earlier call)
       dpdAtom[i][0] = 0.0;
       dpdAtom[i][1] = 0.0;
       dpdAtom[i][2] = 0.0;
       dpdAtom[i][3] = 0.0;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local atom-based array
 ------------------------------------------------------------------------- */
 
 double ComputeDpdAtom::memory_usage()
 {
   // promote to double before multiplying so the int*int product
   // nmax * size_peratom_cols cannot overflow
   return static_cast<double>(nmax) * size_peratom_cols * sizeof(double);
 }
diff --git a/src/USER-DPD/pair_exp6_rx.cpp b/src/USER-DPD/pair_exp6_rx.cpp
index 697f94ec7..ae941a081 100644
--- a/src/USER-DPD/pair_exp6_rx.cpp
+++ b/src/USER-DPD/pair_exp6_rx.cpp
@@ -1,1137 +1,1137 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_exp6_rx.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "neigh_list.h"
 #include "math_const.h"
 #include "math_special.h"
 #include "memory.h"
 #include "error.h"
 #include "modify.h"
 #include "fix.h"
-#include "float.h"
+#include <float.h>
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 using namespace MathSpecial;
 
 #define MAXLINE 1024
 #define DELTA 4
 
 #define oneFluidApproxParameter (-1)
 #define isOneFluidApprox(_site) ( (_site) == oneFluidApproxParameter )
 
 #define exp6PotentialType (1)
 #define isExp6PotentialType(_type) ( (_type) == exp6PotentialType )
 
 // Holder for the parameter arrays of all local and neighbor particles:
 // a current and an "Old" set, each for sites 1 and 2.  The arrays are
 // packed inside this struct to avoid any name clashes.
 struct PairExp6ParamDataType
 {
    int n;
 
    // current values, site 1
    double *epsilon1;
    double *alpha1;
    double *rm1;
    double *fraction1;
 
    // current values, site 2
    double *epsilon2;
    double *alpha2;
    double *rm2;
    double *fraction2;
 
    // "Old" values, site 1
    double *epsilonOld1;
    double *alphaOld1;
    double *rmOld1;
    double *fractionOld1;
 
    // "Old" values, site 2
    double *epsilonOld2;
    double *alphaOld2;
    double *rmOld2;
    double *fractionOld2;
 
    // Default constructor -- n starts at zero and every pointer at NULL.
    PairExp6ParamDataType(void)
    {
       n = 0;
       epsilon1 = alpha1 = rm1 = fraction1 = NULL;
       epsilon2 = alpha2 = rm2 = fraction2 = NULL;
       epsilonOld1 = alphaOld1 = rmOld1 = fractionOld1 = NULL;
       epsilonOld2 = alphaOld2 = rmOld2 = fractionOld2 = NULL;
    }
 };
 
 /* ---------------------------------------------------------------------- */
 
 PairExp6rx::PairExp6rx(LAMMPS *lmp) : Pair(lmp)
 {
   writedata = 1;
 
   // start with no species and an empty parameter list
   nspecies = 0;
   nparams = 0;
   maxparam = 0;
 
   // neither parameter table exists yet
   params = NULL;
   mol2param = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairExp6rx::~PairExp6rx()
 {
   // free the two arrays held by each parameter entry, then the tables
   for (int iparam = 0; iparam < nparams; iparam++) {
     delete[] params[iparam].potential;
     delete[] params[iparam].name;
   }
   memory->destroy(params);
   memory->destroy(mol2param);
 
   // the remaining arrays are released only when the allocated flag is set
   if (!allocated) return;
 
   memory->destroy(setflag);
   memory->destroy(cutsq);
   memory->destroy(cut);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairExp6rx::compute(int eflag, int vflag)
 {
   int i,j,ii,jj,inum,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq,r2inv,r6inv,forceExp6,factor_lj;
   double rCut,rCutInv,rCut2inv,rCut6inv,rCutExp,urc,durc;
   double rm2ij,rm6ij;
   double r,rexp;
   int *ilist,*jlist,*numneigh,**firstneigh;
 
   evdwlOld = 0.0;
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   double *special_lj = force->special_lj;
   int newton_pair = force->newton_pair;
 
   double alphaOld12_ij, rmOld12_ij, epsilonOld12_ij;
   double alphaOld21_ij, rmOld21_ij, epsilonOld21_ij;
   double alpha12_ij, rm12_ij, epsilon12_ij;
   double alpha21_ij, rm21_ij, epsilon21_ij;
   double rminv, buck1, buck2;
   double epsilonOld1_i,alphaOld1_i,rmOld1_i;
   double epsilonOld1_j,alphaOld1_j,rmOld1_j;
   double epsilonOld2_i,alphaOld2_i,rmOld2_i;
   double epsilonOld2_j,alphaOld2_j,rmOld2_j;
   double epsilon1_i,alpha1_i,rm1_i;
   double epsilon1_j,alpha1_j,rm1_j;
   double epsilon2_i,alpha2_i,rm2_i;
   double epsilon2_j,alpha2_j,rm2_j;
   double evdwlOldEXP6_12, evdwlOldEXP6_21;
   double evdwlEXP6_12, evdwlEXP6_21, fpairEXP6_12, fpairEXP6_21;
   double fractionOld1_i, fractionOld1_j;
   double fractionOld2_i, fractionOld2_j;
   double fraction1_i, fraction1_j;
   double fraction2_i, fraction2_j;
   double *uCG = atom->uCG;
   double *uCGnew = atom->uCGnew;
 
   const int nRep = 12;
   const double shift = 1.05;
   double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;
 
   // Initialize the Exp6 parameter data for both the local
   // and ghost atoms. Make the parameter data persistent
   // and exchange like any other atom property later.
 
   PairExp6ParamDataType PairExp6ParamData;
 
   {
      const int np_total = nlocal + atom->nghost;
 
      memory->create( PairExp6ParamData.epsilon1     , np_total, "PairExp6ParamData.epsilon1");
      memory->create( PairExp6ParamData.alpha1       , np_total, "PairExp6ParamData.alpha1");
      memory->create( PairExp6ParamData.rm1          , np_total, "PairExp6ParamData.rm1");
      memory->create( PairExp6ParamData.fraction1    , np_total, "PairExp6ParamData.fraction1");
      memory->create( PairExp6ParamData.epsilon2     , np_total, "PairExp6ParamData.epsilon2");
      memory->create( PairExp6ParamData.alpha2       , np_total, "PairExp6ParamData.alpha2");
      memory->create( PairExp6ParamData.rm2          , np_total, "PairExp6ParamData.rm2");
      memory->create( PairExp6ParamData.fraction2    , np_total, "PairExp6ParamData.fraction2");
      memory->create( PairExp6ParamData.epsilonOld1  , np_total, "PairExp6ParamData.epsilonOld1");
      memory->create( PairExp6ParamData.alphaOld1    , np_total, "PairExp6ParamData.alphaOld1");
      memory->create( PairExp6ParamData.rmOld1       , np_total, "PairExp6ParamData.rmOld1");
      memory->create( PairExp6ParamData.fractionOld1 , np_total, "PairExp6ParamData.fractionOld1");
      memory->create( PairExp6ParamData.epsilonOld2  , np_total, "PairExp6ParamData.epsilonOld2");
      memory->create( PairExp6ParamData.alphaOld2    , np_total, "PairExp6ParamData.alphaOld2");
      memory->create( PairExp6ParamData.rmOld2       , np_total, "PairExp6ParamData.rmOld2");
      memory->create( PairExp6ParamData.fractionOld2 , np_total, "PairExp6ParamData.fractionOld2");
 
      for (i = 0; i < np_total; ++i)
      {
         getParamsEXP6 (i, PairExp6ParamData.epsilon1[i],
                           PairExp6ParamData.alpha1[i],
                           PairExp6ParamData.rm1[i],
                           PairExp6ParamData.fraction1[i],
                           PairExp6ParamData.epsilon2[i],
                           PairExp6ParamData.alpha2[i],
                           PairExp6ParamData.rm2[i],
                           PairExp6ParamData.fraction2[i],
                           PairExp6ParamData.epsilonOld1[i],
                           PairExp6ParamData.alphaOld1[i],
                           PairExp6ParamData.rmOld1[i],
                           PairExp6ParamData.fractionOld1[i],
                           PairExp6ParamData.epsilonOld2[i],
                           PairExp6ParamData.alphaOld2[i],
                           PairExp6ParamData.rmOld2[i],
                           PairExp6ParamData.fractionOld2[i]);
      }
   }
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   // loop over neighbors of my atoms
 
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     {
        epsilon1_i     = PairExp6ParamData.epsilon1[i];
        alpha1_i       = PairExp6ParamData.alpha1[i];
        rm1_i          = PairExp6ParamData.rm1[i];
        fraction1_i    = PairExp6ParamData.fraction1[i];
        epsilon2_i     = PairExp6ParamData.epsilon2[i];
        alpha2_i       = PairExp6ParamData.alpha2[i];
        rm2_i          = PairExp6ParamData.rm2[i];
        fraction2_i    = PairExp6ParamData.fraction2[i];
        epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
        alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
        rmOld1_i       = PairExp6ParamData.rmOld1[i];
        fractionOld1_i = PairExp6ParamData.fractionOld1[i];
        epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
        alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
        rmOld2_i       = PairExp6ParamData.rmOld2[i];
        fractionOld2_i = PairExp6ParamData.fractionOld2[i];
     }
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       factor_lj = special_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
 
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
         r2inv = 1.0/rsq;
         r6inv = r2inv*r2inv*r2inv;
 
         r = sqrt(rsq);
         rCut2inv = 1.0/cutsq[itype][jtype];
         rCut6inv = rCut2inv*rCut2inv*rCut2inv;
         rCut = sqrt(cutsq[itype][jtype]);
         rCutInv = 1.0/rCut;
 
         //
         // A. Compute the exp-6 potential
         //
 
         // A1.  Get alpha, epsilon and rm for particle j
 
         {
            epsilon1_j     = PairExp6ParamData.epsilon1[j];
            alpha1_j       = PairExp6ParamData.alpha1[j];
            rm1_j          = PairExp6ParamData.rm1[j];
            fraction1_j    = PairExp6ParamData.fraction1[j];
            epsilon2_j     = PairExp6ParamData.epsilon2[j];
            alpha2_j       = PairExp6ParamData.alpha2[j];
            rm2_j          = PairExp6ParamData.rm2[j];
            fraction2_j    = PairExp6ParamData.fraction2[j];
            epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
            alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
            rmOld1_j       = PairExp6ParamData.rmOld1[j];
            fractionOld1_j = PairExp6ParamData.fractionOld1[j];
            epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
            alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
            rmOld2_j       = PairExp6ParamData.rmOld2[j];
            fractionOld2_j = PairExp6ParamData.fractionOld2[j];
         }
 
         // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
         alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
         rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
         epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
         alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);
         rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
         epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
 
         alpha12_ij = sqrt(alpha1_i*alpha2_j);
         rm12_ij = 0.5*(rm1_i + rm2_j);
         epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
         alpha21_ij = sqrt(alpha2_i*alpha1_j);
         rm21_ij = 0.5*(rm2_i + rm1_j);
         epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
 
         if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
           if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
             error->all(FLERR,"alpha_ij is 6.0 in pair exp6");
 
           // A3.  Compute some convenient quantities for evaluating the force
           rminv = 1.0/rmOld12_ij;
           buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
           rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
           rm2ij = rmOld12_ij*rmOld12_ij;
           rm6ij = rm2ij*rm2ij*rm2ij;
 
           // Compute the shifted potential
           rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
           buck2 = 6.0*alphaOld12_ij;
           urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);
           durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
           rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);
           if(r < rin1){
             rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
             rin6inv = 1.0/rin6;
 
             rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
 
             uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
             win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
 
             aRep = -1.0*win1*powint(rin1,nRep)/nRep;
 
             uin1rep = aRep/powint(rin1,nRep);
 
             evdwlOldEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
 
           } else {
             evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
           }
 
           // A3.  Compute some convenient quantities for evaluating the force
           rminv = 1.0/rmOld21_ij;
           buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
           buck2 = 6.0*alphaOld21_ij;
           rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
           rm2ij = rmOld21_ij*rmOld21_ij;
           rm6ij = rm2ij*rm2ij*rm2ij;
 
           // Compute the shifted potential
           rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
           buck2 = 6.0*alphaOld21_ij;
           urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
           durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
           rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
 
           if(r < rin1){
             rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
             rin6inv = 1.0/rin6;
 
             rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
 
             uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
             win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
 
             aRep = -1.0*win1*powint(rin1,nRep)/nRep;
 
             uin1rep = aRep/powint(rin1,nRep);
 
             evdwlOldEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
 
           } else {
             evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
           }
 
           if (isite1 == isite2)
             evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12;
           else
             evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwlOldEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*evdwlOldEXP6_21;
 
           evdwlOld *= factor_lj;
 
           uCG[i] += 0.5*evdwlOld;
           if (newton_pair || j < nlocal)
             uCG[j] += 0.5*evdwlOld;
         }
 
         if(rm12_ij!=0.0 && rm21_ij!=0.0){
           if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
             error->all(FLERR,"alpha_ij is 6.0 in pair exp6");
 
           // A3.  Compute some convenient quantities for evaluating the force
           rminv = 1.0/rm12_ij;
           buck1 = epsilon12_ij / (alpha12_ij - 6.0);
           buck2 = 6.0*alpha12_ij;
           rexp = expValue(alpha12_ij*(1.0-r*rminv));
           rm2ij = rm12_ij*rm12_ij;
           rm6ij = rm2ij*rm2ij*rm2ij;
 
           // Compute the shifted potential
           rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
           urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
           durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
           rin1 = shift*rm12_ij*func_rin(alpha12_ij);
 
           if(r < rin1){
             rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
             rin6inv = 1.0/rin6;
 
             rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
 
             uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
             win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
 
             aRep = -1.0*win1*powint(rin1,nRep)/nRep;
 
             uin1rep = aRep/powint(rin1,nRep);
 
             evdwlEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
 
             forceExp6 = -double(nRep)*aRep/powint(r,nRep);
             fpairEXP6_12 = factor_lj*forceExp6*r2inv;
 
           } else {
 
             // A4.  Compute the exp-6 force and energy
             forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
             fpairEXP6_12 = factor_lj*forceExp6*r2inv;
             evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
           }
 
           rminv = 1.0/rm21_ij;
           buck1 = epsilon21_ij / (alpha21_ij - 6.0);
           buck2 = 6.0*alpha21_ij;
           rexp = expValue(alpha21_ij*(1.0-r*rminv));
           rm2ij = rm21_ij*rm21_ij;
           rm6ij = rm2ij*rm2ij*rm2ij;
 
           // Compute the shifted potential
           rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
           urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
           durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
           rin1 = shift*rm21_ij*func_rin(alpha21_ij);
 
           if(r < rin1){
             rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
             rin6inv = 1.0/rin6;
 
             rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
 
             uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
 
             win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) - rin1*durc;
 
             aRep = -1.0*win1*powint(rin1,nRep)/nRep;
 
             uin1rep = aRep/powint(rin1,nRep);
 
             evdwlEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
 
             forceExp6 = -double(nRep)*aRep/powint(r,nRep);
             fpairEXP6_21 = factor_lj*forceExp6*r2inv;
 
           } else {
 
             // A4.  Compute the exp-6 force and energy
             forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
             fpairEXP6_21 = factor_lj*forceExp6*r2inv;
             evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
           }
 
           //
           // Apply Mixing Rule to get the overall force for the CG pair
           //
           if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairEXP6_12; 
           else fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpairEXP6_12 + sqrt(fractionOld2_i*fractionOld1_j)*fpairEXP6_21;
 
           f[i][0] += delx*fpair;
           f[i][1] += dely*fpair;
           f[i][2] += delz*fpair;
           if (newton_pair || j < nlocal) {
             f[j][0] -= delx*fpair;
             f[j][1] -= dely*fpair;
             f[j][2] -= delz*fpair;
           }
 
           if (isite1 == isite2) evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12;
           else evdwl = sqrt(fraction1_i*fraction2_j)*evdwlEXP6_12 + sqrt(fraction2_i*fraction1_j)*evdwlEXP6_21;
           evdwl *= factor_lj;
 
           uCGnew[i]   += 0.5*evdwl;
           if (newton_pair || j < nlocal)
             uCGnew[j] += 0.5*evdwl;
           evdwl = evdwlOld;
           if (evflag) ev_tally(i,j,nlocal,newton_pair,
                                evdwl,0.0,fpair,delx,dely,delz);
         }
       }
     }
   }
   if (vflag_fdotr) virial_fdotr_compute();
 
   // Release the local parameter data.
   {
      if (PairExp6ParamData.epsilon1    ) memory->destroy(PairExp6ParamData.epsilon1);
      if (PairExp6ParamData.alpha1      ) memory->destroy(PairExp6ParamData.alpha1);
      if (PairExp6ParamData.rm1         ) memory->destroy(PairExp6ParamData.rm1);
      if (PairExp6ParamData.fraction1   ) memory->destroy(PairExp6ParamData.fraction1);
      if (PairExp6ParamData.epsilon2    ) memory->destroy(PairExp6ParamData.epsilon2);
      if (PairExp6ParamData.alpha2      ) memory->destroy(PairExp6ParamData.alpha2);
      if (PairExp6ParamData.rm2         ) memory->destroy(PairExp6ParamData.rm2);
      if (PairExp6ParamData.fraction2   ) memory->destroy(PairExp6ParamData.fraction2);
      if (PairExp6ParamData.epsilonOld1 ) memory->destroy(PairExp6ParamData.epsilonOld1);
      if (PairExp6ParamData.alphaOld1   ) memory->destroy(PairExp6ParamData.alphaOld1);
      if (PairExp6ParamData.rmOld1      ) memory->destroy(PairExp6ParamData.rmOld1);
      if (PairExp6ParamData.fractionOld1) memory->destroy(PairExp6ParamData.fractionOld1);
      if (PairExp6ParamData.epsilonOld2 ) memory->destroy(PairExp6ParamData.epsilonOld2);
      if (PairExp6ParamData.alphaOld2   ) memory->destroy(PairExp6ParamData.alphaOld2);
      if (PairExp6ParamData.rmOld2      ) memory->destroy(PairExp6ParamData.rmOld2);
      if (PairExp6ParamData.fractionOld2) memory->destroy(PairExp6ParamData.fractionOld2);
   }
 
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::allocate()
 {
   // Allocate the per-type-pair tables, each sized (ntypes+1) x (ntypes+1),
   // and mark this pair style as allocated.
   const int ntypes = atom->ntypes;
   allocated = 1;
 
   // "coefficients have been set" flags; only entries with 1 <= i <= j are cleared
   memory->create(setflag,ntypes+1,ntypes+1,"pair:setflag");
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       setflag[itype][jtype] = 0;
     }
   }
 
   memory->create(cutsq,ntypes+1,ntypes+1,"pair:cutsq");
   memory->create(cut,ntypes+1,ntypes+1,"pair:cut_lj");
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::settings(int narg, char **arg)
 {
   // Parse the pair_style arguments: exactly one value, the global cutoff.
   if (narg != 1) error->all(FLERR,"Illegal pair_style command");
 
   cut_global = force->numeric(FLERR,arg[0]);
 
   // If per-pair cutoffs already exist, overwrite the explicitly set
   // off-diagonal (i < j) entries with the new global cutoff.
   if (allocated) {
     const int ntypes = atom->ntypes;
     for (int itype = 1; itype <= ntypes; ++itype) {
       for (int jtype = itype+1; jtype <= ntypes; ++jtype) {
         if (setflag[itype][jtype]) cut[itype][jtype] = cut_global;
       }
     }
   }
 
   // NOTE(review): clearing the flag makes the next coeff() call allocate()
   // again without releasing the existing arrays -- confirm this is intended.
   allocated = 0;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::coeff(int narg, char **arg)
 {
   if (narg < 7 || narg > 8) error->all(FLERR,"Incorrect args for pair coefficients");
 
   bool rx_flag = false;
   for (int i = 0; i < modify->nfix; i++)
     if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
   if (!rx_flag) error->all(FLERR,"PairExp6rx requires a fix rx command.");
 
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   int n;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   nspecies = atom->nspecies_dpd;
   if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
   read_file(arg[2]);
 
   n = strlen(arg[3]) + 1;
   site1 = new char[n];
   strcpy(site1,arg[3]);
 
   int ispecies;
   for (ispecies = 0; ispecies < nspecies; ispecies++){
     if (strcmp(site1,&atom->dname[ispecies][0]) == 0) break;
   }
   if (ispecies == nspecies && strcmp(site1,"1fluid") != 0)
     error->all(FLERR,"Site1 name not recognized in pair coefficients");
 
   n = strlen(arg[4]) + 1;
   site2 = new char[n];
   strcpy(site2,arg[4]);
 
   for (ispecies = 0; ispecies < nspecies; ispecies++){
     if (strcmp(site2,&atom->dname[ispecies][0]) == 0) break;
   }
   if (ispecies == nspecies && strcmp(site2,"1fluid") != 0)
     error->all(FLERR,"Site2 name not recognized in pair coefficients");
 
   {
     // Set isite1 and isite2 parameters based on site1 and site2 strings.
     
     if (strcmp(site1,"1fluid") == 0)
       isite1 = oneFluidApproxParameter;
     else
       {
         int isp;
         for (isp = 0; isp < nspecies; isp++)
           if (strcmp(site1, &atom->dname[isp][0]) == 0) break;
 
         if (isp == nspecies)
           error->all(FLERR,"Site1 name not recognized in pair coefficients");
         else
           isite1 = isp;
       }
     
     if (strcmp(site2,"1fluid") == 0)
       isite2 = oneFluidApproxParameter;
     else
       {
         int isp;
         for (isp = 0; isp < nspecies; isp++)
         if (strcmp(site2, &atom->dname[isp][0]) == 0) break;
 
         if (isp == nspecies)
           error->all(FLERR,"Site2 name not recognized in pair coefficients");
         else
           isite2 = isp;
       }
     
     // Set the interaction potential type to the enumerated type.
     for (int iparam = 0; iparam < nparams; ++iparam)
       {
         if (strcmp( params[iparam].potential, "exp6") == 0)
           params[iparam].potentialType = exp6PotentialType;
         else
           error->all(FLERR,"params[].potential type unknown");
 
         //printf("params[%d].name= %s ispecies= %d potential= %s potentialType= %d\n", iparam, params[iparam].name, params[iparam].ispecies, params[iparam].potential, params[iparam].potentialType);
       }
   }
   delete[] site1;
   delete[] site2;
   site1 = site2 = NULL;
 
   fuchslinR = force->numeric(FLERR,arg[5]);
   fuchslinEpsilon = force->numeric(FLERR,arg[6]);
 
   setup();
 
   double cut_one = cut_global;
   if (narg == 8) cut_one = force->numeric(FLERR,arg[7]);
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       cut[i][j] = cut_one;
       setflag[i][j] = 1;
       count++;
     }
   }
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairExp6rx::init_one(int i, int j)
 {
   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
 
   return cut[i][j];
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Read the exp6/rx potential file and (re)build the params[] table.
 //
 // Proc 0 opens and reads the file; every line is broadcast to all ranks,
 // which parse it identically.  Each entry consists of params_per_line (5)
 // whitespace-separated words, possibly spread over several lines:
 //   species-name  potential-name  alpha  epsilon  rm
 // Text after '#' is treated as a comment.  Entries whose species name does
 // not match any atom->dname[] entry are skipped.  Only the "exp6" potential
 // is accepted; epsilon and rm must be > 0 and alpha must be >= 0.
 //
 // file: name of the potential file, passed to force->open_potential().
 void PairExp6rx::read_file(char *file)
 {
   int params_per_line = 5;
   char **words = new char*[params_per_line+1];
 
   // NOTE(review): the name/potential strings owned by a previous params[]
   // table are not released here -- confirm whether they are freed elsewhere.
   memory->sfree(params);
   params = NULL;
   nparams = maxparam = 0;
 
   // open file on proc 0
 
   FILE *fp;
   fp = NULL;
   if (comm->me == 0) {
     fp = force->open_potential(file);
     if (fp == NULL) {
       // bounded formatting: a long file name is truncated in the message
       // instead of overflowing the fixed-size buffer
       char str[128];
       snprintf(str,sizeof(str),"Cannot open exp6/rx potential file %s",file);
       error->one(FLERR,str);
     }
   }
 
   // read each set of params from potential file
   // one set of params can span multiple lines
 
   int n,nwords,ispecies;
   char line[MAXLINE],*ptr;
   int eof = 0;
 
   while (1) {
     if (comm->me == 0) {
       ptr = fgets(line,MAXLINE,fp);
       if (ptr == NULL) {
         eof = 1;
         fclose(fp);
       } else n = strlen(line) + 1;
     }
     MPI_Bcast(&eof,1,MPI_INT,0,world);
     if (eof) break;
     MPI_Bcast(&n,1,MPI_INT,0,world);
     MPI_Bcast(line,n,MPI_CHAR,0,world);
 
     // strip comment, skip line if blank
 
     if ((ptr = strchr(line,'#'))) *ptr = '\0';
     nwords = atom->count_words(line);
     if (nwords == 0) continue;
 
     // concatenate additional lines until have params_per_line words
 
     while (nwords < params_per_line) {
       n = strlen(line);
       if (comm->me == 0) {
         ptr = fgets(&line[n],MAXLINE-n,fp);
         if (ptr == NULL) {
           eof = 1;
           fclose(fp);
         } else n = strlen(line) + 1;
       }
       MPI_Bcast(&eof,1,MPI_INT,0,world);
       if (eof) break;
       MPI_Bcast(&n,1,MPI_INT,0,world);
       MPI_Bcast(line,n,MPI_CHAR,0,world);
       if ((ptr = strchr(line,'#'))) *ptr = '\0';
       nwords = atom->count_words(line);
     }
 
     // also reached when the file ended in the middle of an entry
     if (nwords != params_per_line)
       error->all(FLERR,"Incorrect format in exp6/rx potential file");
 
     // words = ptrs to all words in line
     // (the terminating NULL from strtok lands in the extra words[] slot)
 
     nwords = 0;
     words[nwords++] = strtok(line," \t\n\r\f");
     while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
 
     // skip entries for species that are not defined in atom->dname
 
     for (ispecies = 0; ispecies < nspecies; ispecies++)
       if (strcmp(words[0],&atom->dname[ispecies][0]) == 0) break;
     if (ispecies == nspecies) continue;
 
     // load up parameter settings and error check their values
 
     if (nparams == maxparam) {
       maxparam += DELTA;
       params = (Param *) memory->srealloc(params,maxparam*sizeof(Param),
                                           "pair:params");
     }
 
     params[nparams].ispecies = ispecies;
 
     n = strlen(&atom->dname[ispecies][0]) + 1;
     params[nparams].name = new char[n];
     strcpy(params[nparams].name,&atom->dname[ispecies][0]);
 
     n = strlen(words[1]) + 1;
     params[nparams].potential = new char[n];
     strcpy(params[nparams].potential,words[1]);
     if (strcmp(params[nparams].potential,"exp6") == 0){
       params[nparams].alpha = atof(words[2]);
       params[nparams].epsilon = atof(words[3]);
       params[nparams].rm = atof(words[4]);
       if (params[nparams].epsilon <= 0.0 || params[nparams].rm <= 0.0 ||
           params[nparams].alpha < 0.0)
         error->all(FLERR,"Illegal exp6/rx parameters.  Rm and Epsilon must be greater than zero.  Alpha cannot be negative.");
     } else {
       error->all(FLERR,"Illegal exp6/rx parameters.  Interaction potential does not exist.");
     }
     nparams++;
   }
 
   delete [] words;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairExp6rx::setup()
 {
   int i,j,n;
 
   // set mol2param for all combinations
   // must be a single exact match to lines read from file
 
   memory->destroy(mol2param);
   memory->create(mol2param,nspecies,"pair:mol2param");
 
   for (i = 0; i < nspecies; i++) {
     n = -1;
     for (j = 0; j < nparams; j++) {
       if (i == params[j].ispecies) {
         if (n >= 0) error->all(FLERR,"Potential file has duplicate entry");
         n = j;
       }
     }
     mol2param[i] = n;
   }
 }
 
 /* ----------------------------------------------------------------------
   proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::write_restart(FILE *fp)
 {
   // Global settings first, then for every type pair with i <= j the setflag
   // value, followed by the cutoff only when the flag is set.
   write_restart_settings(fp);
 
   const int ntypes = atom->ntypes;
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       fwrite(&setflag[itype][jtype],sizeof(int),1,fp);
       if (!setflag[itype][jtype]) continue;
       fwrite(&cut[itype][jtype],sizeof(double),1,fp);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
   proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::read_restart(FILE *fp)
 {
   // Mirror of write_restart(): rank 0 reads each value from the file and
   // broadcasts it so that all ranks end up with the same tables.
   read_restart_settings(fp);
   allocate();
 
   const bool is_root = (comm->me == 0);
   const int ntypes = atom->ntypes;
 
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       if (is_root) fread(&setflag[itype][jtype],sizeof(int),1,fp);
       MPI_Bcast(&setflag[itype][jtype],1,MPI_INT,0,world);
 
       // a cutoff was stored only for pairs whose flag is set
       if (!setflag[itype][jtype]) continue;
 
       if (is_root) {
         fread(&cut[itype][jtype],sizeof(double),1,fp);
       }
       MPI_Bcast(&cut[itype][jtype],1,MPI_DOUBLE,0,world);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
   proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::write_restart_settings(FILE *fp)
 {
   // Record layout: cut_global (double), then offset_flag, mix_flag and
   // tail_flag (int each).  read_restart_settings() reads the same order.
   fwrite(&cut_global,sizeof(double),1,fp);
 
   int *const int_settings[] = {&offset_flag,&mix_flag,&tail_flag};
   const int nsettings = sizeof(int_settings)/sizeof(int_settings[0]);
   for (int k = 0; k < nsettings; ++k) {
     fwrite(int_settings[k],sizeof(int),1,fp);
   }
 }
 
 /* ----------------------------------------------------------------------
   proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairExp6rx::read_restart_settings(FILE *fp)
 {
   // Rank 0 reads cut_global followed by offset_flag, mix_flag and tail_flag
   // from the restart file; each value is then broadcast to all ranks.
   int *const int_settings[] = {&offset_flag,&mix_flag,&tail_flag};
   const int nsettings = sizeof(int_settings)/sizeof(int_settings[0]);
 
   if (comm->me == 0) {
     fread(&cut_global,sizeof(double),1,fp);
     for (int k = 0; k < nsettings; ++k) {
       fread(int_settings[k],sizeof(int),1,fp);
     }
   }
 
   MPI_Bcast(&cut_global,1,MPI_DOUBLE,0,world);
   for (int k = 0; k < nsettings; ++k) {
     MPI_Bcast(int_settings[k],1,MPI_INT,0,world);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Compute the exp-6 parameters (epsilon, alpha, rm) and mole fraction of
 // site1 and site2 for particle `id`, both for the current ("new") species
 // amounts in atom->dvector[ispecies][id] and for the "old" amounts in
 // atom->dvector[ispecies+nspecies][id].
 //
 // id:                       index of the particle in the atom->dvector columns
 // epsilon1..fraction1:      outputs for site1, new composition
 // epsilon2..fraction2:      outputs for site2, new composition
 // epsilon1_old..fraction2_old: the same quantities for the old composition
 //
 // A site that matches a pure species takes that species' tabulated
 // parameters; a site for which isOneFluidApprox() is true gets mixed
 // parameters built from all remaining exp6 species, followed by a scaling
 // with powers of the fluid molecule count (fuchslinEpsilon, fuchslinR).
 // Outputs for a site are written only in those two cases; otherwise the
 // caller's values are left untouched.
 // Aborts via error->all() if either total molecule count is below 1e-8.
 void PairExp6rx::getParamsEXP6(int id,double &epsilon1,double &alpha1,double &rm1, double &fraction1,double &epsilon2,double &alpha2,double &rm2,double &fraction2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &fraction1_old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &fraction2_old) const
 {
   int iparam, jparam;
   double rmi, rmj, rmij, rm3ij;
   double epsiloni, epsilonj, epsilonij;
   double alphai, alphaj, alphaij;
   double epsilon_old, rm3_old, alpha_old;
   double epsilon, rm3, alpha;
   double fractionOFA, fractionOFA_old;
   double nTotalOFA, nTotalOFA_old;
   double nTotal, nTotal_old;
   double xMolei, xMolej, xMolei_old, xMolej_old;
 
   // accumulators for the one-fluid mixing sums and the molecule counts
   rm3 = 0.0;
   epsilon = 0.0;
   alpha = 0.0;
   epsilon_old = 0.0;
   rm3_old = 0.0;
   alpha_old = 0.0;
   fractionOFA = 0.0;
   fractionOFA_old = 0.0;
   nTotalOFA = 0.0;
   nTotalOFA_old = 0.0;
   nTotal = 0.0;
   nTotal_old = 0.0;
 
   // Compute the total number of molecules in the old and new CG particle
   // as well as the total number of molecules in the fluid portion of the
   // old and new CG particle
   for (int ispecies = 0; ispecies < nspecies; ispecies++){
     nTotal += atom->dvector[ispecies][id];
     nTotal_old += atom->dvector[ispecies+nspecies][id];
 
     iparam = mol2param[ispecies];
 
     // species without an exp6 parameter set do not contribute to the fluid
     if (iparam < 0 || params[iparam].potentialType != exp6PotentialType ) continue;
     if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
       // a species that is itself one of the two sites is not part of the fluid
       if (isite1 == params[iparam].ispecies || isite2 == params[iparam].ispecies) continue;
       nTotalOFA_old += atom->dvector[ispecies+nspecies][id];
       nTotalOFA += atom->dvector[ispecies][id];
     }
   }
   if(nTotal < 1e-8 || nTotal_old < 1e-8)
     error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
 
   // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
   fractionOFA_old = nTotalOFA_old / nTotal_old;
   fractionOFA = nTotalOFA / nTotal;
 
   for (int ispecies = 0; ispecies < nspecies; ispecies++) {
     iparam = mol2param[ispecies];
     if (iparam < 0 || params[iparam].potentialType != exp6PotentialType ) continue;
 
     // If Site1 matches a pure species, then grab the parameters
     // (the tabulated values are used for both the old and the new state)
     if (isite1 == params[iparam].ispecies){
       rm1_old = params[iparam].rm;
       rm1 = params[iparam].rm;
       epsilon1_old = params[iparam].epsilon;
       epsilon1 = params[iparam].epsilon;
       alpha1_old = params[iparam].alpha;
       alpha1 = params[iparam].alpha;
 
       // Compute the mole fraction of Site1
       fraction1_old = atom->dvector[ispecies+nspecies][id]/nTotal_old;
       fraction1 = atom->dvector[ispecies][id]/nTotal;
     }
 
     // If Site2 matches a pure species, then grab the parameters
     if (isite2 == params[iparam].ispecies){
       rm2_old = params[iparam].rm;
       rm2 = params[iparam].rm;
       epsilon2_old = params[iparam].epsilon;
       epsilon2 = params[iparam].epsilon;
       alpha2_old = params[iparam].alpha;
       alpha2 = params[iparam].alpha;
 
       // Compute the mole fraction of Site2
       fraction2_old = atom->dvector[ispecies+nspecies][id]/nTotal_old;
       fraction2 = atom->dvector[ispecies][id]/nTotal;
     }
 
     // If Site1 or Site2 is a fluid, then compute the parameters
     if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
       if (isite1 == params[iparam].ispecies || isite2 == params[iparam].ispecies) continue;
       rmi = params[iparam].rm;
       epsiloni = params[iparam].epsilon;
       alphai = params[iparam].alpha;
       // mole fractions relative to the fluid portion only; these are not
       // finite when the fluid count is zero, but are then not accumulated
       // because of the fractionOFA > 0.0 guards below
       xMolei = atom->dvector[ispecies][id]/nTotalOFA;
       xMolei_old = atom->dvector[ispecies+nspecies][id]/nTotalOFA_old;
 
       // double sum over all pairs (i,j) of fluid species
       for (int jspecies = 0; jspecies < nspecies; jspecies++) {
         jparam = mol2param[jspecies];
         if (jparam < 0 || params[jparam].potentialType != exp6PotentialType ) continue;
         if (isite1 == params[jparam].ispecies || isite2 == params[jparam].ispecies) continue;
         rmj = params[jparam].rm;
         epsilonj = params[jparam].epsilon;
         alphaj = params[jparam].alpha;
         xMolej = atom->dvector[jspecies][id]/nTotalOFA;
         xMolej_old = atom->dvector[jspecies+nspecies][id]/nTotalOFA_old;
 
         // pair parameters: arithmetic mean for rm, geometric mean for
         // epsilon and alpha
         rmij = (rmi+rmj)/2.0;
         rm3ij = rmij*rmij*rmij;
         epsilonij = sqrt(epsiloni*epsilonj);
         alphaij = sqrt(alphai*alphaj);
 
         // accumulate sum(x_i x_j rm^3), sum(x_i x_j rm^3 eps) and
         // sum(x_i x_j rm^3 eps alpha) for the old and new compositions
         if(fractionOFA_old > 0.0){
           rm3_old += xMolei_old*xMolej_old*rm3ij;
           epsilon_old += xMolei_old*xMolej_old*rm3ij*epsilonij;
           alpha_old += xMolei_old*xMolej_old*rm3ij*epsilonij*alphaij;
         }
         if(fractionOFA > 0.0){
           rm3 += xMolei*xMolej*rm3ij;
           epsilon += xMolei*xMolej*rm3ij*epsilonij;
           alpha += xMolei*xMolej*rm3ij*epsilonij*alphaij;
         }
       }
     }
   }
 
   // Site1 is the fluid: turn the accumulated sums into effective parameters
   if (isOneFluidApprox(isite1)){
     rm1 = cbrt(rm3);
     if(rm1 < 1e-16) {
       // effectively no fluid present: zero everything to avoid dividing by rm3
       rm1 = 0.0;
       epsilon1 = 0.0;
       alpha1 = 0.0;
     } else {
       epsilon1 = epsilon / rm3;
       alpha1 = alpha / epsilon1 / rm3;
     }
 
     fraction1 = fractionOFA;
 
     rm1_old = cbrt(rm3_old);
     if(rm1_old < 1e-16) {
       rm1_old = 0.0;
       epsilon1_old = 0.0;
       alpha1_old = 0.0;
     } else {
       epsilon1_old = epsilon_old / rm3_old;
       alpha1_old = alpha_old / epsilon1_old / rm3_old;
     }
     fraction1_old = fractionOFA_old;
 
     // Fuchslin-Like Exp-6 Scaling
     // epsilon is multiplied by nTotalOFA^fuchslinEpsilon; for a negative
     // exponent the positive power is computed first so that a vanishing
     // value can be mapped to 0.0 instead of being divided by
     double powfuch = 0.0;
     if(fuchslinEpsilon < 0.0){
       powfuch = pow(nTotalOFA,-fuchslinEpsilon);
       if(powfuch<1e-15) epsilon1 = 0.0;
       else epsilon1 *= 1.0/powfuch;
 
       powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
       if(powfuch<1e-15) epsilon1_old = 0.0;
       else epsilon1_old *= 1.0/powfuch;
 
     } else {
       epsilon1 *= pow(nTotalOFA,fuchslinEpsilon);
       epsilon1_old *= pow(nTotalOFA_old,fuchslinEpsilon);
     }
 
     // same scaling for rm, with exponent fuchslinR
     if(fuchslinR < 0.0){
       powfuch = pow(nTotalOFA,-fuchslinR);
       if(powfuch<1e-15) rm1 = 0.0;
       else rm1 *= 1.0/powfuch;
 
       powfuch = pow(nTotalOFA_old,-fuchslinR);
       if(powfuch<1e-15) rm1_old = 0.0;
       else rm1_old *= 1.0/powfuch;
 
     } else {
       rm1 *= pow(nTotalOFA,fuchslinR);
       rm1_old *= pow(nTotalOFA_old,fuchslinR);
     }
   }
 
   // Site2 is the fluid: identical treatment, writing the site2 outputs
   if (isOneFluidApprox(isite2)){
     rm2 = cbrt(rm3);
     if(rm2 < 1e-16) {
       rm2 = 0.0;
       epsilon2 = 0.0;
       alpha2 = 0.0;
     } else {
       epsilon2 = epsilon / rm3;
       alpha2 = alpha / epsilon2 / rm3;
     }
     fraction2 = fractionOFA;
 
     rm2_old = cbrt(rm3_old);
     if(rm2_old < 1e-16) {
       rm2_old = 0.0;
       epsilon2_old = 0.0;
       alpha2_old = 0.0;
     } else {
       epsilon2_old = epsilon_old / rm3_old;
       alpha2_old = alpha_old / epsilon2_old / rm3_old;
     }
     fraction2_old = fractionOFA_old;
 
     // Fuchslin-Like Exp-6 Scaling
     double powfuch = 0.0;
     if(fuchslinEpsilon < 0.0){
       powfuch = pow(nTotalOFA,-fuchslinEpsilon);
       if(powfuch<1e-15) epsilon2 = 0.0;
       else epsilon2 *= 1.0/powfuch;
 
       powfuch = pow(nTotalOFA_old,-fuchslinEpsilon);
       if(powfuch<1e-15) epsilon2_old = 0.0;
       else epsilon2_old *= 1.0/powfuch;
 
     } else {
       epsilon2 *= pow(nTotalOFA,fuchslinEpsilon);
       epsilon2_old *= pow(nTotalOFA_old,fuchslinEpsilon);
     }
 
     if(fuchslinR < 0.0){
       powfuch = pow(nTotalOFA,-fuchslinR);
       if(powfuch<1e-15) rm2 = 0.0;
       else rm2 *= 1.0/powfuch;
 
       powfuch = pow(nTotalOFA_old,-fuchslinR);
       if(powfuch<1e-15) rm2_old = 0.0;
       else rm2_old *= 1.0/powfuch;
 
     } else {
       rm2 *= pow(nTotalOFA,fuchslinR);
       rm2_old *= pow(nTotalOFA_old,fuchslinR);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 inline double PairExp6rx::func_rin(const double &alpha) const
 {
   // Evaluates exp(a + b*sqrt(alpha)) with fixed coefficients a and b,
   // using the guarded exponential expValue().
   const double a = 3.7682065;
   const double b = -1.4308614;
 
   return expValue(a+b*sqrt(alpha));
 }
 
 /* ---------------------------------------------------------------------- */
 
 inline double PairExp6rx::expValue(double value) const
 {
   // Guarded exponential: arguments below DBL_MIN_EXP yield exactly 0.0
   // without calling exp(); all other arguments return exp(value).
   return (value < DBL_MIN_EXP) ? 0.0 : exp(value);
 }
diff --git a/src/USER-DPD/pair_multi_lucy.cpp b/src/USER-DPD/pair_multi_lucy.cpp
index 501ef2d04..a063eff19 100644
--- a/src/USER-DPD/pair_multi_lucy.cpp
+++ b/src/USER-DPD/pair_multi_lucy.cpp
@@ -1,833 +1,833 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------------------------
    Contributing authors:
    James Larentzos and Joshua Moore (U.S. Army Research Laboratory)
 
    Please cite the related publications:
    J.D. Moore, B.C. Barnes, S. Izvekov, M. Lisal, M.S. Sellers, D.E. Taylor & J.K. Brennan
    "A coarse-grain force field for RDX: Density dependent and energy conserving"
    The Journal of Chemical Physics, 2016, 144, 104501.
 ------------------------------------------------------------------------------------------- */
 
-#include "mpi.h"
+#include <mpi.h>
 #include <math.h>
 #include "math_const.h"
 #include <stdlib.h>
 #include <string.h>
 #include "pair_multi_lucy.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 #include "citeme.h"
 
 using namespace LAMMPS_NS;
 
 // values stored in Table::rflag by param_extract():
 // NONE = no R/RSQ keyword on the parameter line, RLINEAR = "R", RSQ = "RSQ"
 enum{NONE,RLINEAR,RSQ};
 
 // size of the line buffer used by read_table() when reading a table file
 #define MAXLINE 1024
 
 // BibTeX entry handed to the citeme facility by the PairMultiLucy constructor.
 // Fields are comma-separated and authors are joined with "and", as BibTeX
 // requires for the entry to parse.
 static const char cite_pair_multi_lucy[] =
   "pair_style multi/lucy command:\n\n"
   "@Article{Moore16,\n"
   " author = {J.D. Moore and B.C. Barnes and S. Izvekov and M. Lisal and M.S. Sellers and D.E. Taylor and J. K. Brennan},\n"
   " title = {A coarse-grain force field for RDX:  Density dependent and energy conserving},\n"
   " journal = {J. Chem. Phys.},\n"
   " year =    2016,\n"
   " volume =  144,\n"
   " pages =   {104501}\n"
   "}\n\n";
 
 /* ---------------------------------------------------------------------- */
 
 PairMultiLucy::PairMultiLucy(LAMMPS *lmp) : Pair(lmp)
 {
   // register the citation when citation tracking is active
   if (lmp->citeme) {
     lmp->citeme->add(cite_pair_multi_lucy);
   }
 
   // the pair style cannot work without a per-atom density
   const bool has_density = (atom->rho_flag == 1);
   if (!has_density) {
     error->all(FLERR,"Pair multi/lucy command requires atom_style with density (e.g. dpd, meso)");
   }
 
   // no tables have been read yet
   tables = NULL;
   ntables = 0;
 
   // one value (rho) per atom is exchanged in each communication direction
   comm_reverse = 1;
   comm_forward = 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairMultiLucy::~PairMultiLucy()
 {
   // release the arrays owned by each table, then the table array itself
   int m = 0;
   while (m < ntables) {
     free_table(&tables[m]);
     m++;
   }
   memory->sfree(tables);
 
   // per-type arrays exist only once allocate() has run
   if (!allocated) return;
 
   memory->destroy(setflag);
   memory->destroy(cutsq);
   memory->destroy(tabindex);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucy::compute(int eflag, int vflag)
 {
   // Compute pair forces and per-atom energies.
   //
   // eflag, vflag: passed on to ev_setup() when either one is nonzero.
   //
   // The densities are refreshed by computeLocalDensity() first.  For every
   // pair inside the cutoff the force prefactors A_i, A_j are taken from the
   // pair's table, indexed by the squared density of each atom.  After the
   // neighbor loop the energy of atom i is taken from the (itype,itype) table.
 
   int i,j,ii,jj,inum,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair,rsq;
   int *ilist,*jlist,*numneigh,**firstneigh;
   Table *tb;
 
   int tlm1 = tablength - 1;
 
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
 
   double pi = MathConst::MY_PI;
   double A_i;
   double A_j;
   double fraction_i,fraction_j;
   int jtable;
   double *rho = atom->rho;
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   computeLocalDensity();
 
   // loop over neighbors of my atoms
 
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
         tb = &tables[tabindex[itype][jtype]];
         if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq)
           error->one(FLERR,"Density < table inner cutoff");
 
         // distance and cutoff are loop-invariant within this pair;
         // evaluate the square roots once
         const double r = sqrt(rsq);
         const double rcut = sqrt(cutsq[itype][jtype]);
         const double one_minus = 1.0 - r/rcut;
 
         if (tabstyle == LOOKUP) {
           itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
           jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
           if (itable >= tlm1 || jtable >= tlm1)
             error->one(FLERR,"Density > table outer cutoff");
 
           A_i = tb->f[itable];
           A_j = tb->f[jtable];
           fpair = 0.5*(A_i + A_j)*(1.0+3.0*r/rcut)*one_minus*one_minus*one_minus;
           fpair = fpair/r;
 
         } else if (tabstyle == LINEAR) {
           itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
           jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
           if (itable >= tlm1 || jtable >= tlm1)
             error->one(FLERR,"Density > table outer cutoff");
 
           // linear interpolation inside the bin
           fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
           fraction_j = (((rho[j]*rho[j]) - tb->rsq[jtable]) * tb->invdelta);
 
           A_i = tb->f[itable] + fraction_i*tb->df[itable];
           A_j = tb->f[jtable] + fraction_j*tb->df[jtable];
 
           fpair = 0.5*(A_i + A_j)*(1.0+3.0*r/rcut)*one_minus*one_minus*one_minus;
 
           fpair = fpair / r;
 
         } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy");
 
         f[i][0] += delx*fpair;
         f[i][1] += dely*fpair;
         f[i][2] += delz*fpair;
         if (newton_pair || j < nlocal) {
           f[j][0] -= delx*fpair;
           f[j][1] -= dely*fpair;
           f[j][2] -= delz*fpair;
         }
         // the pair energy is tallied as zero here; only fpair contributes
         if (evflag) ev_tally(i,j,nlocal,newton_pair,
                              0.0,0.0,fpair,delx,dely,delz);
       }
     }
 
     // per-atom energy term: depends on atom i only, so only rho[i] is
     // validated (j is not meaningful outside the neighbor loop)
     tb = &tables[tabindex[itype][itype]];
     if (rho[i]*rho[i] < tb->innersq)
       error->one(FLERR,"Density < table inner cutoff");
     itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
     // applies to both table styles: tb->e holds tlm1 entries for LOOKUP
     if (itable >= tlm1) error->one(FLERR,"Density > table outer cutoff");
     if (tabstyle == LOOKUP) evdwl = tb->e[itable];
     else if (tabstyle == LINEAR){
       if(itable==0) fraction_i=0.0;
       else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
       evdwl = tb->e[itable] + fraction_i*tb->de[itable];
     } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy");
 
     evdwl *=(pi*cutsq[itype][itype]*cutsq[itype][itype])/84.0;
 
     // tallied with i = j = 0 and zero force: energy contribution only
     if (evflag) ev_tally(0,0,nlocal,newton_pair,
                          evdwl,0.0,0.0,0.0,0.0,0.0);
   }
 
  if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::allocate()
 {
   // Create the per-type-pair arrays, each dimensioned (ntypes+1) x (ntypes+1),
   // and start every entry at zero.
   const int n = atom->ntypes + 1;
   allocated = 1;
 
   memory->create(setflag,n,n,"pair:setflag");
   memory->create(cutsq,n,n,"pair:cutsq");
   memory->create(tabindex,n,n,"pair:tabindex");
 
   for (int i = 0; i < n; i++) {
     for (int j = 0; j < n; j++) {
       setflag[i][j] = 0;
       cutsq[i][j] = 0.0;
       tabindex[i][j] = 0;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::settings(int narg, char **arg)
 {
   if (narg != 2) error->all(FLERR,"Illegal pair_style command");
 
   // new settings
 
   if (strcmp(arg[0],"lookup") == 0) tabstyle = LOOKUP;
   else if (strcmp(arg[0],"linear") == 0) tabstyle = LINEAR;
   else error->all(FLERR,"Unknown table style in pair_style command");
 
   tablength = force->inumeric(FLERR,arg[1]);
   if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
 
   // delete old tables, since cannot just change settings
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
     memory->destroy(tabindex);
   }
   allocated = 0;
 
   ntables = 0;
   tables = NULL;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::coeff(int narg, char **arg)
 {
   if (narg != 4 && narg != 5) error->all(FLERR,"Illegal pair_coeff command");
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   int me;
   MPI_Comm_rank(world,&me);
   tables = (Table *)
     memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
   Table *tb = &tables[ntables];
   null_table(tb);
   if (me == 0) read_table(tb,arg[2],arg[3]);
   bcast_table(tb);
 
   // set table cutoff
   if (narg == 5) tb->cut = force->numeric(FLERR,arg[4]);
   else if (tb->rflag) tb->cut = tb->rhi;
   else tb->cut = tb->rfile[tb->ninput-1];
 
   // error check on table parameters
   // insure cutoff is within table
 
   if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
   double rlo;
   if (tb->rflag == 0) {
     rlo = tb->rfile[0];
   } else {
     rlo = tb->rlo;
   }
   rho_0 = rlo;
 
   tb->match = 0;
   if (tabstyle == LINEAR && tb->ninput == tablength &&
       tb->rflag == RSQ) tb->match = 1;
 
   // spline read-in values and compute r,e,f vectors within table
 
   if (tb->match == 0) spline_table(tb);
   compute_table(tb);
 
   // store ptr to table in tabindex
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       tabindex[i][j] = ntables;
       setflag[i][j] = 1;
       count++;
     }
   }
 
   if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
   ntables++;
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairMultiLucy::init_one(int i, int j)
 {
   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
 
   tabindex[j][i] = tabindex[i][j];
 
   return tables[tabindex[i][j]].cut;
 }
 
 /* ----------------------------------------------------------------------
    read a table section from a tabulated potential file
    only called by proc 0
    this function sets these values in Table:
      ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::read_table(Table *tb, char *file, char *keyword)
 {
   // Read the section labelled `keyword` from table file `file` into tb.
   // The parameter line is parsed by param_extract(); tb->rfile, tb->efile
   // and tb->ffile are allocated here with tb->ninput entries each.
   // Every read is checked so that a truncated file produces an error
   // instead of silently re-parsing a stale line.
 
   char line[MAXLINE];
 
   // open file
 
   FILE *fp = fopen(file,"r");
   if (fp == NULL) {
     char str[128];
     // bounded formatting: a long path must not overrun the message buffer
     snprintf(str,sizeof(str),"Cannot open file %s",file);
     error->one(FLERR,str);
   }
 
   // loop until section found with matching keyword
 
   while (1) {
     if (fgets(line,MAXLINE,fp) == NULL)
       error->one(FLERR,"Did not find keyword in table file");
     if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
     if (line[0] == '#') continue;                          // comment
     char *word = strtok(line," \t\n\r");
     if (strcmp(word,keyword) == 0) break;           // matching keyword
 
     // no match, skip section: parameter line, one separator line, N data
     // lines; running out of file here means the keyword does not exist
 
     if (fgets(line,MAXLINE,fp) == NULL)
       error->one(FLERR,"Did not find keyword in table file");
     param_extract(tb,line);
     if (fgets(line,MAXLINE,fp) == NULL)
       error->one(FLERR,"Did not find keyword in table file");
     for (int i = 0; i < tb->ninput; i++)
       if (fgets(line,MAXLINE,fp) == NULL)
         error->one(FLERR,"Did not find keyword in table file");
   }
 
   // read args on 2nd line of section
   // allocate table arrays for file values
 
   if (fgets(line,MAXLINE,fp) == NULL)
     error->one(FLERR,"Premature end of file in pair table");
   param_extract(tb,line);
   memory->create(tb->rfile,tb->ninput,"pair:rfile");
   memory->create(tb->efile,tb->ninput,"pair:efile");
   memory->create(tb->ffile,tb->ninput,"pair:ffile");
 
   // read r,e,f table values from file
   // if rflag set, compute r
   // if rflag not set, use r from file
 
   int itmp;
   double rtmp;
 
   // one separator line sits between the parameter line and the data
   if (fgets(line,MAXLINE,fp) == NULL)
     error->one(FLERR,"Premature end of file in pair table");
   for (int i = 0; i < tb->ninput; i++) {
     if (fgets(line,MAXLINE,fp) == NULL)
       error->one(FLERR,"Premature end of file in pair table");
     sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
 
     if (tb->rflag == RLINEAR)
       rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
     else if (tb->rflag == RSQ) {
       rtmp = tb->rlo*tb->rlo +
         (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
       rtmp = sqrt(rtmp);
     }
 
     tb->rfile[i] = rtmp;
   }
 
   // close file
 
   fclose(fp);
 }
 
 /* ----------------------------------------------------------------------
    broadcast read-in table info from proc 0 to other procs
    this function communicates these values in Table:
      ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::bcast_table(Table *tb)
 {
   // Distribute the table read on rank 0 to all other ranks of `world`.
 
   // length first, so the receiving ranks can size their arrays
   MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
 
   int rank;
   MPI_Comm_rank(world,&rank);
   const bool is_root = (rank == 0);
   if (!is_root) {
     memory->create(tb->rfile,tb->ninput,"pair:rfile");
     memory->create(tb->efile,tb->ninput,"pair:efile");
     memory->create(tb->ffile,tb->ninput,"pair:ffile");
   }
 
   // the three file columns, in the order r, e, f
   double *columns[3] = {tb->rfile, tb->efile, tb->ffile};
   for (int k = 0; k < 3; k++) {
     MPI_Bcast(columns[k],tb->ninput,MPI_DOUBLE,0,world);
   }
 
   // the optional bounds are sent only when their flag is set
   MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
   if (tb->rflag != 0) {
     MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
   }
 
   MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
   if (tb->fpflag != 0) {
     MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
   }
 }
 
 /* ----------------------------------------------------------------------
    build spline representation of e,f over entire range of read-in table
    this function sets these values in Table: e2file,f2file
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::spline_table(Table *tb)
 {
   // Allocate and fill tb->e2file and tb->f2file by running spline() over
   // the e and f columns read from file.
   const int n = tb->ninput;
   memory->create(tb->e2file,n,"pair:e2file");
   memory->create(tb->f2file,n,"pair:f2file");
 
   // end slopes of e are taken as -f at the first and last entry
   const double eslope_lo = - tb->ffile[0];
   const double eslope_hi = - tb->ffile[n-1];
   spline(tb->rfile,tb->efile,n,eslope_lo,eslope_hi,tb->e2file);
 
   // without FP values from the file, estimate the end slopes of f by
   // finite differences over the first and last interval
   if (!tb->fpflag) {
     tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]);
     tb->fphi = (tb->ffile[n-1] - tb->ffile[n-2]) /
       (tb->rfile[n-1] - tb->rfile[n-2]);
   }
 
   const double fslope_lo = tb->fplo;
   const double fslope_hi = tb->fphi;
   spline(tb->rfile,tb->ffile,n,fslope_lo,fslope_hi,tb->f2file);
 }
 
 /* ----------------------------------------------------------------------
    extract attributes from parameter line in table section
    format of line: N value R/RSQ lo hi FP fplo fphi
    N is required, other params are optional
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::param_extract(Table *tb, char *line)
 {
   // Parse a parameter line of the form "N value R/RSQ lo hi FP fplo fphi".
   // tb->ninput, tb->rflag and tb->fpflag are reset before parsing; `line`
   // is modified in place by strtok().  A keyword that is not followed by
   // all of its values is reported as an error rather than dereferencing
   // the NULL token returned by strtok().
 
   tb->ninput = 0;
   tb->rflag = NONE;
   tb->fpflag = 0;
 
   char *word = strtok(line," \t\n\r\f");
   while (word) {
     if (strcmp(word,"N") == 0) {
       word = strtok(NULL," \t\n\r\f");
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->ninput = atoi(word);
     } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0) {
       if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
       else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
       word = strtok(NULL," \t\n\r\f");
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->rlo = atof(word);
       word = strtok(NULL," \t\n\r\f");
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->rhi = atof(word);
     } else if (strcmp(word,"FP") == 0) {
       tb->fpflag = 1;
       word = strtok(NULL," \t\n\r\f");
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->fplo = atof(word);
       word = strtok(NULL," \t\n\r\f");
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->fphi = atof(word);
     } else {
       printf("WORD: %s\n",word);
       error->one(FLERR,"Invalid keyword in pair table parameters");
     }
     word = strtok(NULL," \t\n\r\f");
   }
 
   if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
 }
 
 /* ----------------------------------------------------------------------
    compute r,e,f vectors from splined values
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::compute_table(Table *tb)
 {
   // Build the run-time arrays of tb (e,f and, for linear tables, rsq,de,df)
   // on a grid of tablength-1 evenly spaced bins in the squared table variable.
 
   int tlm1 = tablength-1;
 
   // inner = inner table bound
   // outer = outer table bound
   // delta = table spacing in rsq for N-1 bins
   // rlo/rhi are only set when an R/RSQ keyword was given; otherwise both
   // bounds come from the first and last value read from file
 
   double inner,outer;
   if (tb->rflag) {
     inner = tb->rlo;
     outer = tb->rhi;
   } else {
     inner = tb->rfile[0];
     outer = tb->rfile[tb->ninput-1];
   }
   tb->innersq = inner*inner;
   tb->delta = (outer*outer - tb->innersq) / tlm1;
   tb->invdelta = 1.0/tb->delta;
 
   // direct lookup tables
   // N-1 evenly spaced bins in rsq from inner to outer
   // e,f = value at midpt of bin
   // e,f are N-1 in length since store 1 value at bin midpt
   // e,f are never a match to read-in values, always computed via spline interp
 
   if (tabstyle == LOOKUP) {
     memory->create(tb->e,tlm1,"pair:e");
     memory->create(tb->f,tlm1,"pair:f");
 
     double r,rsq;
     for (int i = 0; i < tlm1; i++) {
       rsq = tb->innersq + (i+0.5)*tb->delta;
       r = sqrt(rsq);
       tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
       tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
     }
   }
 
   // linear tables
   // N-1 evenly spaced bins in rsq from inner to outer
   // rsq,e,f = value at lower edge of bin
   // de,df values = delta from lower edge to upper edge of bin
   // rsq,e,f are N in length so de,df arrays can compute difference
   // e,f can match read-in values, else compute via spline interp
 
   if (tabstyle == LINEAR) {
     memory->create(tb->rsq,tablength,"pair:rsq");
     memory->create(tb->e,tablength,"pair:e");
     memory->create(tb->f,tablength,"pair:f");
     memory->create(tb->de,tlm1,"pair:de");
     memory->create(tb->df,tlm1,"pair:df");
 
     double r,rsq;
     for (int i = 0; i < tablength; i++) {
       rsq = tb->innersq + i*tb->delta;
       r = sqrt(rsq);
       tb->rsq[i] = rsq;
       if (tb->match) {
         tb->e[i] = tb->efile[i];
         tb->f[i] = tb->ffile[i];
       } else {
         tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
         tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
       }
     }
 
     for (int i = 0; i < tlm1; i++) {
       tb->de[i] = tb->e[i+1] - tb->e[i];
       tb->df[i] = tb->f[i+1] - tb->f[i];
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    set all ptrs in a table to NULL, so can be freed safely
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::null_table(Table *tb)
 {
   // values read from file and their spline coefficients
   tb->ffile = NULL;
   tb->efile = NULL;
   tb->rfile = NULL;
   tb->f2file = NULL;
   tb->e2file = NULL;
 
   // arrays built for run-time lookup
   tb->de = NULL;
   tb->e = NULL;
   tb->drsq = NULL;
   tb->rsq = NULL;
   tb->f2 = NULL;
   tb->e2 = NULL;
   tb->df = NULL;
   tb->f = NULL;
 }
 
 /* ----------------------------------------------------------------------
    free all arrays in a table
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::free_table(Table *tb)
 {
   // Release every array a Table can own.

   // values read from file
   memory->destroy(tb->rfile);
   memory->destroy(tb->efile);
   memory->destroy(tb->ffile);

   // spline coefficients of the file values
   memory->destroy(tb->e2file);
   memory->destroy(tb->f2file);

   // run-time lookup arrays
   memory->destroy(tb->rsq);
   memory->destroy(tb->drsq);
   memory->destroy(tb->e);
   memory->destroy(tb->de);
   memory->destroy(tb->f);
   memory->destroy(tb->df);
   memory->destroy(tb->e2);
   memory->destroy(tb->f2);
 }
 
 /* ----------------------------------------------------------------------
    spline and splint routines modified from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::spline(double *x, double *y, int n,
                        double yp1, double ypn, double *y2)
 {
   // Fill y2[0..n-1] with spline second-derivative coefficients for the
   // points (x[i],y[i]).  yp1 and ypn are the end slopes; a value above
   // 0.99e30 selects a zero second derivative at that end instead.
   double *work = new double[n];
 
   // lower boundary
   if (yp1 > 0.99e30) {
     y2[0] = work[0] = 0.0;
   } else {
     const double dx0 = x[1]-x[0];
     y2[0] = -0.5;
     work[0] = (3.0/dx0) * ((y[1]-y[0]) / dx0 - yp1);
   }
 
   // forward sweep of the tridiagonal decomposition
   for (int i = 1; i < n-1; i++) {
     const double span = x[i+1]-x[i-1];
     const double sig = (x[i]-x[i-1]) / span;
     const double p = sig*y2[i-1] + 2.0;
     y2[i] = (sig-1.0) / p;
     const double dslope =
       (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
     work[i] = (6.0*dslope / span - sig*work[i-1]) / p;
   }
 
   // upper boundary
   double qn,un;
   if (ypn > 0.99e30) {
     qn = un = 0.0;
   } else {
     const double dxn = x[n-1]-x[n-2];
     qn = 0.5;
     un = (3.0/dxn) * (ypn - (y[n-1]-y[n-2]) / dxn);
   }
   y2[n-1] = (un-qn*work[n-2]) / (qn*y2[n-2] + 1.0);
 
   // back substitution
   for (int k = n-2; k >= 0; k--) {
     y2[k] = y2[k]*y2[k+1] + work[k];
   }
 
   delete [] work;
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairMultiLucy::splint(double *xa, double *ya, double *y2a, int n, double x)
 {
   // Evaluate the spline given by (xa,ya) and coefficients y2a at x.
 
   // bisection for the interval [lo,hi] of adjacent entries bracketing x
   int lo = 0;
   int hi = n-1;
   while (hi-lo > 1) {
     const int mid = (hi+lo) >> 1;
     if (xa[mid] > x) hi = mid;
     else lo = mid;
   }
 
   const double h = xa[hi]-xa[lo];
   const double a = (xa[hi]-x) / h;
   const double b = (x-xa[lo]) / h;
   return a*ya[lo] + b*ya[hi] +
     ((a*a*a-a)*y2a[lo] + (b*b*b-b)*y2a[hi]) * (h*h)/6.0;
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::write_restart(FILE *fp)
 {
   // only the global settings are written; nothing else is stored here
   write_restart_settings(fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::read_restart(FILE *fp)
 {
   // restore the global settings, then create the per-type arrays
   read_restart_settings(fp);
   allocate();
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::write_restart_settings(FILE *fp)
 {
   // raw binary dump of the two pair_style settings: tabstyle, then tablength
   fwrite(&tabstyle,sizeof(int),1,fp);
   fwrite(&tablength,sizeof(int),1,fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairMultiLucy::read_restart_settings(FILE *fp)
 {
   // only rank 0 touches the file; values are read in the order
   // write_restart_settings() wrote them
   // NOTE(review): the fread return values are not checked
   if (comm->me == 0) {
     fread(&tabstyle,sizeof(int),1,fp);
     fread(&tablength,sizeof(int),1,fp);
   }
   // hand both settings to every other rank
   MPI_Bcast(&tabstyle,1,MPI_INT,0,world);
   MPI_Bcast(&tablength,1,MPI_INT,0,world);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucy::computeLocalDensity()
 {
   int i,j,m,ii,jj,inum,jnum,itype,jtype;
   double xtmp,ytmp,ztmp,delx,dely,delz;
   double rsq;
   int *ilist,*jlist,*numneigh,**firstneigh;
 
   double **x = atom->x;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
   double factor;
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   double pi = MathConst::MY_PI;
   double *rho = atom->rho;
 
  // zero out density
 
   if (newton_pair) {
     m = nlocal + atom->nghost;
     for (i = 0; i < m; i++) rho[i] = 0.0;
   } else for (i = 0; i < nlocal; i++) rho[i] = 0.0;
 
 // rho = density at each atom
 // loop over neighbors of my atoms
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
       jtype = type[j];
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
 
       if (rsq < cutsq[itype][jtype]) {
         double rcut = sqrt(cutsq[itype][jtype]);
         factor= (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*(1.0-sqrt(rsq)/rcut)*(1.0-sqrt(rsq)/rcut)*(1.0-sqrt(rsq)/rcut)*(1.0-sqrt(rsq)/rcut);
         rho[i] += factor;
         if (newton_pair || j < nlocal) {
           rho[j] += factor;
         }
       }
     }
   }
   if (newton_pair) comm->reverse_comm_pair(this);
 
   comm->forward_comm_pair(this);
 
 }
 /* ---------------------------------------------------------------------- */
 
 int PairMultiLucy::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
 {
   // Copy rho of the n atoms indexed by list[] into buf; returns the number
   // of values packed.  pbc_flag and pbc are not used.
   const double *rho = atom->rho;
 
   int npacked = 0;
   for (int k = 0; k < n; k++) {
     buf[npacked] = rho[list[k]];
     npacked++;
   }
   return npacked;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucy::unpack_forward_comm(int n, int first, double *buf)
 {
   // Overwrite rho of the n consecutive atoms starting at `first` with the
   // values in buf.
   double *rho = atom->rho;
 
   for (int k = 0; k < n; k++) {
     rho[first+k] = buf[k];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 int PairMultiLucy::pack_reverse_comm(int n, int first, double *buf)
 {
   // Copy rho of the n consecutive atoms starting at `first` into buf;
   // returns the number of values packed.
   const double *rho = atom->rho;
 
   int npacked = 0;
   for (int k = 0; k < n; k++) {
     buf[npacked] = rho[first+k];
     npacked++;
   }
   return npacked;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucy::unpack_reverse_comm(int n, int *list, double *buf)
 {
   // Add the n received values in buf onto rho of the atoms indexed by list[].
   double *rho = atom->rho;
 
   for (int k = 0; k < n; k++) {
     rho[list[k]] += buf[k];
   }
 }
diff --git a/src/USER-DPD/pair_multi_lucy_rx.cpp b/src/USER-DPD/pair_multi_lucy_rx.cpp
index 0a1bb4b10..de8673372 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.cpp
+++ b/src/USER-DPD/pair_multi_lucy_rx.cpp
@@ -1,1023 +1,1023 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------------------------
    Contributing authors:
    James Larentzos and Joshua Moore (U.S. Army Research Laboratory)
 
    Please cite the related publications:
    J.D. Moore, B.C. Barnes, S. Izvekov, M. Lisal, M.S. Sellers, D.E. Taylor & J.K. Brennan
    "A coarse-grain force field for RDX: Density dependent and energy conserving"
    The Journal of Chemical Physics, 2016, 144, 104501.
 ------------------------------------------------------------------------------------------- */
 
-#include "mpi.h"
+#include <mpi.h>
 #include <math.h>
 #include "math_const.h"
 #include <stdlib.h>
 #include <string.h>
 #include "pair_multi_lucy_rx.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 #include "citeme.h"
 #include "modify.h"
 #include "fix.h"
 
 using namespace LAMMPS_NS;
 
 // values stored in Table::rflag by param_extract():
 // NONE = no R/RSQ keyword given, RLINEAR = "R" keyword, RSQ = "RSQ" keyword
 enum{NONE,RLINEAR,RSQ};
 
 // size of the line buffer used by read_table()
 #define MAXLINE 1024
 
 // site index assigned in coeff() when a site name is the keyword "1fluid"
 #define oneFluidParameter (-1)
 // true when _site holds the "1fluid" sentinel rather than a species index
 #define isOneFluid(_site) ( (_site) == oneFluidParameter )
 
 // citation text handed to citeme->add() in the constructor
 // BibTeX requires authors separated by " and " and a comma after every
 // field except the last one
 static const char cite_pair_multi_lucy_rx[] =
   "pair_style multi/lucy/rx command:\n\n"
   "@Article{Moore16,\n"
   " author = {J.D. Moore and B.C. Barnes and S. Izvekov and M. Lisal and M.S. Sellers and D.E. Taylor and J. K. Brennan},\n"
   " title = {A coarse-grain force field for RDX:  Density dependent and energy conserving},\n"
   " journal = {J. Chem. Phys.},\n"
   " year =    2016,\n"
   " volume =  144,\n"
   " pages =   {104501}\n"
   "}\n\n";
 
 /* ---------------------------------------------------------------------- */
 
 PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp)
 {
   // register the citation text only when a citeme object exists
   if (lmp->citeme) {
     lmp->citeme->add(cite_pair_multi_lucy_rx);
   }
 
   // this style reads and writes atom->rho, so rho_flag must be exactly 1
   const bool has_density = (atom->rho_flag == 1);
   if (!has_density) {
     error->all(FLERR,"Pair multi/lucy/rx command requires atom_style with density (e.g. dpd, meso)");
   }
 
   // no tables exist until coeff() appends one
   tables = NULL;
   ntables = 0;
 
   // the pack/unpack routines of this class move one value (rho) per atom
   comm_reverse = 1;
   comm_forward = 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairMultiLucyRX::~PairMultiLucyRX()
 {
   // release the arrays inside every table, then the table list itself
   int m = 0;
   while (m < ntables) {
     free_table(tables + m);
     ++m;
   }
   memory->sfree(tables);
 
   // per-type arrays exist only after allocate() has run
   if (!allocated) return;
 
   memory->destroy(setflag);
   memory->destroy(cutsq);
   memory->destroy(tabindex);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucyRX::compute(int eflag, int vflag)
 {
   // Computes pair forces whose strength A is looked up in a table as a
   // function of the squared local density rho*rho of each atom, and adds a
   // per-atom tabulated energy to atom->uCG and atom->uCGnew.
   // eflag/vflag are passed to ev_setup() to select energy/virial tallying.
   int i,j,ii,jj,inum,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq;
   int *ilist,*jlist,*numneigh,**firstneigh;
   Table *tb;
 
   // largest bin index that is still accepted by the range checks below
   int tlm1 = tablength - 1;
 
   evdwlOld = 0.0;
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int nghost = atom->nghost;
   int newton_pair = force->newton_pair;
 
   double fractionOld1_i,fractionOld1_j;
   double fractionOld2_i,fractionOld2_j;
   double fraction1_i;
   double *uCG = atom->uCG;
   double *uCGnew = atom->uCGnew;
 
   double pi = MathConst::MY_PI;
   double A_i, A_j;
   double fraction_i,fraction_j;
   int jtable;
   double *rho = atom->rho;
 
   double *fractionOld1 = NULL;
   double *fractionOld2 = NULL;
   double *fraction1 = NULL;
   double *fraction2 = NULL;
 
   // per-atom site fractions for local and ghost atoms, filled by getParams();
   // the four scratch arrays are released at the end of this function
   {
     const int ntotal = nlocal + nghost;
     memory->create(fractionOld1, ntotal, "PairMultiLucyRX::fractionOld1");
     memory->create(fractionOld2, ntotal, "PairMultiLucyRX::fractionOld2");
     memory->create(fraction1, ntotal, "PairMultiLucyRX::fraction1");
     memory->create(fraction2, ntotal, "PairMultiLucyRX::fraction2");
 
     for (int i = 0; i < ntotal; ++i)
        getParams(i, fractionOld1[i], fractionOld2[i], fraction1[i], fraction2[i]);
   }
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   // fills atom->rho (including the forward communication to ghost atoms)
   computeLocalDensity();
 
   // loop over neighbors of my atoms
 
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     // force on atom i is accumulated locally and added to f[i] after the j loop
     double fx_i = 0.0;
     double fy_i = 0.0;
     double fz_i = 0.0;
 
     fractionOld1_i = fractionOld1[i];
     fractionOld2_i = fractionOld2[i];
     fraction1_i = fraction1[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
         fpair = 0.0;
 
         fractionOld1_j = fractionOld1[j];
         fractionOld2_j = fractionOld2[j];
 
         // the table is indexed by squared density, so both atoms must lie
         // at or above the table's inner bound
         tb = &tables[tabindex[itype][jtype]];
         if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
           printf("Table inner cutoff = %lf\n",sqrt(tb->innersq));
           printf("rho[%d]=%lf\n",i,rho[i]);
           printf("rho[%d]=%lf\n",j,rho[j]);
           error->one(FLERR,"Density < table inner cutoff");
         }
         if (tabstyle == LOOKUP) {
           // direct lookup: take the stored value of the bin containing rho*rho
           itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
           jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
           if (itable >= tlm1 || jtable >= tlm1){
             printf("Table outer index = %d\n",tlm1);
             printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
             printf("jtableIndex=%d rho[%d]=%lf\n",jtable,j,rho[j]);
             error->one(FLERR,"Density > table outer cutoff");
           }
           A_i = tb->f[itable];
           A_j = tb->f[jtable];
 
           // rfactor = 1 - r/rcut; fpair ends up divided by r so that it can
           // multiply delx,dely,delz directly
           const double rfactor = 1.0-sqrt(rsq/cutsq[itype][jtype]);
           fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
           fpair /= sqrt(rsq);
 
         } else if (tabstyle == LINEAR) {
           // linear interpolation between the lower and upper edge of the bin
           itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
           jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
           if (itable >= tlm1 || jtable >= tlm1){
             printf("Table outer index = %d\n",tlm1);
             printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
             printf("jtableIndex=%d rho[%d]=%lf\n",jtable,j,rho[j]);
             error->one(FLERR,"Density > table outer cutoff");
           }
           // NOTE(review): these clamps look unreachable if error->one() above
           // never returns and invdelta is positive — confirm before removing
           if(itable<0) itable=0;
           if(itable>=tlm1) itable=tlm1;
           if(jtable<0) jtable=0;
           if(jtable>=tlm1)jtable=tlm1;
 
           fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
           fraction_j = (((rho[j]*rho[j]) - tb->rsq[jtable]) * tb->invdelta);
           // no interpolation is applied in the first bin (or at index tlm1)
           if(itable==0) fraction_i=0.0;
           if(itable==tlm1) fraction_i=0.0;
           if(jtable==0) fraction_j=0.0;
           if(jtable==tlm1) fraction_j=0.0;
 
           A_i = tb->f[itable] + fraction_i*tb->df[itable];
           A_j = tb->f[jtable] + fraction_j*tb->df[jtable];
 
           const double rfactor = 1.0-sqrt(rsq/cutsq[itype][jtype]);
           fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
           fpair /= sqrt(rsq);
 
         } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
 
         // weight the force by the "old" site fractions of the two atoms;
         // when the two sites differ both i/j assignments are summed
         if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair; 
         else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
 
         fx_i += delx*fpair;
         fy_i += dely*fpair;
         fz_i += delz*fpair;
         if (newton_pair || j < nlocal) {
           f[j][0] -= delx*fpair;
           f[j][1] -= dely*fpair;
           f[j][2] -= delz*fpair;
         }
         // pair energy is tallied as zero here; the energy is added per atom below
         if (evflag) ev_tally(i,j,nlocal,newton_pair,0.0,0.0,fpair,delx,dely,delz);
       }
     }
 
     f[i][0] += fx_i;
     f[i][1] += fy_i;
     f[i][2] += fz_i;
 
     // per-atom energy from the table assigned to the (itype,itype) pair
     // NOTE(review): for LOOKUP the index itable is used without a range
     // check in this section — confirm it cannot exceed the table
     tb = &tables[tabindex[itype][itype]];
     itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
     if (tabstyle == LOOKUP) evdwl = tb->e[itable];
     else if (tabstyle == LINEAR){
       if (itable >= tlm1){
         printf("itableIndex=%d rho[%d]=%lf\n",itable,i,rho[i]);
         error->one(FLERR,"Density > table outer cutoff");
       }
       if(itable==0) fraction_i=0.0;
       else fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
       evdwl = tb->e[itable] + fraction_i*tb->de[itable];
     } else error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
 
     // scale by pi*rcut^4/84 (cutsq is rcut^2), then split into the value
     // weighted by the old fraction (uCG) and by the new fraction (uCGnew)
     evdwl *=(pi*cutsq[itype][itype]*cutsq[itype][itype])/84.0;
     evdwlOld = fractionOld1_i*evdwl;
     evdwl = fraction1_i*evdwl;
 
     uCG[i] += evdwlOld;
     uCGnew[i] += evdwl;
 
     // the energy tallied for this step is the old-fraction value
     evdwl = evdwlOld;
 
     if (evflag) ev_tally(0,0,nlocal,newton_pair,evdwl,0.0,0.0,0.0,0.0,0.0);
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
 
   memory->destroy(fractionOld1);
   memory->destroy(fractionOld2);
   memory->destroy(fraction1);
   memory->destroy(fraction2);
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::allocate()
 {
   // arrays are (ntypes+1) x (ntypes+1) so they can be indexed by atom type
   const int ntp1 = atom->ntypes + 1;
   const int nentries = ntp1*ntp1;
 
   // create each array and zero it through the address of its first element
   memory->create(setflag,ntp1,ntp1,"pair:setflag");
   memset(&setflag[0][0],0,nentries*sizeof(int));
 
   memory->create(cutsq,ntp1,ntp1,"pair:cutsq");
   memset(&cutsq[0][0],0,nentries*sizeof(double));
 
   memory->create(tabindex,ntp1,ntp1,"pair:tabindex");
   memset(&tabindex[0][0],0,nentries*sizeof(int));
 
   allocated = 1;
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::settings(int narg, char **arg)
 {
   if (narg != 2) error->all(FLERR,"Illegal pair_style command");
 
   // new settings
 
   if (strcmp(arg[0],"lookup") == 0) tabstyle = LOOKUP;
   else if (strcmp(arg[0],"linear") == 0) tabstyle = LINEAR;
   else error->all(FLERR,"Unknown table style in pair_style command");
 
   tablength = force->inumeric(FLERR,arg[1]);
   if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
 
   // delete old tables, since cannot just change settings
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
     memory->destroy(tabindex);
   }
   allocated = 0;
 
   ntables = 0;
   tables = NULL;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::coeff(int narg, char **arg)
 {
   // arg[0],arg[1] = atom type ranges, arg[2] = table file name,
   // arg[3] = section keyword inside that file, arg[4] = site1 name,
   // arg[5] = site2 name, arg[6] = optional table cutoff
   if (narg != 6 && narg != 7) error->all(FLERR,"Illegal pair_coeff command");
 
   // a fix whose style name starts with "rx" must already be defined
   bool rx_flag = false;
   for (int i = 0; i < modify->nfix; i++)
     if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
   if (!rx_flag) error->all(FLERR,"PairMultiLucyRX requires a fix rx command.");
 
   if (!allocated) allocate();
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   // append one table: proc 0 reads it from file, then it is broadcast
   int me;
   MPI_Comm_rank(world,&me);
   tables = (Table *)
     memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
   Table *tb = &tables[ntables];
   null_table(tb);
   if (me == 0) read_table(tb,arg[2],arg[3]);
   bcast_table(tb);
 
   nspecies = atom->nspecies_dpd;
 
   // each site buffer must be sized from the string that is copied into it
   // (the lengths of arg[3]/arg[4] were used before, which could overflow)
   // NOTE(review): buffers from an earlier pair_coeff call are not released
   // here — confirm where site1/site2 are initialized before adding delete[]
   site1 = new char[strlen(arg[4]) + 1];
   strcpy(site1,arg[4]);
 
   site2 = new char[strlen(arg[5]) + 1];
   strcpy(site2,arg[5]);
 
   // set table cutoff
 
   if (narg == 7) tb->cut = force->numeric(FLERR,arg[6]);
   else if (tb->rflag) tb->cut = tb->rhi;
   else tb->cut = tb->rfile[tb->ninput-1];
 
   // error check on table parameters
   // ensure cutoff is within table
 
   if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
   if (tb->rflag == 0) {
     rho_0 = tb->rfile[0];
   } else {
     rho_0 = tb->rlo;
   }
 
   // file values can be used as-is only for a LINEAR table of matching
   // length that was given in RSQ form
   tb->match = 0;
   if (tabstyle == LINEAR && tb->ninput == tablength &&
       tb->rflag == RSQ) tb->match = 1;
 
   // spline read-in values and compute r,e,f vectors within table
 
   if (tb->match == 0) spline_table(tb);
   compute_table(tb);
 
   // store index of the new table in tabindex for every i <= j type pair
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       tabindex[i][j] = ntables;
       setflag[i][j] = 1;
       count++;
     }
   }
 
   if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
   ntables++;
 
   // Match site* to isite values.
   // "1fluid" maps to the sentinel; any other name must equal one of
   // atom->dname[0..nspecies-1], else isite stays at nspecies and is rejected
 
   if (strcmp(site1, "1fluid") == 0)
      isite1 = oneFluidParameter;
   else {
      isite1 = nspecies;
      for (int ispecies = 0; ispecies < nspecies; ++ispecies)
         if (strcmp(site1, atom->dname[ispecies]) == 0){
            isite1 = ispecies;
            break;
         }
 
      if (isite1 == nspecies)
         error->all(FLERR,"Pair_multi_lucy_rx site1 is invalid.");
   }
 
   if (strcmp(site2, "1fluid") == 0)
      isite2 = oneFluidParameter;
   else {
      isite2 = nspecies;
      for (int ispecies = 0; ispecies < nspecies; ++ispecies)
         if (strcmp(site2, atom->dname[ispecies]) == 0){
            isite2 = ispecies;
            break;
         }
 
      if (isite2 == nspecies)
         error->all(FLERR,"Pair_multi_lucy_rx site2 is invalid.");
   }
 
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairMultiLucyRX::init_one(int i, int j)
 {
   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
 
   tabindex[j][i] = tabindex[i][j];
 
   return tables[tabindex[i][j]].cut;
 }
 
 /* ----------------------------------------------------------------------
    read a table section from a tabulated potential file
    only called by proc 0
    this function sets these values in Table:
      ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::read_table(Table *tb, char *file, char *keyword)
 {
   // tb      = table that receives ninput, rfile, efile, ffile and the
   //           flags set by param_extract()
   // file    = name handed to force->open_potential()
   // keyword = first word of the section to read
   char line[MAXLINE];
 
   // open file
 
   FILE *fp = force->open_potential(file);
   if (fp == NULL) {
     // bounded formatting: a long file name is truncated instead of
     // overflowing the message buffer
     char str[128];
     snprintf(str,sizeof(str),"Cannot open file %s",file);
     error->one(FLERR,str);
   }
 
   // loop until section found with matching keyword
   // NOTE(review): apart from the first fgets of each pass, fgets return
   // values are not checked, so a truncated file is not detected here
 
   while (1) {
     if (fgets(line,MAXLINE,fp) == NULL)
       error->one(FLERR,"Did not find keyword in table file");
     if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
     if (line[0] == '#') continue;                          // comment
     char *word = strtok(line," \t\n\r");
     if (strcmp(word,keyword) == 0) break;           // matching keyword
     fgets(line,MAXLINE,fp);                         // no match, skip section
     param_extract(tb,line);
     fgets(line,MAXLINE,fp);
     for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp);
   }
 
   // read args on 2nd line of section
   // allocate table arrays for file values
 
   fgets(line,MAXLINE,fp);
   param_extract(tb,line);
   memory->create(tb->rfile,tb->ninput,"pair:rfile");
   memory->create(tb->efile,tb->ninput,"pair:efile");
   memory->create(tb->ffile,tb->ninput,"pair:ffile");
 
   // read r,e,f table values from file
   // if rflag set, compute r
   // if rflag not set, use r from file
 
   int itmp;
   double rtmp;
 
   // one line is skipped between the parameter line and the data lines
   fgets(line,MAXLINE,fp);
   for (int i = 0; i < tb->ninput; i++) {
     fgets(line,MAXLINE,fp);
     sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
 
     if (tb->rflag == RLINEAR)
       rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
     else if (tb->rflag == RSQ) {
       // evenly spaced in r^2 between rlo^2 and rhi^2
       rtmp = tb->rlo*tb->rlo +
         (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
       rtmp = sqrt(rtmp);
     }
 
     tb->rfile[i] = rtmp;
   }
 
   // close file
 
   fclose(fp);
 }
 
 /* ----------------------------------------------------------------------
    broadcast read-in table info from proc 0 to other procs
    this function communicates these values in Table:
      ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::bcast_table(Table *tb)
 {
   // the length goes first so that receivers can size their arrays
   MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
 
   int rank;
   MPI_Comm_rank(world,&rank);
   const bool is_root = (rank == 0);
   if (!is_root) {
     // proc 0 created these arrays in read_table(); all other procs do it here
     memory->create(tb->rfile,tb->ninput,"pair:rfile");
     memory->create(tb->efile,tb->ninput,"pair:efile");
     memory->create(tb->ffile,tb->ninput,"pair:ffile");
   }
 
   const int nvalues = tb->ninput;
   MPI_Bcast(tb->rfile,nvalues,MPI_DOUBLE,0,world);
   MPI_Bcast(tb->efile,nvalues,MPI_DOUBLE,0,world);
   MPI_Bcast(tb->ffile,nvalues,MPI_DOUBLE,0,world);
 
   // rlo/rhi are sent only when an R/RSQ keyword was given
   MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
   if (tb->rflag != 0) {
     MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
   }
 
   // fplo/fphi are sent only when the FP keyword was given
   MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
   if (tb->fpflag != 0) {
     MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
   }
 }
 
 /* ----------------------------------------------------------------------
    build spline representation of e,f over entire range of read-in table
    this function sets these values in Table: e2file,f2file
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::spline_table(Table *tb)
 {
   const int npts = tb->ninput;
   const int last = npts - 1;
 
   memory->create(tb->e2file,npts,"pair:e2file");
   memory->create(tb->f2file,npts,"pair:f2file");
 
   // energy spline: end-point derivatives are the negated end-point forces
   const double ep_first = - tb->ffile[0];
   const double ep_last = - tb->ffile[last];
   spline(tb->rfile,tb->efile,npts,ep_first,ep_last,tb->e2file);
 
   // force spline: without the FP keyword the end-point derivatives are
   // estimated from the first and last pair of tabulated points
   if (!tb->fpflag) {
     tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]);
     tb->fphi = (tb->ffile[last] - tb->ffile[last-1]) /
       (tb->rfile[last] - tb->rfile[last-1]);
   }
 
   spline(tb->rfile,tb->ffile,npts,tb->fplo,tb->fphi,tb->f2file);
 }
 
 /* ----------------------------------------------------------------------
    extract attributes from parameter line in table section
    format of line: N value R/RSQ lo hi FP fplo fphi
    N is required, other params are optional
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::param_extract(Table *tb, char *line)
 {
   // tb   = table whose ninput, rflag, rlo, rhi, fpflag, fplo, fphi are set
   // line = parameter line; it is modified in place by strtok()
   static const char delims[] = " \t\n\r\f";
   static const char missing[] = "Missing value in pair table parameters";
 
   tb->ninput = 0;
   tb->rflag = NONE;
   tb->fpflag = 0;
 
   char *word = strtok(line,delims);
   while (word) {
     // every value token is checked before conversion: a keyword that ends
     // the line would otherwise hand a NULL pointer to atoi()/atof()
     if (strcmp(word,"N") == 0) {
       word = strtok(NULL,delims);
       if (word == NULL) error->one(FLERR,missing);
       tb->ninput = atoi(word);
     } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0) {
       if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
       else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
       word = strtok(NULL,delims);
       if (word == NULL) error->one(FLERR,missing);
       tb->rlo = atof(word);
       word = strtok(NULL,delims);
       if (word == NULL) error->one(FLERR,missing);
       tb->rhi = atof(word);
     } else if (strcmp(word,"FP") == 0) {
       tb->fpflag = 1;
       word = strtok(NULL,delims);
       if (word == NULL) error->one(FLERR,missing);
       tb->fplo = atof(word);
       word = strtok(NULL,delims);
       if (word == NULL) error->one(FLERR,missing);
       tb->fphi = atof(word);
     } else {
       printf("WORD: %s\n",word);
       error->one(FLERR,"Invalid keyword in pair table parameters");
     }
     word = strtok(NULL,delims);
   }
 
   // N is the only required keyword
   if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
 }
 
 /* ----------------------------------------------------------------------
    compute r,e,f vectors from splined values
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::compute_table(Table *tb)
 {
   // number of bins; a table of tablength points has tablength-1 bins
   const int nbins = tablength-1;
 
   // inner table bound: rlo when an R/RSQ keyword was given, otherwise the
   // first abscissa read from the file
   const double inner = tb->rflag ? tb->rlo : tb->rfile[0];
   tb->innersq = inner*inner;
 
   // bin width in the squared table variable, spanning innersq .. rhi^2
   // NOTE(review): rhi is assigned by param_extract() only for R/RSQ
   // sections — confirm it holds a valid value when rflag is NONE
   tb->delta = (tb->rhi*tb->rhi - tb->innersq) / nbins;
   tb->invdelta = 1.0/tb->delta;
 
   // direct lookup tables
   // e,f hold one splined value per bin, evaluated at the bin midpoint,
   // so both arrays have nbins entries
 
   if (tabstyle == LOOKUP) {
     memory->create(tb->e,nbins,"pair:e");
     memory->create(tb->f,nbins,"pair:f");
 
     for (int ibin = 0; ibin < nbins; ++ibin) {
       const double midsq = tb->innersq + (ibin+0.5)*tb->delta;
       const double mid = sqrt(midsq);
       tb->e[ibin] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,mid);
       tb->f[ibin] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,mid);
     }
   }
 
   // linear tables
   // rsq,e,f hold the value at the lower edge of each bin plus the final
   // upper edge (tablength entries); de,df hold the change across each bin
   // when tb->match is set the file values are copied instead of splined
 
   if (tabstyle == LINEAR) {
     memory->create(tb->rsq,tablength,"pair:rsq");
     memory->create(tb->e,tablength,"pair:e");
     memory->create(tb->f,tablength,"pair:f");
     memory->create(tb->de,nbins,"pair:de");
     memory->create(tb->df,nbins,"pair:df");
 
     for (int ipt = 0; ipt < tablength; ++ipt) {
       const double edgesq = tb->innersq + ipt*tb->delta;
       tb->rsq[ipt] = edgesq;
 
       if (tb->match) {
         tb->e[ipt] = tb->efile[ipt];
         tb->f[ipt] = tb->ffile[ipt];
         continue;
       }
 
       const double edge = sqrt(edgesq);
       tb->e[ipt] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,edge);
       tb->f[ipt] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,edge);
     }
 
     for (int ibin = 0; ibin < nbins; ++ibin) {
       const int next = ibin+1;
       tb->de[ibin] = tb->e[next] - tb->e[ibin];
       tb->df[ibin] = tb->f[next] - tb->f[ibin];
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    set all ptrs in a table to NULL, so can be freed safely
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::null_table(Table *tb)
 {
   // values read from file and their spline coefficients
   tb->rfile = NULL;
   tb->efile = NULL;
   tb->ffile = NULL;
   tb->e2file = NULL;
   tb->f2file = NULL;
 
   // arrays of the computed table
   tb->rsq = NULL;
   tb->drsq = NULL;
   tb->e = NULL;
   tb->de = NULL;
   tb->f = NULL;
   tb->df = NULL;
   tb->e2 = NULL;
   tb->f2 = NULL;
 }
 
 /* ----------------------------------------------------------------------
    free all arrays in a table
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::free_table(Table *tb)
 {
   // arrays of the computed table
   memory->destroy(tb->rsq);
   memory->destroy(tb->drsq);
   memory->destroy(tb->e);
   memory->destroy(tb->de);
   memory->destroy(tb->f);
   memory->destroy(tb->df);
   memory->destroy(tb->e2);
   memory->destroy(tb->f2);
 
   // values read from file and their spline coefficients
   memory->destroy(tb->rfile);
   memory->destroy(tb->efile);
   memory->destroy(tb->ffile);
   memory->destroy(tb->e2file);
   memory->destroy(tb->f2file);
 }
 
 /* ----------------------------------------------------------------------
    spline and splint routines modified from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::spline(double *x, double *y, int n,
                        double yp1, double ypn, double *y2)
 {
   // x,y = n tabulated points; yp1,ypn = first derivative at the first and
   // last point (a value above 0.99e30 selects a zero second derivative
   // there instead); y2 = output, second derivatives at the n points
   double *work = new double[n];
   const int last = n-1;
 
   // boundary condition at the first point
   const bool natural_lo = (yp1 > 0.99e30);
   if (natural_lo) {
     y2[0] = work[0] = 0.0;
   } else {
     y2[0] = -0.5;
     work[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
   }
 
   // forward sweep of the tridiagonal system over the interior points
   int i = 1;
   while (i < last) {
     const double sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]);
     const double p = sig*y2[i-1] + 2.0;
     y2[i] = (sig-1.0) / p;
     const double slope_diff =
       (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
     work[i] = (6.0*slope_diff / (x[i+1]-x[i-1]) - sig*work[i-1]) / p;
     ++i;
   }
 
   // boundary condition at the last point
   double qn,un;
   const bool natural_hi = (ypn > 0.99e30);
   if (natural_hi) {
     qn = un = 0.0;
   } else {
     qn = 0.5;
     un = (3.0/(x[last]-x[last-1])) * (ypn - (y[last]-y[last-1]) / (x[last]-x[last-1]));
   }
   y2[last] = (un-qn*work[last-1]) / (qn*y2[last-1] + 1.0);
 
   // back substitution
   for (int k = last-1; k >= 0; --k) y2[k] = y2[k]*y2[k+1] + work[k];
 
   delete [] work;
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairMultiLucyRX::splint(double *xa, double *ya, double *y2a, int n, double x)
 {
   // xa,ya = n tabulated points; y2a = second derivatives from spline();
   // returns the cubic-spline value at x
 
   // bisection for the pair of adjacent table points used for x
   int lo = 0;
   int hi = n-1;
   while (hi-lo > 1) {
     const int mid = (hi+lo) >> 1;
     if (xa[mid] > x) hi = mid;
     else lo = mid;
   }
 
   // a and b are the linear weights of the lower and upper point
   const double h = xa[hi]-xa[lo];
   const double a = (xa[hi]-x) / h;
   const double b = (x-xa[lo]) / h;
 
   return a*ya[lo] + b*ya[hi] +
     ((a*a*a-a)*y2a[lo] + (b*b*b-b)*y2a[hi]) * (h*h)/6.0;
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::write_restart(FILE *fp)
 {
   // only the global settings are written; no table data is stored here
   write_restart_settings(fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::read_restart(FILE *fp)
 {
   // restore the global settings, then create the zeroed per-type arrays;
   // no table is read back here
   read_restart_settings(fp);
   allocate();
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::write_restart_settings(FILE *fp)
 {
   // written as two raw ints, in the order read_restart_settings() reads them
   fwrite(&tabstyle,sizeof(int),1,fp);
   fwrite(&tablength,sizeof(int),1,fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairMultiLucyRX::read_restart_settings(FILE *fp)
 {
   // only proc 0 reads the two ints from the file
   const bool reader = (comm->me == 0);
   if (reader) {
     fread(&tabstyle,sizeof(int),1,fp);
     fread(&tablength,sizeof(int),1,fp);
   }
 
   // all procs take part in both broadcasts, table style first
   MPI_Bcast(&tabstyle,1,MPI_INT,0,world);
   MPI_Bcast(&tablength,1,MPI_INT,0,world);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucyRX::computeLocalDensity()
 {
   // Fills atom->rho: every neighbor pair inside its cutoff adds the weight
   //   84/(5*pi*rcut^3) * (1 + 1.5*r/rcut) * (1 - r/rcut)^4
   // to both atoms of the pair, then rho is communicated between procs.
   double **x = atom->x;
   const int *type = atom->type;
   const int nlocal = atom->nlocal;
 
   const int inum = list->inum;
   const int *ilist = list->ilist;
   const int *numneigh = list->numneigh;
         int **firstneigh = list->firstneigh;
 
   const double pi = MathConst::MY_PI;
 
   const bool newton_pair = force->newton_pair;
   const bool one_type = (atom->ntypes == 1);
 
   // Special cut-off values for when there's only one type.
   const double cutsq_type11 = cutsq[1][1];
   const double rcut_type11 = sqrt(cutsq_type11);
   const double factor_type11 = 84.0/(5.0*pi*rcut_type11*rcut_type11*rcut_type11);
 
   double *rho = atom->rho;
 
   // zero out density
   // with newton_pair on, ghost atoms also accumulate and must start at zero
   if (newton_pair) {
     const int m = nlocal + atom->nghost;
     for (int i = 0; i < m; i++) rho[i] = 0.0;
   }
   else
     for (int i = 0; i < nlocal; i++) rho[i] = 0.0;
 
 // rho = density at each atom
 // loop over neighbors of my atoms
   for (int ii = 0; ii < inum; ii++){
     const int i = ilist[ii];
 
     const double xtmp = x[i][0];
     const double ytmp = x[i][1];
     const double ztmp = x[i][2];
 
     // contributions to atom i are summed locally and stored after the j loop
     double rho_i = rho[i];
 
     const int itype = type[i];
     const int *jlist = firstneigh[i];
     const int jnum = numneigh[i];
 
     for (int jj = 0; jj < jnum; jj++){
       const int j = (jlist[jj] & NEIGHMASK);
       const int jtype = type[j];
 
       const double delx = xtmp - x[j][0];
       const double dely = ytmp - x[j][1];
       const double delz = ztmp - x[j][2];
       const double rsq = delx*delx + dely*dely + delz*delz;
 
       if (one_type) {
         // single atom type: use the precomputed cutoff and prefactor
         if (rsq < cutsq_type11) {
           const double rcut = rcut_type11;
           const double r_over_rcut = sqrt(rsq) / rcut;
           const double tmpFactor = 1.0 - r_over_rcut;
           const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
           const double factor = factor_type11*(1.0 + 1.5*r_over_rcut)*tmpFactor4;
           rho_i += factor;
           if (newton_pair || j < nlocal)
             rho[j] += factor;
         }
       } else if (rsq < cutsq[itype][jtype]) {
         // several atom types: cutoff depends on the type pair
         // (this branch must pair with the one_type test; nested inside it,
         // it could never run and rho stayed zero for multi-type systems)
         const double rcut = sqrt(cutsq[itype][jtype]);
         const double tmpFactor = 1.0-sqrt(rsq)/rcut;
         const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
         const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
         rho_i += factor;
         if (newton_pair || j < nlocal)
           rho[j] += factor;
       }
     }
 
     rho[i] = rho_i;
   }
 
   // sum ghost contributions back onto their owners, then send the final
   // densities out to the ghost copies
   if (newton_pair) comm->reverse_comm_pair(this);
 
   comm->forward_comm_pair(this);
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucyRX::getParams(int id, double &fractionOld1, double &fractionOld2, double &fraction1, double &fraction2)
 {
   // id = atom index; the four outputs are the fractions of site1 and site2
   // for that atom, computed from atom->dvector rows 0..nspecies-1
   // (fraction*) and rows nspecies..2*nspecies-1 (fractionOld*)
 
   // totals over all species, used as denominators
   double sumNew = 0.0;
   double sumOld = 0.0;
   for (int s = 0; s < nspecies; ++s) {
     sumNew += atom->dvector[s][id]; 
     sumOld += atom->dvector[s+nspecies][id];
   }
 
   const bool fluid1 = isOneFluid(isite1);
   const bool fluid2 = isOneFluid(isite2);
 
   // a site that names a species gets that species' fraction directly
   if (!fluid1) {
     fractionOld1 = atom->dvector[isite1+nspecies][id]/sumOld;
     fraction1 = atom->dvector[isite1][id]/sumNew;
   }
   if (!fluid2) {
     fractionOld2 = atom->dvector[isite2+nspecies][id]/sumOld;
     fraction2 = atom->dvector[isite2][id]/sumNew;
   }
 
   if (!fluid1 && !fluid2) return;
 
   // a "1fluid" site gets the summed fraction of every species that is not
   // named by either site
   double restOld = 0.0;
   double restNew = 0.0;
   for (int s = 0; s < nspecies; ++s) {
     const bool named = (isite1 == s || isite2 == s);
     if (named) continue;
     restOld += atom->dvector[s+nspecies][id] / sumOld;
     restNew += atom->dvector[s][id] / sumNew;
   }
 
   if (fluid1) {
     fractionOld1 = restOld;
     fraction1 = restNew;
   }
   if (fluid2) {
     fractionOld2 = restOld;
     fraction2 = restNew;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 int PairMultiLucyRX::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
 {
   // gather rho of the n atoms named in list into buf, one value per atom
   // pbc_flag and pbc are not used by this routine
   const double * const density = atom->rho;
 
   int count = 0;
   for (int k = 0; k < n; ++k) {
     const int iatom = list[k];
     buf[count] = density[iatom];
     ++count;
   }
 
   // number of values placed in buf
   return count;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucyRX::unpack_forward_comm(int n, int first, double *buf)
 {
   // store the n received values as rho of atoms first .. first+n-1
   double * const density = atom->rho;
 
   int k = 0;
   while (k < n) {
     density[first + k] = buf[k];
     ++k;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 int PairMultiLucyRX::pack_reverse_comm(int n, int first, double *buf)
 {
   // copy rho of atoms first .. first+n-1 into buf, one value per atom
   const double * const density = atom->rho;
 
   int count = 0;
   while (count < n) {
     buf[count] = density[first + count];
     ++count;
   }
 
   // number of values placed in buf
   return count;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMultiLucyRX::unpack_reverse_comm(int n, int *list, double *buf)
 {
   // accumulate the n received values onto rho of the atoms named in list
   double * const density = atom->rho;
 
   int k = 0;
   while (k < n) {
     density[list[k]] += buf[k];
     ++k;
   }
 }
diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index d278feaf8..44a9d7602 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -1,1186 +1,1186 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Paul Crozier (SNL)
 ------------------------------------------------------------------------- */
 
-#include "mpi.h"
+#include <mpi.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_table_rx.h"
 #include "atom.h"
 #include "force.h"
 #include "comm.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 #include "modify.h"
 #include "fix.h"
 
 using namespace LAMMPS_NS;
 
 enum{NONE,RLINEAR,RSQ,BMP};
 
 #define MAXLINE 1024
 
 #define OneFluidValue (-1)
 #define isOneFluid(_site_) ( (_site_) == OneFluidValue )
 
 /* ---------------------------------------------------------------------- */
 
 PairTableRX::PairTableRX(LAMMPS *lmp) : Pair(lmp)
 {
   // start with an empty table list; coeff() appends one table per call
   tables = NULL;
   ntables = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairTableRX::~PairTableRX()
 {
   // release the arrays owned by each table, then the table list itself
   int m = 0;
   while (m < ntables) {
     free_table(&tables[m]);
     m++;
   }
   memory->sfree(tables);
 
   // the per-type arrays only exist once allocate() has run
   if (!allocated) return;
 
   memory->destroy(setflag);
   memory->destroy(cutsq);
   memory->destroy(tabindex);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Evaluate tabulated pair forces and energies over the neighbor list.
 // Each pair's table value is weighted by the per-atom fractions that
 // getParams() returns, forces are accumulated into atom->f, and the
 // pair energies are accumulated into atom->uCG (old fractions) and
 // atom->uCGnew (current fractions).
 //   eflag, vflag : energy/virial flags forwarded to ev_setup()
 void PairTableRX::compute(int eflag, int vflag)
 {
   int i,j,ii,jj,inum,jnum,itype,jtype,itable;
   double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
   double rsq,factor_lj,fraction,value,a,b;
   int *ilist,*jlist,*numneigh,**firstneigh;
   Table *tb;
 
   union_int_float_t rsq_lookup;
   int tlm1 = tablength - 1;
 
   fraction = 0.0;
   a = 0.0;
   b = 0.0;
 
   evdwlOld = 0.0;
   evdwl = 0.0;
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   // per-atom scratch arrays, one entry for every local and ghost atom
   double *fractionOld1, *fractionOld2;
   double *fraction1, *fraction2;
 
   {
     const int ntotal = atom->nlocal + atom->nghost;
 
     memory->create(fractionOld1, ntotal, "PairTableRx::compute::fractionOld1");
     memory->create(fractionOld2, ntotal, "PairTableRx::compute::fractionOld2");
     memory->create(fraction1, ntotal, "PairTableRx::compute::fraction1");
     memory->create(fraction2, ntotal, "PairTableRx::compute::fraction2");
 
     // fill all four arrays once up front so the pair loop only does lookups
     for (int i = 0; i < ntotal; ++i)
       getParams(i, fractionOld1[i], fractionOld2[i], fraction1[i], fraction2[i]);
   }
 
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   double *special_lj = force->special_lj;
   int newton_pair = force->newton_pair;
 
   double fractionOld1_i, fractionOld1_j;
   double fractionOld2_i, fractionOld2_j;
   double fraction1_i, fraction1_j;
   double fraction2_i, fraction2_j;
   double *uCG = atom->uCG;
   double *uCGnew = atom->uCGnew;
 
   inum = list->inum;
   ilist = list->ilist;
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
 
   // loop over neighbors of my atoms
   for (ii = 0; ii < inum; ii++) {
     i = ilist[ii];
     xtmp = x[i][0];
     ytmp = x[i][1];
     ztmp = x[i][2];
     itype = type[i];
     jlist = firstneigh[i];
     jnum = numneigh[i];
 
     // contributions to atom i are summed locally and stored after the j loop
     double uCG_i = 0.0;
     double uCGnew_i = 0.0;
     double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
 
     fractionOld1_i = fractionOld1[i];
     fractionOld2_i = fractionOld2[i];
     fraction1_i = fraction1[i];
     fraction2_i = fraction2[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
       // the special-bond scaling is selected from the raw neighbor entry,
       // which is then masked down to the plain atom index
       factor_lj = special_lj[sbmask(j)];
       j &= NEIGHMASK;
 
       delx = xtmp - x[j][0];
       dely = ytmp - x[j][1];
       delz = ztmp - x[j][2];
       rsq = delx*delx + dely*dely + delz*delz;
       jtype = type[j];
 
       if (rsq < cutsq[itype][jtype]) {
         fractionOld1_j = fractionOld1[j];
         fractionOld2_j = fractionOld2[j];
         fraction1_j = fraction1[j];
         fraction2_j = fraction2[j];
 
         tb = &tables[tabindex[itype][jtype]];
         if (rsq < tb->innersq)
           error->one(FLERR,"Pair distance < table inner cutoff");
 
         // evaluate the tabulated force factor according to the table style
         if (tabstyle == LOOKUP) {
           itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
           if (itable >= tlm1)
             error->one(FLERR,"Pair distance > table outer cutoff");
           fpair = factor_lj * tb->f[itable];
         } else if (tabstyle == LINEAR) {
           itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
           if (itable >= tlm1)
             error->one(FLERR,"Pair distance > table outer cutoff");
           fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
           value = tb->f[itable] + fraction*tb->df[itable];
           fpair = factor_lj * value;
         } else if (tabstyle == SPLINE) {
           itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
           if (itable >= tlm1)
             error->one(FLERR,"Pair distance > table outer cutoff");
           b = (rsq - tb->rsq[itable]) * tb->invdelta;
           a = 1.0 - b;
           value = a * tb->f[itable] + b * tb->f[itable+1] +
             ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
             tb->deltasq6;
           fpair = factor_lj * value;
         } else {
           // remaining style: the table index is taken from masked and
           // shifted bits of rsq stored through the int/float union
           rsq_lookup.f = rsq;
           itable = rsq_lookup.i & tb->nmask;
           itable >>= tb->nshiftbits;
           fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
           value = tb->f[itable] + fraction*tb->df[itable];
           fpair = factor_lj * value;
         }
         // weight the force by the old fractions; when the two sites differ
         // both i/j assignments of the site pair contribute
         if (isite1 == isite2) fpair = sqrt(fractionOld1_i*fractionOld2_j)*fpair; 
         else fpair = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*fpair;
 
         fx_i += delx*fpair;
         fy_i += dely*fpair;
         fz_i += delz*fpair;
         if (newton_pair || j < nlocal) {
           f[j][0] -= delx*fpair;
           f[j][1] -= dely*fpair;
           f[j][2] -= delz*fpair;
         }
 
         // tabulated energy, reusing itable/fraction/a/b from the force lookup
         if (tabstyle == LOOKUP)
           evdwl = tb->e[itable];
         else if (tabstyle == LINEAR || tabstyle == BITMAP){
           evdwl = tb->e[itable] + fraction*tb->de[itable];
         }
         else
           evdwl = a * tb->e[itable] + b * tb->e[itable+1] +
             ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) *
             tb->deltasq6;
         // evdwlOld is weighted with the old fractions, evdwl with the current ones
         if (isite1 == isite2){
           evdwlOld = sqrt(fractionOld1_i*fractionOld2_j)*evdwl;
           evdwl = sqrt(fraction1_i*fraction2_j)*evdwl;
         } else {
           evdwlOld = (sqrt(fractionOld1_i*fractionOld2_j) + sqrt(fractionOld2_i*fractionOld1_j))*evdwl;
           evdwl = (sqrt(fraction1_i*fraction2_j) + sqrt(fraction2_i*fraction1_j))*evdwl;
         }
         evdwlOld *= factor_lj;
         evdwl *= factor_lj;
 
         // each atom of the pair receives half of the pair energy
         uCG_i += 0.5*evdwlOld;
         uCG[j] += 0.5*evdwlOld;
 
         uCGnew_i += 0.5*evdwl;
         uCGnew[j] += 0.5*evdwl;
         // the old-fraction energy is the value passed on to ev_tally()
         evdwl = evdwlOld;
 
         if (evflag) ev_tally(i,j,nlocal,newton_pair,
                              evdwl,0.0,fpair,delx,dely,delz);
       }
     }
 
     uCG[i] += uCG_i;
     uCGnew[i] += uCGnew_i;
 
     f[i][0] += fx_i;
     f[i][1] += fy_i;
     f[i][2] += fz_i;
   }
   if (vflag_fdotr) virial_fdotr_compute();
 
   // release the per-call scratch arrays
   memory->destroy(fractionOld1);
   memory->destroy(fractionOld2);
   memory->destroy(fraction1);
   memory->destroy(fraction2);
 }
 
 /* ----------------------------------------------------------------------
    allocate all arrays
 ------------------------------------------------------------------------- */
 
 void PairTableRX::allocate()
 {
   // arrays are sized ntypes+1 in each dimension
   const int nt = atom->ntypes + 1;
   const int ncells = nt*nt;
 
   allocated = 1;
 
   // create each per-type-pair array and clear it to all-zero bytes
   memory->create(setflag,nt,nt,"pair:setflag");
   memory->create(cutsq,nt,nt,"pair:cutsq");
   memory->create(tabindex,nt,nt,"pair:tabindex");
 
   memset(&setflag[0][0],0,ncells*sizeof(int));
   memset(&cutsq[0][0],0,ncells*sizeof(double));
   memset(&tabindex[0][0],0,ncells*sizeof(int));
 }
 
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
 
 void PairTableRX::settings(int narg, char **arg)
 {
   if (narg < 2) error->all(FLERR,"Illegal pair_style command");
 
   // arg[0] selects how the tables are evaluated
 
   const char *stylename = arg[0];
   if (!strcmp(stylename,"lookup")) tabstyle = LOOKUP;
   else if (!strcmp(stylename,"linear")) tabstyle = LINEAR;
   else if (!strcmp(stylename,"spline")) tabstyle = SPLINE;
   else if (!strcmp(stylename,"bitmap")) tabstyle = BITMAP;
   else error->all(FLERR,"Unknown table style in pair_style command");
 
   // arg[1] is the number of table entries
 
   tablength = force->inumeric(FLERR,arg[1]);
   if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
 
   // any further arguments are keywords that set a long-range
   // compatibility flag; anything else is an error
 
   for (int iarg = 2; iarg < narg; iarg++) {
     const char *keyword = arg[iarg];
     if (!strcmp(keyword,"ewald")) ewaldflag = 1;
     else if (!strcmp(keyword,"pppm")) pppmflag = 1;
     else if (!strcmp(keyword,"msm")) msmflag = 1;
     else if (!strcmp(keyword,"dispersion")) dispersionflag = 1;
     else if (!strcmp(keyword,"tip4p")) tip4pflag = 1;
     else error->all(FLERR,"Illegal pair_style command");
   }
 
   // existing tables were built for the previous settings: discard them
 
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
   tables = NULL;
   ntables = 0;
 
   // likewise drop the per-type arrays so coeff() re-allocates them
 
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
     memory->destroy(tabindex);
   }
   allocated = 0;
 }
 
 /* ----------------------------------------------------------------------
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
 void PairTableRX::coeff(int narg, char **arg)
 {
   if (narg != 6 && narg != 7) error->all(FLERR,"Illegal pair_coeff command");
   if (!allocated) allocate();
 
   bool rx_flag = false;
   for (int i = 0; i < modify->nfix; i++)
     if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
   if (!rx_flag) error->all(FLERR,"PairTableRX requires a fix rx command.");
 
   int ilo,ihi,jlo,jhi;
   force->bounds(arg[0],atom->ntypes,ilo,ihi);
   force->bounds(arg[1],atom->ntypes,jlo,jhi);
 
   int me;
   MPI_Comm_rank(world,&me);
   tables = (Table *)
     memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
   Table *tb = &tables[ntables];
   null_table(tb);
   if (me == 0) read_table(tb,arg[2],arg[3]);
   bcast_table(tb);
 
   nspecies = atom->nspecies_dpd;
   if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
   int n;
   n = strlen(arg[3]) + 1;
   site1 = new char[n];
   strcpy(site1,arg[4]);
 
   int ispecies;
   for (ispecies = 0; ispecies < nspecies; ispecies++){
     if (strcmp(site1,&atom->dname[ispecies][0]) == 0) break;
   }
   if (ispecies == nspecies && strcmp(site1,"1fluid") != 0)
     error->all(FLERR,"Site1 name not recognized in pair coefficients");
 
   n = strlen(arg[4]) + 1;
   site2 = new char[n];
   strcpy(site2,arg[5]);
 
   for (ispecies = 0; ispecies < nspecies; ispecies++){
     if (strcmp(site2,&atom->dname[ispecies][0]) == 0) break;
   }
   if (ispecies == nspecies && strcmp(site2,"1fluid") != 0)
     error->all(FLERR,"Site2 name not recognized in pair coefficients");
 
   // set table cutoff
 
   if (narg == 7) tb->cut = force->numeric(FLERR,arg[6]);
   else if (tb->rflag) tb->cut = tb->rhi;
   else tb->cut = tb->rfile[tb->ninput-1];
 
   // error check on table parameters
   // insure cutoff is within table
   // for BITMAP tables, file values can be in non-ascending order
 
   if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
   double rlo,rhi;
   if (tb->rflag == 0) {
     rlo = tb->rfile[0];
     rhi = tb->rfile[tb->ninput-1];
   } else {
     rlo = tb->rlo;
     rhi = tb->rhi;
   }
   if (tb->cut <= rlo || tb->cut > rhi)
     error->all(FLERR,"Invalid pair table cutoff");
   if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff");
 
   // match = 1 if don't need to spline read-in tables
   // this is only the case if r values needed by final tables
   //   exactly match r values read from file
   // for tabstyle SPLINE, always need to build spline tables
 
   tb->match = 0;
   if (tabstyle == LINEAR && tb->ninput == tablength &&
       tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1;
   if (tabstyle == BITMAP && tb->ninput == 1 << tablength &&
       tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1;
   if (tb->rflag == BMP && tb->match == 0)
     error->all(FLERR,"Bitmapped table in file does not match requested table");
 
   // spline read-in values and compute r,e,f vectors within table
 
   if (tb->match == 0) spline_table(tb);
   compute_table(tb);
 
   // store ptr to table in tabindex
 
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
       tabindex[i][j] = ntables;
       setflag[i][j] = 1;
       count++;
     }
   }
 
   if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
   ntables++;
 
   {
      if ( strcmp(site1,"1fluid") == 0 )
        isite1 = OneFluidValue;
      else {
        isite1 = nspecies;
 
        for (int k = 0; k < nspecies; k++){
          if (strcmp(site1, atom->dname[k]) == 0){
            isite1 = k;
            break;
          }
        }
 
        if (isite1 == nspecies) error->all(FLERR,"isite1 == nspecies");
      }
 
      if ( strcmp(site2,"1fluid") == 0 )
        isite2 = OneFluidValue;
      else {
        isite2 = nspecies;
 
        for (int k = 0; k < nspecies; k++){
          if (strcmp(site2, atom->dname[k]) == 0){
            isite2 = ispecies;
            break;
          }
        }
 
        if (isite2 == nspecies)
          error->all(FLERR,"isite2 == nspecies");
      }
   }
 
 }
 
 /* ----------------------------------------------------------------------
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
 double PairTableRX::init_one(int i, int j)
 {
   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
 
   tabindex[j][i] = tabindex[i][j];
 
   return tables[tabindex[i][j]].cut;
 }
 
 /* ----------------------------------------------------------------------
    read a table section from a tabulated potential file
    only called by proc 0
    this function sets these values in Table:
      ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits
 ------------------------------------------------------------------------- */
 
 // Read one section of a tabulated potential file into tb.
 //   tb      : table whose ninput/rfile/efile/ffile (and the values set by
 //             param_extract) are filled in
 //   file    : name passed to force->open_potential()
 //   keyword : first word of the header line of the wanted section
 // Calls error->one() if the file cannot be opened or the keyword is
 // never found.
 void PairTableRX::read_table(Table *tb, char *file, char *keyword)
 {
   char line[MAXLINE];
 
   // open file
 
   FILE *fp = force->open_potential(file);
   if (fp == NULL) {
     char str[128];
     // bounded write: a long file name must not overflow the message buffer
     snprintf(str,sizeof(str),"Cannot open file %s",file);
     error->one(FLERR,str);
   }
 
   // loop until section found with matching keyword
 
   while (1) {
     if (fgets(line,MAXLINE,fp) == NULL)
       error->one(FLERR,"Did not find keyword in table file");
     if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
     if (line[0] == '#') continue;                          // comment
     char *word = strtok(line," \t\n\r");
     if (strcmp(word,keyword) == 0) break;           // matching keyword
     fgets(line,MAXLINE,fp);                         // no match, skip section
     param_extract(tb,line);
     fgets(line,MAXLINE,fp);
     for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp);
   }
 
   // read args on 2nd line of section
   // allocate table arrays for file values
 
   fgets(line,MAXLINE,fp);
   param_extract(tb,line);
   memory->create(tb->rfile,tb->ninput,"pair:rfile");
   memory->create(tb->efile,tb->ninput,"pair:efile");
   memory->create(tb->ffile,tb->ninput,"pair:ffile");
 
   // setup bitmap parameters for table to read in
   // masklo/maskhi/nshiftbits are only set, and only used, for BMP tables
 
   tb->ntablebits = 0;
   int masklo,maskhi,nmask,nshiftbits;
   if (tb->rflag == BMP) {
     while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++;
     if (1 << tb->ntablebits != tb->ninput)
       error->one(FLERR,"Bitmapped table is incorrect length in table file");
     init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits);
   }
 
   // read r,e,f table values from file
   // if rflag set, compute r
   // if rflag not set, use r from file
 
   int itmp;
   double rtmp;
   union_int_float_t rsq_lookup;
 
   // one line is read and discarded before the data lines
   fgets(line,MAXLINE,fp);
   for (int i = 0; i < tb->ninput; i++) {
     fgets(line,MAXLINE,fp);
     sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
 
     if (tb->rflag == RLINEAR)
       rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
     else if (tb->rflag == RSQ) {
       rtmp = tb->rlo*tb->rlo +
         (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
       rtmp = sqrt(rtmp);
     } else if (tb->rflag == BMP) {
       rsq_lookup.i = i << nshiftbits;
       rsq_lookup.i |= masklo;
       if (rsq_lookup.f < tb->rlo*tb->rlo) {
         rsq_lookup.i = i << nshiftbits;
         rsq_lookup.i |= maskhi;
       }
       rtmp = sqrtf(rsq_lookup.f);
     }
 
     tb->rfile[i] = rtmp;
   }
 
   // close file
 
   fclose(fp);
 }
 
 /* ----------------------------------------------------------------------
    broadcast read-in table info from proc 0 to other procs
    this function communicates these values in Table:
      ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
 ------------------------------------------------------------------------- */
 
 void PairTableRX::bcast_table(Table *tb)
 {
   // the entry count goes first so the other ranks can size their arrays
   MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
 
   int rank;
   MPI_Comm_rank(world,&rank);
   const int nvalues = tb->ninput;
 
   // rank 0 already holds the arrays it read from file
   if (rank != 0) {
     memory->create(tb->rfile,nvalues,"pair:rfile");
     memory->create(tb->efile,nvalues,"pair:efile");
     memory->create(tb->ffile,nvalues,"pair:ffile");
   }
 
   // r, e, f columns, in that order
   double *columns[3] = {tb->rfile, tb->efile, tb->ffile};
   for (int c = 0; c < 3; c++)
     MPI_Bcast(columns[c],nvalues,MPI_DOUBLE,0,world);
 
   // rlo/rhi are sent only when rflag is set
   MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
   if (tb->rflag != 0) {
     MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
   }
 
   // fplo/fphi are sent only when fpflag is set
   MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
   if (tb->fpflag != 0) {
     MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
     MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
   }
 }
 
 /* ----------------------------------------------------------------------
    build spline representation of e,f over entire range of read-in table
    this function sets these values in Table: e2file,f2file
 ------------------------------------------------------------------------- */
 
 void PairTableRX::spline_table(Table *tb)
 {
   const int npts = tb->ninput;
   const int ilast = npts - 1;
 
   memory->create(tb->e2file,npts,"pair:e2file");
   memory->create(tb->f2file,npts,"pair:f2file");
 
   // energy spline: the end-point derivative arguments are the negated
   // first and last force values
   const double eslope_lo = - tb->ffile[0];
   const double eslope_hi = - tb->ffile[ilast];
   spline(tb->rfile,tb->efile,npts,eslope_lo,eslope_hi,tb->e2file);
 
   // if the file gave no FP values, estimate the end-point force
   // derivatives from the first and last pair of points
   if (!tb->fpflag) {
     const double *r = tb->rfile;
     const double *f = tb->ffile;
     tb->fplo = (f[1] - f[0]) / (r[1] - r[0]);
     tb->fphi = (f[ilast] - f[ilast-1]) /
       (r[ilast] - r[ilast-1]);
   }
 
   // force spline
   const double fslope_lo = tb->fplo;
   const double fslope_hi = tb->fphi;
   spline(tb->rfile,tb->ffile,npts,fslope_lo,fslope_hi,tb->f2file);
 }
 
 /* ----------------------------------------------------------------------
    extract attributes from parameter line in table section
    format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi
    N is required, other params are optional
 ------------------------------------------------------------------------- */
 
 // Parse the parameter line of a table section into tb.
 //   tb   : receives ninput, rflag (+ rlo, rhi) and fpflag (+ fplo, fphi)
 //   line : the parameter line; it is tokenized in place by strtok()
 // Calls error->one() on an unknown keyword, on a keyword whose value is
 // missing, or if N was never set.
 void PairTableRX::param_extract(Table *tb, char *line)
 {
   const char *sep = " \t\n\r\f";
 
   tb->ninput = 0;
   tb->rflag = NONE;
   tb->fpflag = 0;
 
   char *word = strtok(line,sep);
   while (word) {
     if (strcmp(word,"N") == 0) {
       word = strtok(NULL,sep);
       // a keyword at the end of the line has no value: strtok returns NULL
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->ninput = atoi(word);
     } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 ||
                strcmp(word,"BITMAP") == 0) {
       if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
       else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
       else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP;
       word = strtok(NULL,sep);
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->rlo = atof(word);
       word = strtok(NULL,sep);
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->rhi = atof(word);
     } else if (strcmp(word,"FP") == 0) {
       tb->fpflag = 1;
       word = strtok(NULL,sep);
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->fplo = atof(word);
       word = strtok(NULL,sep);
       if (word == NULL)
         error->one(FLERR,"Missing value in pair table parameters");
       tb->fphi = atof(word);
     } else {
       printf("WORD: %s\n",word);
       error->one(FLERR,"Invalid keyword in pair table parameters");
     }
     word = strtok(NULL,sep);
   }
 
   if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
 }
 
 /* ----------------------------------------------------------------------
    compute r,e,f vectors from splined values
 ------------------------------------------------------------------------- */
 
 // Build the evaluation arrays of tb for the current tabstyle.
 // Sets innersq, delta and invdelta for every style, then allocates and
 // fills the style-specific arrays (e, f and, depending on style,
 // rsq, de, df, e2, f2, drsq) from the file values or their splines.
 void PairTableRX::compute_table(Table *tb)
 {
   // number of bins = number of table points minus one
   int tlm1 = tablength-1;
 
   // inner = inner table bound
   // cut = outer table bound
   // delta = table spacing in rsq for N-1 bins
 
   double inner;
   if (tb->rflag) inner = tb->rlo;
   else inner = tb->rfile[0];
   tb->innersq = double(inner)*double(inner);
   tb->delta = double(tb->cut*tb->cut - double(tb->innersq)) / double(tlm1);
   tb->invdelta = 1.0/double(tb->delta);
 
   // direct lookup tables
   // N-1 evenly spaced bins in rsq from inner to cut
   // e,f = value at midpt of bin
   // e,f are N-1 in length since store 1 value at bin midpt
   // f is converted to f/r when stored in f[i]
   // e,f are never a match to read-in values, always computed via spline interp
 
   if (tabstyle == LOOKUP) {
     memory->create(tb->e,tlm1,"pair:e");
     memory->create(tb->f,tlm1,"pair:f");
 
     double r,rsq;
     for (int i = 0; i < tlm1; i++) {
       rsq = tb->innersq + (i+0.5)*tb->delta;
       r = sqrt(rsq);
       tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
       tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
     }
   }
 
   // linear tables
   // N-1 evenly spaced bins in rsq from inner to cut
   // rsq,e,f = value at lower edge of bin
   // de,df values = delta from lower edge to upper edge of bin
   // rsq,e,f are N in length so de,df arrays can compute difference
   // f is converted to f/r when stored in f[i]
   // e,f can match read-in values, else compute via spline interp
 
   if (tabstyle == LINEAR) {
     memory->create(tb->rsq,tablength,"pair:rsq");
     memory->create(tb->e,tablength,"pair:e");
     memory->create(tb->f,tablength,"pair:f");
     memory->create(tb->de,tlm1,"pair:de");
     memory->create(tb->df,tlm1,"pair:df");
 
     double r,rsq;
     for (int i = 0; i < tablength; i++) {
       rsq = tb->innersq + i*tb->delta;
       r = sqrt(rsq);
       tb->rsq[i] = rsq;
       if (tb->match) {
         tb->e[i] = tb->efile[i];
         tb->f[i] = tb->ffile[i]/r;
       } else {
         tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
         tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
       }
     }
 
     for (int i = 0; i < tlm1; i++) {
       tb->de[i] = tb->e[i+1] - tb->e[i];
       tb->df[i] = tb->f[i+1] - tb->f[i];
     }
   }
 
   // cubic spline tables
   // N-1 evenly spaced bins in rsq from inner to cut
   // rsq,e,f = value at lower edge of bin
   // e2,f2 = spline coefficient for each bin
   // rsq,e,f,e2,f2 are N in length so have N-1 spline bins
   // f is converted to f/r after e is splined
   // e,f can match read-in values, else compute via spline interp
 
   if (tabstyle == SPLINE) {
     memory->create(tb->rsq,tablength,"pair:rsq");
     memory->create(tb->e,tablength,"pair:e");
     memory->create(tb->f,tablength,"pair:f");
     memory->create(tb->e2,tablength,"pair:e2");
     memory->create(tb->f2,tablength,"pair:f2");
 
     tb->deltasq6 = tb->delta*tb->delta / 6.0;
 
     double r,rsq;
     for (int i = 0; i < tablength; i++) {
       rsq = tb->innersq + i*tb->delta;
       r = sqrt(rsq);
       tb->rsq[i] = rsq;
       if (tb->match) {
         tb->e[i] = tb->efile[i];
         tb->f[i] = tb->ffile[i]/r;
       } else {
         // f is left undivided here; the division by r happens further
         // down, after the boundary slopes have been computed from it
         tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
         tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
       }
     }
 
     // ep0,epn = dh/dg at inner and at cut
     // h(r) = e(r) and g(r) = r^2
     // dh/dg = (de/dr) / 2r = -f/2r
 
     double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq));
     double epn = - tb->f[tlm1] / (2.0 * tb->cut);
     spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2);
 
     // fp0,fpn = dh/dg at inner and at cut
     // h(r) = f(r)/r and g(r) = r^2
     // dh/dg = (1/r df/dr - f/r^2) / 2r
     // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1))
 
     double fp0,fpn;
     // the secant step is this fraction of one table spacing
     double secant_factor = 0.1;
     if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) /
       (2.0 * sqrt(tb->innersq));
     else {
       double rsq1 = tb->innersq;
       double rsq2 = rsq1 + secant_factor*tb->delta;
       fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) /
              sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta);
     }
 
     // the file's FP value at the upper end is used only when the cutoff
     // coincides with the last r value read from file
     if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn =
       (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut);
     else {
       double rsq2 = tb->cut * tb->cut;
       double rsq1 = rsq2 - secant_factor*tb->delta;
       fpn = (tb->f[tlm1] / sqrt(rsq2) -
              splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) /
              sqrt(rsq1)) / (secant_factor*tb->delta);
     }
 
     for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]);
     spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2);
   }
 
   // bitmapped linear tables
   // 2^N bins from inner to cut, spaced in bitmapped manner
   // f is converted to f/r when stored in f[i]
   // e,f can match read-in values, else compute via spline interp
 
   if (tabstyle == BITMAP) {
     double r;
     union_int_float_t rsq_lookup;
     int masklo,maskhi;
 
     // linear lookup tables of length ntable = 2^n
     // stored value = value at lower edge of bin
 
     // masklo, maskhi, nmask and nshiftbits are read right after this call,
     // so init_bitmap presumably sets them through reference parameters
     init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits);
     int ntable = 1 << tablength;
     int ntablem1 = ntable - 1;
 
     memory->create(tb->rsq,ntable,"pair:rsq");
     memory->create(tb->e,ntable,"pair:e");
     memory->create(tb->f,ntable,"pair:f");
     memory->create(tb->de,ntable,"pair:de");
     memory->create(tb->df,ntable,"pair:df");
     memory->create(tb->drsq,ntable,"pair:drsq");
 
     // running minimum of the rsq values generated below, seeded with
     // the bin-0 value built from maskhi
     union_int_float_t minrsq_lookup;
     minrsq_lookup.i = 0 << tb->nshiftbits;
     minrsq_lookup.i |= maskhi;
 
     for (int i = 0; i < ntable; i++) {
       // try the masklo bit pattern first; fall back to maskhi when the
       // resulting rsq lies below the inner bound
       rsq_lookup.i = i << tb->nshiftbits;
       rsq_lookup.i |= masklo;
       if (rsq_lookup.f < tb->innersq) {
         rsq_lookup.i = i << tb->nshiftbits;
         rsq_lookup.i |= maskhi;
       }
       r = sqrtf(rsq_lookup.f);
       tb->rsq[i] = rsq_lookup.f;
       if (tb->match) {
         tb->e[i] = tb->efile[i];
         tb->f[i] = tb->ffile[i]/r;
       } else {
         tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
         tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
       }
       minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f);
     }
 
     // the inner bound becomes the smallest rsq actually stored in the table
     tb->innersq = minrsq_lookup.f;
 
     for (int i = 0; i < ntablem1; i++) {
       tb->de[i] = tb->e[i+1] - tb->e[i];
       tb->df[i] = tb->f[i+1] - tb->f[i];
       tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]);
     }
 
     // get the delta values for the last table entries
     // tables are connected periodically between 0 and ntablem1
 
     tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1];
     tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1];
     tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]);
 
     // get the correct delta values at itablemax
     // smallest r is in bin itablemin
     // largest r is in bin itablemax, which is itablemin-1,
     //   or ntablem1 if itablemin=0
 
     // deltas at itablemax only needed if corresponding rsq < cut*cut
     // if so, compute deltas between rsq and cut*cut
     //   if tb->match, data at cut*cut is unavailable, so we'll take
     //   deltas at itablemax-1 as a good approximation
 
     double e_tmp,f_tmp;
     int itablemin = minrsq_lookup.i & tb->nmask;
     itablemin >>= tb->nshiftbits;
     int itablemax = itablemin - 1;
     if (itablemin == 0) itablemax = ntablem1;
     int itablemaxm1 = itablemax - 1;
     if (itablemax == 0) itablemaxm1 = ntablem1;
     rsq_lookup.i = itablemax << tb->nshiftbits;
     rsq_lookup.i |= maskhi;
     if (rsq_lookup.f < tb->cut*tb->cut) {
       if (tb->match) {
         tb->de[itablemax] = tb->de[itablemaxm1];
         tb->df[itablemax] = tb->df[itablemaxm1];
         tb->drsq[itablemax] = tb->drsq[itablemaxm1];
       } else {
             rsq_lookup.f = tb->cut*tb->cut;
         r = sqrtf(rsq_lookup.f);
         e_tmp = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
         f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
         tb->de[itablemax] = e_tmp - tb->e[itablemax];
         tb->df[itablemax] = f_tmp - tb->f[itablemax];
         tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    set all ptrs in a table to NULL, so can be freed safely
 ------------------------------------------------------------------------- */
 
 void PairTableRX::null_table(Table *tb)
 {
   // values read from the table file, and their spline coefficients
   tb->rfile = NULL;
   tb->efile = NULL;
   tb->ffile = NULL;
   tb->e2file = NULL;
   tb->f2file = NULL;
 
   // arrays built by compute_table()
   tb->rsq = NULL;
   tb->drsq = NULL;
   tb->e = NULL;
   tb->de = NULL;
   tb->f = NULL;
   tb->df = NULL;
   tb->e2 = NULL;
   tb->f2 = NULL;
 }
 
 /* ----------------------------------------------------------------------
    free all arrays in a table
 ------------------------------------------------------------------------- */
 
 void PairTableRX::free_table(Table *tb)
 {
   // every array a table can own, in the order they are released:
   // file values and their spline coefficients first, then the
   // evaluation arrays
   double **arrays[] = {
     &tb->rfile, &tb->efile, &tb->ffile, &tb->e2file, &tb->f2file,
     &tb->rsq, &tb->drsq, &tb->e, &tb->de,
     &tb->f, &tb->df, &tb->e2, &tb->f2
   };
   const int narrays = sizeof(arrays) / sizeof(arrays[0]);
 
   for (int k = 0; k < narrays; k++)
     memory->destroy(*arrays[k]);
 }
 
 /* ----------------------------------------------------------------------
    spline and splint routines modified from Numerical Recipes
 ------------------------------------------------------------------------- */
 
 // Compute spline second-derivative coefficients y2[0..n-1] for the
 // points (x[i], y[i]). yp1 and ypn are the first derivatives at the
 // two ends; a value above 0.99e30 selects a zero second derivative
 // at that end instead.
 void PairTableRX::spline(double *x, double *y, int n,
                        double yp1, double ypn, double *y2)
 {
   double *work = new double[n];
   const int last = n-1;
 
   // lower boundary condition
   if (yp1 > 0.99e30) {
     work[0] = 0.0;
     y2[0] = 0.0;
   } else {
     y2[0] = -0.5;
     work[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
   }
 
   // forward sweep over the interior points
   for (int i = 1; i < last; i++) {
     const double sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]);
     const double p = sig*y2[i-1] + 2.0;
     y2[i] = (sig-1.0) / p;
     work[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
     work[i] = (6.0*work[i] / (x[i+1]-x[i-1]) - sig*work[i-1]) / p;
   }
 
   // upper boundary condition
   double qn,un;
   if (ypn > 0.99e30) {
     qn = 0.0;
     un = 0.0;
   } else {
     qn = 0.5;
     un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
   }
   y2[last] = (un-qn*work[n-2]) / (qn*y2[n-2] + 1.0);
 
   // back-substitution from the last interior point down to the first
   for (int k = n-2; k >= 0; k--)
     y2[k] = y2[k]*y2[k+1] + work[k];
 
   delete [] work;
 }
 
 /* ----------------------------------------------------------------------
    cubic spline evaluation, modified from Numerical Recipes
    xa,ya = the n tabulated points, y2a = second derivatives from spline()
    returns the interpolated value at x
    the bracketing interval is located by bisection on xa
 ------------------------------------------------------------------------- */
 
 double PairTableRX::splint(double *xa, double *ya, double *y2a, int n, double x)
 {
   int lo = 0;
   int hi = n-1;
 
   // shrink [lo,hi] until the two indices are adjacent
   while (hi-lo > 1) {
     const int mid = (hi+lo) >> 1;
     if (xa[mid] > x) hi = mid;
     else lo = mid;
   }
 
   const double h = xa[hi]-xa[lo];
   const double a = (xa[hi]-x) / h;
   const double b = (x-xa[lo]) / h;
 
   return a*ya[lo] + b*ya[hi] +
     ((a*a*a-a)*y2a[lo] + (b*b*b-b)*y2a[hi]) * (h*h)/6.0;
 }
 
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
 void PairTableRX::write_restart(FILE *fp)
 {
   write_restart_settings(fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairTableRX::read_restart(FILE *fp)
 {
   read_restart_settings(fp);
   allocate();
 }
 
 /* ----------------------------------------------------------------------
    restart output of the settings (done by proc 0)
    each setting is written as one int, in the order listed below;
    read_restart_settings() reads them back in the same order
 ------------------------------------------------------------------------- */
 
 void PairTableRX::write_restart_settings(FILE *fp)
 {
   const void *settings[] = {&tabstyle,&tablength,&ewaldflag,&pppmflag,
                             &msmflag,&dispersionflag,&tip4pflag};
   const int nsettings = sizeof(settings)/sizeof(settings[0]);
 
   for (int k = 0; k < nsettings; k++)
     fwrite(settings[k],sizeof(int),1,fp);
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
 void PairTableRX::read_restart_settings(FILE *fp)
 {
   if (comm->me == 0) {
     fread(&tabstyle,sizeof(int),1,fp);
     fread(&tablength,sizeof(int),1,fp);
     fread(&ewaldflag,sizeof(int),1,fp);
     fread(&pppmflag,sizeof(int),1,fp);
     fread(&msmflag,sizeof(int),1,fp);
     fread(&dispersionflag,sizeof(int),1,fp);
     fread(&tip4pflag,sizeof(int),1,fp);
   }
   MPI_Bcast(&tabstyle,1,MPI_INT,0,world);
   MPI_Bcast(&tablength,1,MPI_INT,0,world);
   MPI_Bcast(&ewaldflag,1,MPI_INT,0,world);
   MPI_Bcast(&pppmflag,1,MPI_INT,0,world);
   MPI_Bcast(&msmflag,1,MPI_INT,0,world);
   MPI_Bcast(&dispersionflag,1,MPI_INT,0,world);
   MPI_Bcast(&tip4pflag,1,MPI_INT,0,world);
 }
 
 /* ---------------------------------------------------------------------- */
 
 double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
                          double factor_coul, double factor_lj,
                          double &fforce)
 {
   int itable;
   double fraction,value,a,b,phi;
   int tlm1 = tablength - 1;
 
   Table *tb = &tables[tabindex[itype][jtype]];
   double fraction1_i, fraction1_j;
   double fraction2_i, fraction2_j;
   double fractionOld1_i, fractionOld1_j;
   double fractionOld2_i, fractionOld2_j;
 
   fraction = 0.0;
   a = 0.0;
   b = 0.0;
 
   getParams(i,fractionOld1_i,fractionOld2_i,fraction1_i,fraction2_i);
   getParams(j,fractionOld1_j,fractionOld2_j,fraction1_j,fraction2_j);
 
   if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
 
   if (tabstyle == LOOKUP) {
     itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
     if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
     fforce = factor_lj * tb->f[itable];
   } else if (tabstyle == LINEAR) {
     itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
     if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
     fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
     value = tb->f[itable] + fraction*tb->df[itable];
     fforce = factor_lj * value;
   } else if (tabstyle == SPLINE) {
     itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
     if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
     b = (rsq - tb->rsq[itable]) * tb->invdelta;
     a = 1.0 - b;
     value = a * tb->f[itable] + b * tb->f[itable+1] +
       ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
       tb->deltasq6;
     fforce = factor_lj * value;
   } else {
     union_int_float_t rsq_lookup;
     rsq_lookup.f = rsq;
     itable = rsq_lookup.i & tb->nmask;
     itable >>= tb->nshiftbits;
     fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
     value = tb->f[itable] + fraction*tb->df[itable];
     fforce = factor_lj * value;
   }
 
   if (isite1 == isite2) fforce = sqrt(fraction1_i*fraction2_j)*fforce; 
   else fforce = (sqrt(fraction1_i*fraction2_j) + sqrt(fraction2_i*fraction1_j))*fforce;
 
   if (tabstyle == LOOKUP)
     phi = tb->e[itable];
   else if (tabstyle == LINEAR || tabstyle == BITMAP)
     phi = tb->e[itable] + fraction*tb->de[itable];
   else
     phi = a * tb->e[itable] + b * tb->e[itable+1] +
       ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
 
   if (isite1 == isite2) phi = sqrt(fraction1_i*fraction2_j)*phi;
   else phi = (sqrt(fraction1_i*fraction2_j) + sqrt(fraction2_i*fraction1_j))*phi;
 
   return factor_lj*phi;
 }
 
 /* ----------------------------------------------------------------------
    hand out the Coulomb cutoff of the tabled potentials ("cut_coul")
    used by KSpace solvers, which need one common cutoff for all pairs
    every table is checked, not only those reachable via tabindex[i][j],
      because pair::init() has not run yet and the active tables are unknown
    any other keyword yields NULL
 ------------------------------------------------------------------------- */
 
 void *PairTableRX::extract(const char *str, int &dim)
 {
   const bool wanted = (strcmp(str,"cut_coul") == 0);
   if (!wanted) return NULL;
 
   if (ntables == 0) error->all(FLERR,"All pair coeffs are not set");
 
   // all tables must agree with the cutoff of the first one
   const double cut_coul = tables[0].cut;
   int m = 1;
   while (m < ntables) {
     if (tables[m].cut != cut_coul)
       error->all(FLERR,"Pair table cutoffs must all be equal to use with KSpace");
     m++;
   }
 
   // dim = 0: the returned pointer refers to a single value
   dim = 0;
   return &tables[0].cut;
 }
 
 /* ----------------------------------------------------------------------
    species fractions of particle id for the two sites isite1 and isite2
    fraction1/fraction2 are built from atom->dvector rows 0..nspecies-1,
      fractionOld1/fractionOld2 from rows nspecies..2*nspecies-1
    a site for which isOneFluid() is false gets its own row divided by the
      row total; a site for which it is true gets the summed fraction of all
      species other than isite1 and isite2
 ------------------------------------------------------------------------- */
 
 void PairTableRX::getParams(int id, double &fractionOld1, double &fractionOld2, double &fraction1, double &fraction2)
 {
   // totals over all species for both sets of rows
   double nTotal = 0.0;
   double nTotalOld = 0.0;
   for (int k = 0; k < nspecies; k++) {
     nTotal += atom->dvector[k][id];
     nTotalOld += atom->dvector[k+nspecies][id];
   }
 
   if (nTotal < 1e-8 || nTotalOld < 1e-8)
     error->all(FLERR,"The number of molecules in CG particle is less than 1e-8.");
 
   // sites that map directly onto one species
   if (!isOneFluid(isite1)) {
     fractionOld1 = atom->dvector[isite1+nspecies][id]/nTotalOld;
     fraction1 = atom->dvector[isite1][id]/nTotal;
   }
   if (!isOneFluid(isite2)) {
     fractionOld2 = atom->dvector[isite2+nspecies][id]/nTotalOld;
     fraction2 = atom->dvector[isite2][id]/nTotal;
   }
 
   // nothing more to do unless at least one site is flagged by isOneFluid()
   if (!(isOneFluid(isite1) || isOneFluid(isite2))) return;
 
   // accumulate the fractions of every species that is neither site
   double restOld = 0.0;
   double rest = 0.0;
   for (int k = 0; k < nspecies; k++) {
     if (isite1 == k || isite2 == k) continue;
 
     restOld += atom->dvector[k+nspecies][id]/nTotalOld;
     rest += atom->dvector[k][id]/nTotal;
   }
 
   if (isOneFluid(isite1)) {
     fractionOld1 = restOld;
     fraction1 = rest;
   }
 
   if (isOneFluid(isite2)) {
     fractionOld2 = restOld;
     fraction2 = rest;
   }
 }
 
diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp
index 7e93e319d..c07c22661 100644
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
@@ -1,1001 +1,1001 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
-#include "mpi.h"
-#include "math.h"
+#include <mpi.h>
+#include <math.h>
 #include "dihedral_charmm_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "error.h"
 
 #ifdef LMP_USE_AVXCD
 #if (__INTEL_COMPILER_BUILD_DATE > 20160414)
 #define LMP_USE_AVXCD_DHC
 #endif
 #endif
 
 #ifdef LMP_USE_AVXCD_DHC
 #include "intel_simd.h"
 using namespace ip_simd;
 #endif
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // bounds that the value c computed in eval() is checked against before it
 // is clamped to [-1,1]; flt_t is resolved at the point of use, where it is
 // a template parameter
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // record layout through which the scalar eval() reads one entry of
 // neighbor->dihedrallist: a,b,c,d are used as the four atom indices and
 // t as the dihedral type
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ----------------------------------------------------------------------
    construct on top of DihedralCharmm and add Suffix::INTEL to suffix_flag
 ------------------------------------------------------------------------- */
 
 DihedralCharmmIntel::DihedralCharmmIntel(class LAMMPS *lmp)
   : DihedralCharmm(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ----------------------------------------------------------------------
    entry point: pick the templated compute() that matches the precision
    mode reported by the fix (mixed, double, otherwise single)
    with _LMP_INTEL_OFFLOAD defined and _use_base set, the base class
    DihedralCharmm::compute() is used instead
 ------------------------------------------------------------------------- */
 
 void DihedralCharmmIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_use_base) {
     DihedralCharmm::compute(eflag, vflag);
     return;
   }
   #endif
 
   // mixed precision: float data, double accumulation
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
 
   // double precision throughout
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
 
   // any other mode: single precision throughout
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ----------------------------------------------------------------------
    precision-specific driver: set up energy/virial tallying, then call the
    eval<EVFLAG,EFLAG,NEWTON_BOND>() instantiation that matches evflag,
    eflag and force->newton_bond
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void DihedralCharmmIntel::compute(int eflag, int vflag,
                                   IntelBuffers<flt_t,acc_t> *buffers,
                                   const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   // make sure pair->ev_tally() will use the 1-4 virial contribution
 
   if (weightflag && vflag_global == 2)
     force->pair->vflag_either = force->pair->vflag_global = 1;
 
   // no tallying requested
 
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // tallying requested, with or without energy
 
   if (eflag) {
     if (force->newton_bond) eval<1,1,1>(vflag, buffers, fc);
     else eval<1,1,0>(vflag, buffers, fc);
   } else {
     if (force->newton_bond) eval<1,0,1>(vflag, buffers, fc);
     else eval<1,0,0>(vflag, buffers, fc);
   }
 }
 
 #ifndef LMP_USE_AVXCD_DHC
 
 /* ----------------------------------------------------------------------
    scalar kernel (compiled when LMP_USE_AVXCD_DHC is not defined)
    loops over neighbor->dihedrallist, adds the dihedral forces on the four
    atoms to a per-thread force array and adds the weighted 1-4 LJ/Coulomb
    pair term between the first and fourth atom
    template flags:
      EVFLAG      = tally energy and/or virial
      EFLAG       = tally energy (only examined when EVFLAG is set)
      NEWTON_BOND = update all four atoms; otherwise only atoms whose
                    index is < nlocal
    vflag switches the virial sums on, fc holds the per-type coefficients
    dihedral energy/virial go to this style's energy/virial, the 1-4 pair
    energy/virial go to force->pair
 ------------------------------------------------------------------------- */
 
 template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   flt_t * _noalias const q = buffers->get_q(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // f_stride is the offset between the force arrays of consecutive threads
   // (see f = f_start + tid * f_stride below)
 
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // IP_PRE_get_buffers is a macro defined elsewhere; judging by its use
   // here it fills in tc, f_start and ev_global -- TODO confirm
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // o* = totals reduced over all threads (OpenMP reduction below)
   // the matching s* variables inside the parallel region are per-thread sums
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
   if (EVFLAG) {
     if (EFLAG)
       oevdwl = oecoul = oedihedral = (acc_t)0.0;
     if (vflag) {
       ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
       opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
     }
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)		  \
     reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
 	      opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
     // IP_PRE_omp_range_id is a macro defined elsewhere; presumably it sets
     // tid and this thread's share [nfrom,nto) of the inum dihedrals
     // -- TODO confirm
     int nfrom, nto, tid;
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
 
     // this thread's private force array, zeroed when the fix asks for it
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     // read each dihedral list entry as 5 ints: 4 atom indices + type
     const int5_t * _noalias const dihedrallist = 
       (int5_t *) neighbor->dihedrallist[0];
     const flt_t qqrd2e = force->qqrd2e;
 
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
     if (EVFLAG) {
       if (EFLAG)
 	sevdwl = secoul = sedihedral = (acc_t)0.0;
       if (vflag) {
 	sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
 	spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
       }
     }
 
     #if defined(LMP_SIMD_COMPILER_TEST)
     #pragma vector aligned
     #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
                            sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) 
     #endif
     for (int n = nfrom; n < nto; n++) {
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
       const int i4 = dihedrallist[n].d;
       const int type = dihedrallist[n].t;
 
       // 1st bond
       // the w component of x[i1] is read as the atom type of i1
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
       const int itype = x[i1].w;
 
       // 2nd bond
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
       
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
       const int jtype = x[i4].w;
 
       // 1-4
 
       const flt_t delx = x[i1].x - x[i4].x;
       const flt_t dely = x[i1].y - x[i4].y;
       const flt_t delz = x[i1].z - x[i4].z;
 
 
       // c,s calculation
       // a = vb1 x vb2m and b = vb3 x vb2m (cross products)
 
       const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
       const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
       const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
       const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
       const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
       const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
 
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rg = sqrt(rgsq);
 
       // inverses stay 0 for degenerate (zero-length) vectors, which avoids
       // a division by zero
 
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
       const flt_t rabinv = sqrt(ra2inv*rb2inv);
 
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
       // warns only (no abort) when c is outside [MTOLERANCE,PTOLERANCE];
       // compiled out when LMP_SIMD_COMPILER_TEST is defined
       #ifndef LMP_SIMD_COMPILER_TEST
       if (c > PTOLERANCE || c < MTOLERANCE) {
 	int me = comm->me;
 
 	if (screen) {
 	  char str[128];
 	  sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
 		  TAGINT_FORMAT " " TAGINT_FORMAT " "
 		  TAGINT_FORMAT " " TAGINT_FORMAT,
 		  me,tid,update->ntimestep,
 		  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
 	  error->warning(FLERR,str,0);
 	  fprintf(screen,"  1st atom: %d %g %g %g\n",
 		  me,x[i1].x,x[i1].y,x[i1].z);
 	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
 		  me,x[i2].x,x[i2].y,x[i2].z);
 	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
 		  me,x[i3].x,x[i3].y,x[i3].z);
 	  fprintf(screen,"  4th atom: %d %g %g %g\n",
 		  me,x[i4].x,x[i4].y,x[i4].z);
 	}
       }
       #endif
 
       // clamp c to [-1,1]
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t tcos_shift = fc.bp[type].cos_shift;
       const flt_t tsin_shift = fc.bp[type].sin_shift;
       const flt_t tk = fc.bp[type].k;
       const int m = fc.bp[type].multiplicity;
 
       flt_t p = (flt_t)1.0;
       flt_t ddf1, df1;
       ddf1 = df1 = (flt_t)0.0;
 
       // angle-addition recurrence: treating c and s as cos(phi) and sin(phi),
       // m passes leave p = cos(m*phi) and df1 = sin(m*phi)
 
       for (int i = 0; i < m; i++) {
 	ddf1 = p*c - df1*s;
 	df1 = p*s + df1*c;
 	p = ddf1;
       }
 
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 *= -m;
       p += (flt_t)1.0;
       
       // multiplicity 0: constant p, no angular derivative
 
       if (m == 0) {
 	p = (flt_t)1.0 + tcos_shift;
 	df1 = (flt_t)0.0;
       }
 
       const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
       const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
       const flt_t fga = fg*ra2inv*rginv;
       const flt_t hgb = hg*rb2inv*rginv;
       const flt_t gaa = -ra2inv*rg;
       const flt_t gbb = rb2inv*rg;
 
       const flt_t dtfx = gaa*ax;
       const flt_t dtfy = gaa*ay;
       const flt_t dtfz = gaa*az;
       const flt_t dtgx = fga*ax - hgb*bx;
       const flt_t dtgy = fga*ay - hgb*by;
       const flt_t dtgz = fga*az - hgb*bz;
       const flt_t dthx = gbb*bx;
       const flt_t dthy = gbb*by;
       const flt_t dthz = gbb*bz;
 
       const flt_t df = -tk * df1;
 
       const flt_t sx2 = df*dtgx;
       const flt_t sy2 = df*dtgy;
       const flt_t sz2 = df*dtgz;
 
       // dihedral forces on the four atoms; f1 and f4 are not const because
       // the 1-4 pair force is added to them further down
 
       flt_t f1x = df*dtfx;
       flt_t f1y = df*dtfy;
       flt_t f1z = df*dtfz;
 
       const flt_t f2x = sx2 - f1x;
       const flt_t f2y = sy2 - f1y;
       const flt_t f2z = sz2 - f1z;
 
       flt_t f4x = df*dthx;
       flt_t f4y = df*dthy;
       flt_t f4z = df*dthz;
 
       const flt_t f3x = -sx2 - f4x;
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;
 
       // dihedral energy/virial tally; IP_PRE_ev_tally_dihed is a macro
       // defined elsewhere, called here before the 1-4 pair force is added
       // to f1 and f4
 
       if (EVFLAG) {
 	flt_t deng;
 	if (EFLAG) deng = tk * p;
 	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, 
 			      f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, 
 			      vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, 
 			      vb3z, sedihedral, f, NEWTON_BOND, nlocal,
 			      sv0, sv1, sv2, sv3, sv4, sv5);
       }
 
 
       #if defined(LMP_SIMD_COMPILER_TEST)
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i2 < nlocal) {
 	  f[i2].x += f2x;
 	  f[i2].y += f2y;
 	  f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
 	  f[i3].x += f3x;
 	  f[i3].y += f3y;
 	  f[i3].z += f3z;
         }
       }
 
       // 1-4 LJ and Coulomb interactions
       // tally energy/virial in pair, using newton_bond as newton flag
       // NOTE(review): rsq is inverted without a zero check
 
       const flt_t tweight = fc.weight[type];
       const flt_t rsq = delx*delx + dely*dely + delz*delz;
       const flt_t r2inv = (flt_t)1.0/rsq;
       const flt_t r6inv = r2inv*r2inv*r2inv;
 
       flt_t forcecoul;
       if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
       else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
       const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - 
 				     fc.ljp[itype][jtype].lj2);
       const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
 
       if (NEWTON_BOND || i1 < nlocal) {
 	f1x += delx*fpair;
 	f1y += dely*fpair;
 	f1z += delz*fpair;
       }
       if (NEWTON_BOND || i4 < nlocal) {
 	f4x -= delx*fpair;
 	f4y -= dely*fpair;
 	f4z -= delz*fpair;
       }
 
       if (EVFLAG) {
 	// share of this 1-4 pair in the tallies: one half for each of
 	// i1 and i4 that receives a force here
 	flt_t ev_pre = (flt_t)0;
 	if (NEWTON_BOND || i1 < nlocal)
 	  ev_pre += (flt_t)0.5;
 	if (NEWTON_BOND || i4 < nlocal)
 	  ev_pre += (flt_t)0.5;
 
 	if (EFLAG) {
 	  flt_t ecoul, evdwl;
 	  ecoul = tweight * forcecoul;
 	  evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - 
 				     fc.ljp[itype][jtype].lj4);
 	  secoul += ev_pre * ecoul;
 	  sevdwl += ev_pre * evdwl;
 	  if (eatom) {
 	    // per-atom energy: half of (evdwl + ecoul) goes into the w
 	    // component of the force entry of each updated atom
 	    evdwl *= (flt_t)0.5;
 	    evdwl += (flt_t)0.5 * ecoul;
 	    if (NEWTON_BOND || i1 < nlocal)
 	      f[i1].w += evdwl;
 	    if (NEWTON_BOND || i4 < nlocal)
 	      f[i4].w += evdwl;
 	  }
 	}
 	//	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
 	//				   delx, dely, delz);
 	if (vflag) {                                                    
 	  spv0 += ev_pre * delx * delx * fpair;                               
 	  spv1 += ev_pre * dely * dely * fpair;                               
 	  spv2 += ev_pre * delz * delz * fpair;                               
 	  spv3 += ev_pre * delx * dely * fpair;                               
 	  spv4 += ev_pre * delx * delz * fpair;                               
 	  spv5 += ev_pre * dely * delz * fpair;                               
 	}                                                                    
       }
 
       // apply force to each of 4 atoms
       #if defined(LMP_SIMD_COMPILER_TEST)
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
 	  f[i1].x += f1x;
 	  f[i1].y += f1y;
 	  f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
 	  f[i4].x += f4x;
 	  f[i4].y += f4y;
 	  f[i4].z += f4z;
         }
       }
     } // for n
     // fold this thread's partial sums into the reduction variables
     if (EVFLAG) {
       if (EFLAG) {
 	oedihedral += sedihedral;
 	oecoul += secoul;
 	oevdwl += sevdwl;
       }
       if (vflag) {
 	ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
 	opv0 += spv0; opv1 += spv1; opv2 += spv2; 
 	opv3 += spv3; opv4 += spv4; opv5 += spv5;
       }
     }
   } // omp parallel
 
   // dihedral totals go to this style, 1-4 pair totals go to force->pair
 
   if (EVFLAG) {
     if (EFLAG) {
       energy += oedihedral;
       force->pair->eng_vdwl += oevdwl;
       force->pair->eng_coul += oecoul;
     }
     if (vflag) {
       virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
       virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
       force->pair->virial[0] += opv0;
       force->pair->virial[1] += opv1;
       force->pair->virial[2] += opv2;
       force->pair->virial[3] += opv3;
       force->pair->virial[4] += opv4;
       force->pair->virial[5] += opv5;
     }
   }
 
   fix->set_reduce_flag();
 }
 
 #else
 
 /* ----------------------------------------------------------------------
 
 Vector intrinsics are temporarily being used for this CHARMM dihedral
 style to allow for advanced features in the AVX512 instruction set to
 be exploited on early hardware. We hope to see compiler improvements for
 AVX512 that will eliminate this requirement, so it is not recommended to
 develop code based on the intrinsics implementation. Please e-mail the
 authors for more details.
 
 ------------------------------------------------------------------------- */
 
 template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
 
 {
   typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
   typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
   const int swidth = SIMD_type<flt_t>::width();
 
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   flt_t * _noalias const q = buffers->get_q(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
   if (EVFLAG) {
     if (EFLAG)
       oevdwl = oecoul = oedihedral = (acc_t)0.0;
     if (vflag) {
       ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
       opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
     }
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)		  \
     reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
 	      opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
     int nfrom, nto, tid;
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int * _noalias const dihedrallist = 
       (int *) neighbor->dihedrallist[0];
     const flt_t * _noalias const weight = &(fc.weight[0]);
     const flt_t * _noalias const x_f = &(x[0].x);
     const flt_t * _noalias const cos_shift = &(fc.bp[0].cos_shift);
     const flt_t * _noalias const sin_shift = &(fc.bp[0].sin_shift);
     const flt_t * _noalias const k = &(fc.bp[0].k);
     const int * _noalias const multiplicity = &(fc.bp[0].multiplicity);
     const flt_t * _noalias const plj1 = &(fc.ljp[0][0].lj1);
     const flt_t * _noalias const plj2 = &(fc.ljp[0][0].lj2);
     const flt_t * _noalias const plj3 = &(fc.ljp[0][0].lj3);
     const flt_t * _noalias const plj4 = &(fc.ljp[0][0].lj4);
     acc_t * _noalias const pforce= &(f[0].x);
     acc_t * _noalias const featom = &(f[0].w);
     const flt_t qqrd2e = force->qqrd2e;
 
     SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
     if (EVFLAG) {
       if (EFLAG) {
 	sevdwl = SIMD_set((acc_t)0.0);
 	secoul = SIMD_set((acc_t)0.0);
 	sedihedral = SIMD_set((acc_t)0.0);
       }
       if (vflag) {
 	sv0 = SIMD_set((acc_t)0.0);
 	sv1 = SIMD_set((acc_t)0.0);
 	sv2 = SIMD_set((acc_t)0.0);
 	sv3 = SIMD_set((acc_t)0.0);
 	sv4 = SIMD_set((acc_t)0.0);
 	sv5 = SIMD_set((acc_t)0.0);
 	spv0 = SIMD_set((acc_t)0.0);
 	spv1 = SIMD_set((acc_t)0.0);
 	spv2 = SIMD_set((acc_t)0.0);
 	spv3 = SIMD_set((acc_t)0.0);
 	spv4 = SIMD_set((acc_t)0.0);
 	spv5 = SIMD_set((acc_t)0.0);
       }
     }
 
     SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
 				 55, 60, 65, 70, 75) + (nfrom * 5);
     const int nto5 = nto * 5;
     const int nlocals4 = nlocal << 4;
     const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
     const int ntypes = atom->ntypes + 1;
 
     for (int n = nfrom; n < nto; n += swidth) {
       SIMD_mask nmask = n_offset < nto5;
       SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
       const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
       i1 = i1 << 4;
       const SIMD_int i2 = SIMD_gather(nmask, dihedrallist+1, n_offset) << 4;
       const SIMD_int i3 = SIMD_gather(nmask, dihedrallist+2, n_offset) << 4;
       SIMD_int i4 = SIMD_gather(nmask, dihedrallist+3, n_offset);
       const SIMD_flt_t q4 = SIMD_gather(nmask, q, i4);
       i4 = i4 << 4;
       SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
       const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
       type = type << 2;
       n_offset = n_offset + swidth * 5;
 
       // 1st bond
 
       SIMD_flt_t x1, x2, y1, y2, z1, z2;
       SIMD_int itype;
 
       SIMD_atom_gather(nmask, x_f, i1, x1, y1, z1, itype);
       SIMD_atom_gather(nmask, x_f, i2, x2, y2, z2);
 
       const SIMD_flt_t vb1x = x1 - x2;
       const SIMD_flt_t vb1y = y1 - y2;
       const SIMD_flt_t vb1z = z1 - z2;
 
       // 2nd bond
 
       SIMD_flt_t x3, y3, z3;
 
       SIMD_atom_gather(nmask, x_f, i3, x3, y3, z3);
 
       const SIMD_flt_t vb2xm = x2 - x3;
       const SIMD_flt_t vb2ym = y2 - y3;
       const SIMD_flt_t vb2zm = z2 - z3;
 
       // 3rd bond
       
       SIMD_flt_t x4, y4, z4;
       SIMD_int jtype;
 
       SIMD_atom_gather(nmask, x_f, i4, x4, y4, z4, jtype);
 
       const SIMD_flt_t vb3x = x4 - x3;
       const SIMD_flt_t vb3y = y4 - y3;
       const SIMD_flt_t vb3z = z4 - z3;
 
       // 1-4
 
       const SIMD_flt_t delx = x1 - x4;
       const SIMD_flt_t dely = y1 - y4;
       const SIMD_flt_t delz = z1 - z4;
 
       // c,s calculation
 
       const SIMD_flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
       const SIMD_flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
       const SIMD_flt_t az = vb1x*vb2ym - vb1y*vb2xm;
       const SIMD_flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
       const SIMD_flt_t by = vb3z*vb2xm - vb3x*vb2zm;
       const SIMD_flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
 
       const SIMD_flt_t rasq = ax*ax + ay*ay + az*az;
       const SIMD_flt_t rbsq = bx*bx + by*by + bz*bz;
       const SIMD_flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const SIMD_flt_t rg = SIMD_sqrt(rgsq);
 
       const SIMD_flt_t szero = SIMD_set((flt_t)0.0);
       const SIMD_flt_t rginv = SIMD_rcpz(rg > szero, rg);
       const SIMD_flt_t ra2inv = SIMD_rcpz(rasq > szero, rasq);
       const SIMD_flt_t rb2inv = SIMD_rcpz(rbsq > szero, rbsq);
       const SIMD_flt_t rabinv = SIMD_sqrt(ra2inv*rb2inv);
 
       SIMD_flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const SIMD_flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
       const SIMD_flt_t one = SIMD_set((flt_t)1.0);
       const SIMD_flt_t mone = SIMD_set((flt_t)-1.0);
 
       const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
       const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
       if (c > ptol || c < ntol)
 	if (screen)
           error->warning(FLERR,"Dihedral problem.");
 
       c = SIMD_set(c, c > one, one);
       c = SIMD_set(c, c < mone, mone);
 
       const SIMD_flt_t tcos_shift = SIMD_gather(nmask, cos_shift, type);
       const SIMD_flt_t tsin_shift = SIMD_gather(nmask, sin_shift, type);
       const SIMD_flt_t tk = SIMD_gather(nmask, k, type);
       const SIMD_int m = SIMD_gatherz_offset<flt_t>(nmask, multiplicity, type);
 
       SIMD_flt_t p(one);
       SIMD_flt_t ddf1(szero);
       SIMD_flt_t df1(szero);
       
       const int m_max = SIMD_max(m);
 
       for (int i = 0; i < m_max; i++) {
 	const SIMD_mask my_m = i < m;
 	ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
 	df1 = SIMD_set(df1, my_m, p*s + df1*c);
 	p = SIMD_set(p, my_m, ddf1);
       }
 
       SIMD_flt_t multf;
       SIMD_cast(-m,multf);
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 = df1 * multf;
       p = p + one;
       
       SIMD_mask mzero = (m == SIMD_set((int)0));
       p = SIMD_set(p, mzero, one + tcos_shift);
       df1 = SIMD_set(df1, mzero, szero);
 
       const SIMD_flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
       const SIMD_flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
       const SIMD_flt_t fga = fg*ra2inv*rginv;
       const SIMD_flt_t hgb = hg*rb2inv*rginv;
       const SIMD_flt_t gaa = -ra2inv*rg;
       const SIMD_flt_t gbb = rb2inv*rg;
 
       const SIMD_flt_t dtfx = gaa*ax;
       const SIMD_flt_t dtfy = gaa*ay;
       const SIMD_flt_t dtfz = gaa*az;
       const SIMD_flt_t dtgx = fga*ax - hgb*bx;
       const SIMD_flt_t dtgy = fga*ay - hgb*by;
       const SIMD_flt_t dtgz = fga*az - hgb*bz;
       const SIMD_flt_t dthx = gbb*bx;
       const SIMD_flt_t dthy = gbb*by;
       const SIMD_flt_t dthz = gbb*bz;
 
       const SIMD_flt_t df = -tk * df1;
 
       const SIMD_flt_t sx2 = df*dtgx;
       const SIMD_flt_t sy2 = df*dtgy;
       const SIMD_flt_t sz2 = df*dtgz;
 
       SIMD_flt_t f1x = df*dtfx;
       SIMD_flt_t f1y = df*dtfy;
       SIMD_flt_t f1z = df*dtfz;
 
       SIMD_flt_t f2x = sx2 - f1x;
       SIMD_flt_t f2y = sy2 - f1y;
       SIMD_flt_t f2z = sz2 - f1z;
 
       SIMD_flt_t f4x = df*dthx;
       SIMD_flt_t f4y = df*dthy;
       SIMD_flt_t f4z = df*dthz;
 
       SIMD_flt_t f3x = -sx2 - f4x;
       SIMD_flt_t f3y = -sy2 - f4y;
       SIMD_flt_t f3z = -sz2 - f4z;
 
       SIMD_flt_t qdeng;
       if (EVFLAG) {
 	SIMD_flt_t ev_pre;
 	if (NEWTON_BOND) ev_pre = one;
 	else {
 	  ev_pre = szero;
 	  const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
 	  ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
 	  ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
 	  ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
 	  ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
 	}
 	SIMD_zero_masked(nmask, ev_pre);
 	if (EFLAG) {
 	  const SIMD_flt_t deng = tk * p;
 	  sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
 	  if (eatom) {
 	    qdeng = deng * SIMD_set((flt_t)0.25);
 	    SIMD_mask newton_mask;
 	    if (NEWTON_BOND) newton_mask = nmask;
 	    if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
 	    SIMD_flt_t ieng = qdeng;
 	    SIMD_jeng_update(newton_mask, featom, i2, ieng);
 	    ieng = qdeng;
 	    if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
 	    SIMD_jeng_update(newton_mask, featom, i3, ieng);
 	  }
 	}
 	if (vflag) {
           sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
 	  sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
 	  sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
 	  sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
 	  sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
 	  sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
 	}
       }
 
       SIMD_mask newton_mask;
       if (NEWTON_BOND) newton_mask = nmask;
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i2, f2x, f2y, f2z);
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i3, f3x, f3y, f3z);
 
       // 1-4 LJ and Coulomb interactions
       // tally energy/virial in pair, using newton_bond as newton flag
 
       const SIMD_flt_t rsq = delx*delx + dely*dely + delz*delz;
       const SIMD_flt_t r2inv = SIMD_rcpz(nmask, rsq);
       const SIMD_flt_t r6inv = r2inv*r2inv*r2inv;
 
       const SIMD_flt_t simd_qqrd2e = SIMD_set(qqrd2e);
       SIMD_flt_t forcecoul;
       if (implicit) forcecoul = simd_qqrd2e * q1 * q4 * r2inv;
       else forcecoul = simd_qqrd2e * q1 * q4 * SIMD_sqrt(r2inv);
 
       const SIMD_int ijtype = (itype * ntypes + jtype) << 2;
       const SIMD_flt_t lj1 = SIMD_gather(nmask, plj1, ijtype);
       const SIMD_flt_t lj2 = SIMD_gather(nmask, plj2, ijtype);
       const SIMD_flt_t forcelj = r6inv * (lj1 * r6inv - lj2);
       const SIMD_flt_t fpair = tweight * (forcelj + forcecoul) * r2inv;
 
       f1x = f1x + delx * fpair;
       f1y = f1y + dely * fpair;
       f1z = f1z + delz * fpair;
       f4x = f4x - delx * fpair;
       f4y = f4y - dely * fpair;
       f4z = f4z - delz * fpair;
 
       if (EVFLAG) {
 	SIMD_flt_t ev_pre;
 	if (NEWTON_BOND) ev_pre = one;
 	else {
 	  ev_pre = szero;
           const SIMD_flt_t half = SIMD_set((flt_t)0.5);
           ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
           ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
 	}
 	SIMD_zero_masked(nmask, ev_pre);
 
 	if (EFLAG) {
 	  const SIMD_flt_t ecoul = tweight * forcecoul;
 	  const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
 	  const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
 	  SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
 	  secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
 	  sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
 	  if (eatom) {
  	    const SIMD_flt_t half = SIMD_set((flt_t)0.5);
 	    evdwl = evdwl * half;
 	    evdwl = evdwl + half * ecoul + qdeng;
 
             if (NEWTON_BOND) newton_mask = nmask;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
             SIMD_flt_t ieng = evdwl;
             SIMD_jeng_update(newton_mask, featom, i1, ieng);
             ieng = evdwl;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
             SIMD_jeng_update(newton_mask, featom, i4, ieng);
 	  }
 	}
 	if (vflag) {                                                    
           spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
 	  spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
 	  spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
 	  spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
 	  spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
 	  spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
 	}                                                                    
       }
 
       if (NEWTON_BOND) newton_mask = nmask;
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i1, f1x, f1y, f1z);
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
     } // for n
 
     if (EVFLAG) {
       if (EFLAG) {
 	oedihedral += SIMD_sum(sedihedral);
 	oecoul += SIMD_sum(secoul);
 	oevdwl += SIMD_sum(sevdwl);
       }
       if (vflag) {
 	ov0 += SIMD_sum(sv0); 
 	ov1 += SIMD_sum(sv1); 
 	ov2 += SIMD_sum(sv2); 
 	ov3 += SIMD_sum(sv3); 
 	ov4 += SIMD_sum(sv4); 
 	ov5 += SIMD_sum(sv5);
 	opv0 += SIMD_sum(spv0); 
 	opv1 += SIMD_sum(spv1); 
 	opv2 += SIMD_sum(spv2); 
 	opv3 += SIMD_sum(spv3); 
 	opv4 += SIMD_sum(spv4); 
 	opv5 += SIMD_sum(spv5);
       }
     }
   } // omp parallel
 
   if (EVFLAG) {
     if (EFLAG) {
       energy += oedihedral;
       force->pair->eng_vdwl += oevdwl;
       force->pair->eng_coul += oecoul;
     }
     if (vflag) {
       virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
       virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
       force->pair->virial[0] += opv0;
       force->pair->virial[1] += opv1;
       force->pair->virial[2] += opv2;
       force->pair->virial[3] += opv3;
       force->pair->virial[4] += opv4;
       force->pair->virial[5] += opv5;
     }
   }
 
   fix->set_reduce_flag();
 }
 
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralCharmmIntel::init_style()
 {
   DihedralCharmm::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
                                            IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Copy the style's coefficient arrays into the flt_t-typed tables of fc.
   // Both table sizes are the type count plus one, and index 0 is copied too.
   const int npair = atom->ntypes + 1;
   const int ndihedral = atom->ndihedraltypes + 1;
   fc.set_ntypes(npair,ndihedral,memory);
   buffers->set_ntypes(npair);
 
   // Per-dihedral-type parameters and 1-4 weights.
   for (int itype = 0; itype < ndihedral; ++itype) {
     fc.weight[itype] = weight[itype];
     fc.bp[itype].k = k[itype];
     fc.bp[itype].multiplicity = multiplicity[itype];
     fc.bp[itype].cos_shift = cos_shift[itype];
     fc.bp[itype].sin_shift = sin_shift[itype];
   }
 
   // lj14_1..lj14_4 coefficients for every (row,col) atom-type pair.
   for (int row = 0; row < npair; ++row) {
     for (int col = 0; col < npair; ++col) {
       fc.ljp[row][col].lj1 = lj14_1[row][col];
       fc.ljp[row][col].lj2 = lj14_2[row][col];
       fc.ljp[row][col].lj3 = lj14_3[row][col];
       fc.ljp[row][col].lj4 = lj14_4[row][col];
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the packed coefficient tables held by this ForceConst.
 //   npairtypes  new edge length of the square ljp table (<= 0 means none)
 //   nbondtypes  new length of the bp and weight arrays (<= 0 means none)
 //   memory      Memory object used for the new allocations
 // Arrays are only touched when the corresponding size actually changes.
 // Existing arrays are released through the stored _memory pointer, i.e. the
 // one recorded by the previous call.
 template <class flt_t>
 void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
                                                         const int nbondtypes,
                                                         Memory *memory) {
   // _memory is only assigned at the bottom of this function, so it does not
   // yet refer to this call's Memory object (and, unless a constructor not
   // shown here sets it, is not valid at all on the first call). Allocate
   // through the argument, falling back to the stored pointer when the
   // caller passes a null pointer.
   Memory * const allocator = memory ? memory : _memory;
 
   if (npairtypes != _npairtypes) {
     if (_npairtypes > 0)
       _memory->destroy(ljp);
     if (npairtypes > 0)
       allocator->create(ljp,npairtypes,npairtypes,"fc.ljp");
   }
 
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0) {
       _memory->destroy(bp);
       _memory->destroy(weight);
     }
 
     if (nbondtypes > 0) {
       allocator->create(bp,nbondtypes,"dihedralcharmmintel.bp");
       allocator->create(weight,nbondtypes,"dihedralcharmmintel.weight");
     }
   }
   _npairtypes = npairtypes;
   _nbondtypes = nbondtypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp
index aa6ba9d1d..03ab152f4 100644
--- a/src/USER-INTEL/dihedral_harmonic_intel.cpp
+++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp
@@ -1,411 +1,411 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
-#include "mpi.h"
-#include "math.h"
+#include <mpi.h>
+#include <math.h>
 #include "dihedral_harmonic_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "error.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // Bounds on the dihedral cosine c: eval() emits a "Dihedral problem" warning
 // when c is above PTOLERANCE or below MTOLERANCE. The casts require a type
 // named flt_t to be in scope wherever the macros are expanded.
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // Layout used to read one entry of neighbor->dihedrallist: eval() takes
 // a,b,c,d as the four atom indices and t as the dihedral type.
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 DihedralHarmonicIntel::DihedralHarmonicIntel(class LAMMPS *lmp)
   : DihedralHarmonic(lmp)
 {
   // Add the INTEL bit to whatever suffix flags the base class already set.
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralHarmonicIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // init_style() sets _use_base when the offload balance is non-zero; in
   // that case the base-class implementation does all the work.
   if (_use_base) {
     DihedralHarmonic::compute(eflag, vflag);
     return;
   }
   #endif
 
   // Select the templated kernel matching the fix's precision mode; anything
   // that is neither double nor mixed runs fully in single precision.
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   } else if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else {
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void DihedralHarmonicIntel::compute(int eflag, int vflag,
                                     IntelBuffers<flt_t,acc_t> *buffers,
                                     const ForceConst<flt_t> &fc)
 {
   // Set up tallying only when energy or virial output was requested;
   // otherwise clear evflag directly.
   if (!eflag && !vflag) evflag = 0;
   else ev_setup(eflag,vflag);
 
   // eval<EVFLAG,EFLAG,NEWTON_BOND>: turn the run-time flags into
   // compile-time template arguments.
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   if (force->newton_bond) {
     if (eflag) eval<1,1,1>(vflag, buffers, fc);
     else eval<1,0,1>(vflag, buffers, fc);
   } else {
     if (eflag) eval<1,1,0>(vflag, buffers, fc);
     else eval<1,0,0>(vflag, buffers, fc);
   }
 }
 
 // Force kernel for the harmonic dihedral style.
 //
 // Template parameters:
 //   EVFLAG       non-zero enables the energy/virial tally code paths
 //   EFLAG        non-zero enables the energy part of the tally
 //   NEWTON_BOND  non-zero applies forces to all four atoms; zero applies
 //                them only to atoms whose index is below nlocal
 //   flt_t/acc_t  floating-point types for the arithmetic / accumulators
 // Arguments:
 //   vflag    non-zero enables the virial accumulators
 //   buffers  source of the packed coordinates and the force stride
 //   fc       per-dihedral-type constants (cos_shift, sin_shift, k,
 //            multiplicity)
 // Each thread works on its own range of neighbor->dihedrallist and writes
 // forces into its own slice of the force buffer; energy/virial partial sums
 // are combined via the OpenMP reduction clause.
 template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralHarmonicIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // Size of one thread's force slice: ghost atoms are only included when
   // forces on them are actually written (NEWTON_BOND).
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // NOTE(review): IP_PRE_get_buffers is defined elsewhere; it presumably
   // fills tc (thread count), f_start and ev_global -- confirm.
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // Global accumulators; only the ones enabled by the flags are zeroed.
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EVFLAG) {
     if (EFLAG)
       oedihedral = (acc_t)0.0;
     if (vflag) {
       ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     }
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)		  \
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // NOTE(review): IP_PRE_omp_range_id is defined elsewhere; it presumably
     // sets this thread's [nfrom,nto) share of inum and its id tid -- confirm.
     int nfrom, nto, tid;
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
 
     // Thread-private slice of the force buffer, cleared when the fix asks
     // for it.
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int5_t * _noalias const dihedrallist = 
       (int5_t *) neighbor->dihedrallist[0];
 
     // Per-thread partial sums, added to the reduction variables after the
     // loop.
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EVFLAG) {
       if (EFLAG)
 	sedihedral = (acc_t)0.0;
       if (vflag) {
 	sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
     }
 
     for (int n = nfrom; n < nto; n++) {
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
       const int i4 = dihedrallist[n].d;
       const int type = dihedrallist[n].t;
 
       // 1st bond
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
 
       // 2nd bond
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
       
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
 
       // c,s calculation
 
       // a = vb1 x vb2m and b = vb3 x vb2m
       const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
       const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
       const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
       const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
       const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
       const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
 
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rg = sqrt(rgsq);
 
       // Inverses stay zero for degenerate geometry instead of dividing by 0.
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
       const flt_t rabinv = sqrt(ra2inv*rb2inv);
 
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
       // Only reports the problem (when screen is set); execution continues
       // and c is clamped to [-1,1] below.
       if (c > PTOLERANCE || c < MTOLERANCE) {
 	int me = comm->me;
 
 	if (screen) {
 	  char str[128];
 	  sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
 		  TAGINT_FORMAT " " TAGINT_FORMAT " "
 		  TAGINT_FORMAT " " TAGINT_FORMAT,
 		  me,tid,update->ntimestep,
 		  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
 	  error->warning(FLERR,str,0);
 	  fprintf(screen,"  1st atom: %d %g %g %g\n",
 		  me,x[i1].x,x[i1].y,x[i1].z);
 	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
 		  me,x[i2].x,x[i2].y,x[i2].z);
 	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
 		  me,x[i3].x,x[i3].y,x[i3].z);
 	  fprintf(screen,"  4th atom: %d %g %g %g\n",
 		  me,x[i4].x,x[i4].y,x[i4].z);
 	}
       }
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t tcos_shift = fc.bp[type].cos_shift;
       const flt_t tsin_shift = fc.bp[type].sin_shift;
       const flt_t tk = fc.bp[type].k;
       const int m = fc.bp[type].multiplicity;
 
       // Angle-addition recurrence: treating c and s as cos(phi) and
       // sin(phi), after m passes p and df1 hold cos(m*phi) and sin(m*phi).
       flt_t p = (flt_t)1.0;
       flt_t ddf1, df1;
       ddf1 = df1 = (flt_t)0.0;
 
       for (int i = 0; i < m; i++) {
 	ddf1 = p*c - df1*s;
 	df1 = p*s + df1*c;
 	p = ddf1;
       }
 
       // Apply the per-type shift; p becomes the energy factor (energy is
       // tk * p) and df1 the derivative factor scaled by -m.
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 *= -m;
       p += (flt_t)1.0;
       
       // Multiplicity 0: constant energy term and no force contribution.
       if (m == 0) {
 	p = (flt_t)1.0 + tcos_shift;
 	df1 = (flt_t)0.0;
       }
 
       const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
       const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
       const flt_t fga = fg*ra2inv*rginv;
       const flt_t hgb = hg*rb2inv*rginv;
       const flt_t gaa = -ra2inv*rg;
       const flt_t gbb = rb2inv*rg;
 
       const flt_t dtfx = gaa*ax;
       const flt_t dtfy = gaa*ay;
       const flt_t dtfz = gaa*az;
       const flt_t dtgx = fga*ax - hgb*bx;
       const flt_t dtgy = fga*ay - hgb*by;
       const flt_t dtgz = fga*az - hgb*bz;
       const flt_t dthx = gbb*bx;
       const flt_t dthy = gbb*by;
       const flt_t dthz = gbb*bz;
 
       const flt_t df = -tk * df1;
 
       const flt_t sx2 = df*dtgx;
       const flt_t sy2 = df*dtgy;
       const flt_t sz2 = df*dtgz;
 
       // Forces on the four atoms; by construction f1 + f2 + f3 + f4 = 0.
       flt_t f1x = df*dtfx;
       flt_t f1y = df*dtfy;
       flt_t f1z = df*dtfz;
 
       const flt_t f2x = sx2 - f1x;
       const flt_t f2y = sy2 - f1y;
       const flt_t f2z = sz2 - f1z;
 
       flt_t f4x = df*dthx;
       flt_t f4y = df*dthy;
       flt_t f4z = df*dthz;
 
       const flt_t f3x = -sx2 - f4x;
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;
 
       // NOTE(review): IP_PRE_ev_tally_dihed is a macro defined elsewhere;
       // it presumably accumulates into sedihedral and sv0..sv5 -- confirm.
       if (EVFLAG) {
 	flt_t deng;
 	if (EFLAG) deng = tk * p;
 	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, 
 			      f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, 
 			      vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, 
 			      vb3z, sedihedral, f, NEWTON_BOND, nlocal,
 			      sv0, sv1, sv2, sv3, sv4, sv5);
       }
 
       // Scatter forces into this thread's slice; without NEWTON_BOND only
       // atoms with index below nlocal receive a contribution.
       {
         if (NEWTON_BOND || i1 < nlocal) {
 	  f[i1].x += f1x;
 	  f[i1].y += f1y;
 	  f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
 	  f[i2].x += f2x;
 	  f[i2].y += f2y;
 	  f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
 	  f[i3].x += f3x;
 	  f[i3].y += f3y;
 	  f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
 	  f[i4].x += f4x;
 	  f[i4].y += f4y;
 	  f[i4].z += f4z;
         }
       }
     } // for n
     // Fold this thread's partial sums into the reduction variables.
     if (EVFLAG) {
       if (EFLAG) oedihedral += sedihedral;
       if (vflag) {
 	ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
       }
     }
   } // omp parallel
 
   // Add the reduced totals to the style's energy and virial members.
   if (EVFLAG) {
     if (EFLAG) energy += oedihedral;
     if (vflag) {
       virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
       virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
     }
   }
 
   // NOTE(review): presumably tells the fix that the per-thread force
   // slices still need to be reduced -- confirm against FixIntel.
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralHarmonicIntel::init_style()
 {
   DihedralHarmonic::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
                                              IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Copy the style's per-type coefficients into the flt_t-typed table of fc.
   // The table has one entry per dihedral type plus one, and index 0 is
   // copied as well. The buffers argument is not referenced here.
   const int ntable = atom->ndihedraltypes + 1;
   fc.set_ntypes(ntable,memory);
 
   for (int itype = 0; itype < ntable; ++itype) {
     fc.bp[itype].k = k[itype];
     fc.bp[itype].multiplicity = multiplicity[itype];
     fc.bp[itype].cos_shift = cos_shift[itype];
     fc.bp[itype].sin_shift = sin_shift[itype];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the packed per-type table bp held by this ForceConst.
 //   nbondtypes  new length of bp (<= 0 means no table)
 //   memory      Memory object used for the new allocation
 // Nothing is reallocated when the size is unchanged. An existing table is
 // released through the stored _memory pointer, i.e. the one recorded by
 // the previous call.
 template <class flt_t>
 void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
                                                           Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(bp);
 
     if (nbondtypes > 0) {
       // _memory is only assigned at the bottom of this function, so it does
       // not yet refer to this call's Memory object (and, unless a
       // constructor not shown here sets it, is not valid at all on the
       // first call). Allocate through the argument, falling back to the
       // stored pointer when the caller passes a null pointer.
       Memory * const allocator = memory ? memory : _memory;
       // NOTE(review): the allocation label names the charmm style; it is
       // kept unchanged here.
       allocator->create(bp,nbondtypes,"dihedralcharmmintel.bp");
     }
   }
   _nbondtypes = nbondtypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp
index 1edb52f05..bfd5a5395 100644
--- a/src/USER-INTEL/dihedral_opls_intel.cpp
+++ b/src/USER-INTEL/dihedral_opls_intel.cpp
@@ -1,438 +1,438 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
-#include "mpi.h"
-#include "math.h"
+#include <mpi.h>
+#include <math.h>
 #include "dihedral_opls_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "error.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // Bounds on the dihedral cosine c: eval() emits a "Dihedral problem" warning
 // when c is above PTOLERANCE or below MTOLERANCE. All casts below require a
 // type named flt_t to be in scope wherever the macros are expanded.
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // eval() replaces 1/sqrt(sin2) with INVSMALL when sin2 < SMALL2
 // (INVSMALL equals 1/sqrt(SMALL2)).
 #define SMALL2     (flt_t)0.000001
 #define INVSMALL   (flt_t)1000.0
 // Same pattern for 1/sqrt(sinsq): INVSMALLER is used when sinsq < SMALLER2
 // (INVSMALLER equals 1/sqrt(SMALLER2)).
 #define SMALLER2   (flt_t)0.0000000001
 #define INVSMALLER (flt_t)100000.0
 // Layout used to read one entry of neighbor->dihedrallist: eval() takes
 // a,b,c,d as the four atom indices and t as the dihedral type.
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 DihedralOPLSIntel::DihedralOPLSIntel(class LAMMPS *lmp)
   : DihedralOPLS(lmp)
 {
   // Add the INTEL bit to whatever suffix flags the base class already set.
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralOPLSIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // init_style() sets _use_base when the offload balance is non-zero; in
   // that case the base-class implementation does all the work.
   if (_use_base) {
     DihedralOPLS::compute(eflag, vflag);
     return;
   }
   #endif
 
   // Select the templated kernel matching the fix's precision mode; anything
   // that is neither double nor mixed runs fully in single precision.
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   } else if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else {
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::compute(int eflag, int vflag,
                                 IntelBuffers<flt_t,acc_t> *buffers,
                                 const ForceConst<flt_t> &fc)
 {
   // Set up tallying only when energy or virial output was requested;
   // otherwise clear evflag directly.
   if (!eflag && !vflag) evflag = 0;
   else ev_setup(eflag,vflag);
 
   // eval<EVFLAG,EFLAG,NEWTON_BOND>: turn the run-time flags into
   // compile-time template arguments.
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   if (force->newton_bond) {
     if (eflag) eval<1,1,1>(vflag, buffers, fc);
     else eval<1,0,1>(vflag, buffers, fc);
   } else {
     if (eflag) eval<1,1,0>(vflag, buffers, fc);
     else eval<1,0,0>(vflag, buffers, fc);
   }
 }
 
 // Force kernel for the OPLS dihedral style.
 //
 // Template parameters:
 //   EVFLAG       non-zero enables the energy/virial tally code paths
 //   EFLAG        non-zero enables the energy part of the tally
 //   NEWTON_BOND  non-zero applies forces to all four atoms; zero applies
 //                them only to atoms whose index is below nlocal
 //   flt_t/acc_t  floating-point types for the arithmetic / accumulators
 // Arguments:
 //   vflag    non-zero enables the virial accumulators
 //   buffers  source of the packed coordinates and the force stride
 //   fc       per-dihedral-type constants k1..k4
 // Each thread works on its own range of neighbor->dihedrallist and writes
 // forces into its own slice of the force buffer; energy/virial partial sums
 // are combined via the OpenMP reduction clause.
 template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralOPLSIntel::eval(const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // Size of one thread's force slice: ghost atoms are only included when
   // forces on them are actually written (NEWTON_BOND).
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // NOTE(review): IP_PRE_get_buffers is defined elsewhere; it presumably
   // fills tc (thread count), f_start and ev_global -- confirm.
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // Global accumulators; only the ones enabled by the flags are zeroed.
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EVFLAG) {
     if (EFLAG)
       oedihedral = (acc_t)0.0;
     if (vflag) {
       ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     }
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)		  \
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // NOTE(review): IP_PRE_omp_range_id is defined elsewhere; it presumably
     // sets this thread's [nfrom,nto) share of inum and its id tid -- confirm.
     int nfrom, nto, tid;
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
 
     // Thread-private slice of the force buffer, cleared when the fix asks
     // for it.
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
 
     // Per-thread partial sums, added to the reduction variables after the
     // loop.
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EVFLAG) {
       if (EFLAG)
         sedihedral = (acc_t)0.0;
       if (vflag) {
         sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       }
     }
 
     for (int n = nfrom; n < nto; n++) {
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
       const int i4 = dihedrallist[n].d;
       const int type = dihedrallist[n].t;
 
       // 1st bond
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
 
       // 2nd bond
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
 
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
 
       // The 1-4 displacement (x[i1] - x[i4]) is not computed here: nothing
       // in this kernel consumes it.
 
       // c0 calculation
       // 1st and 2nd angle
 
       const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
       const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2);
       const flt_t sb1 = (flt_t)1.0 / b1mag2;
 
       const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2);
       const flt_t sb2 = (flt_t)1.0 / b2mag2;
 
       const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
       const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2);
       const flt_t sb3 = (flt_t)1.0 / b3mag2;
 
       const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
 
       flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
       const flt_t r12c1 =  rb1 * rb2;
       const flt_t c1mag = ctmp * r12c1;
 
       ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
       const flt_t r12c2 =  rb2 * rb3;
       const flt_t c2mag = ctmp * r12c2;
 
       // cos and sin of 2 angles and final c
 
       // 1/sin of each bond angle, capped at INVSMALL when sin^2 < SMALL2.
       flt_t sin2 = MAX((flt_t)1.0 - c1mag*c1mag,(flt_t)0.0);
       flt_t sc1 = (flt_t)1.0/sqrt(sin2);
       if (sin2 < SMALL2) sc1 = INVSMALL;
 
       sin2 = MAX((flt_t)1.0 - c2mag*c2mag,(flt_t)0.0);
       flt_t sc2 = (flt_t)1.0/sqrt(sin2);
       if (sin2 < SMALL2) sc2 = INVSMALL;
 
       const flt_t s1 = sc1 * sc1;
       const flt_t s2 = sc2 * sc2;
       flt_t s12 = sc1 * sc2;
       flt_t c = (c0 + c1mag*c2mag) * s12;
 
       // The sign of the dihedral angle (formerly derived from
       // (vb1 x vb2m) . vb3) is not needed: p and pd below depend on c only.
 
       // error check
       // Only reports the problem (when screen is set); execution continues
       // and c is clamped to [-1,1] below.
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me = comm->me;
 
         if (screen) {
           char str[128];
           sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,tid,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
           error->warning(FLERR,str,0);
           fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
           fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
           fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
           fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       // force & energy
       // p = sum (i=1,4) k_i * (1 + (-1)**(i+1)*cos(i*phi) )
       // pd = dp/dc
 
       // With c = cos(phi): the cos_kphi terms are multiple-angle cosines and
       // the sin_kphim terms equal sin(k*phi)/sin(phi), so pd needs no
       // explicit 1/sin(phi) factor.
       const flt_t cossq = c * c;
       const flt_t sinsq = (flt_t)1.0 - cossq;
 
       const flt_t cos_2phi = cossq - sinsq;
       const flt_t sin_2phim = (flt_t)2.0 * c;
       const flt_t cos_3phi = (flt_t)2.0 * c * cos_2phi - c;
       const flt_t sin_3phim = (flt_t)2.0 * cos_2phi + (flt_t)1.0;
       const flt_t cos_4phi = (flt_t)2.0 * cos_2phi * cos_2phi - (flt_t)1.0;
       const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim;
 
       flt_t p, pd;
       p = fc.bp[type].k1*((flt_t)1.0 + c) +
           fc.bp[type].k2*((flt_t)1.0 - cos_2phi) +
           fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
           fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ;
       pd = fc.bp[type].k1 -
            (flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
            (flt_t)3.0 * fc.bp[type].k3 * sin_3phim -
            (flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
 
       flt_t edihed;
       if (EFLAG) edihed = p;
 
       // From here on c and s12 carry the derivative factor pd.
       const flt_t a = pd;
       c = c * a;
       s12 = s12 * a;
       const flt_t a11 = c*sb1*s1;
       const flt_t a22 = -sb2 * ((flt_t)2.0*c0*s12 - c*(s1+s2));
       const flt_t a33 = c*sb3*s2;
       const flt_t a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
       const flt_t a13 = -rb1*rb3*s12;
       const flt_t a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
 
       const flt_t sx2  = a12*vb1x - a22*vb2xm + a23*vb3x;
       const flt_t sy2  = a12*vb1y - a22*vb2ym + a23*vb3y;
       const flt_t sz2  = a12*vb1z - a22*vb2zm + a23*vb3z;
 
       // Forces on the four atoms; by construction f1 + f2 + f3 + f4 = 0.
       const flt_t f1x = a11*vb1x - a12*vb2xm + a13*vb3x;
       const flt_t f1y = a11*vb1y - a12*vb2ym + a13*vb3y;
       const flt_t f1z = a11*vb1z - a12*vb2zm + a13*vb3z;
 
       const flt_t f2x = -sx2 - f1x;
       const flt_t f2y = -sy2 - f1y;
       const flt_t f2z = -sz2 - f1z;
 
       const flt_t f4x = a13*vb1x - a23*vb2xm + a33*vb3x;
       const flt_t f4y = a13*vb1y - a23*vb2ym + a33*vb3y;
       const flt_t f4z = a13*vb1z - a23*vb2zm + a33*vb3z;
 
       const flt_t f3x = sx2 - f4x;
       const flt_t f3y = sy2 - f4y;
       const flt_t f3z = sz2 - f4z;
 
       // NOTE(review): IP_PRE_ev_tally_dihed is a macro defined elsewhere;
       // it presumably accumulates into sedihedral and sv0..sv5 -- confirm.
       if (EVFLAG) {
         IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x,
                               f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x,
                               vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y,
                               vb3z, sedihedral, f, NEWTON_BOND, nlocal,
                               sv0, sv1, sv2, sv3, sv4, sv5);
       }
 
       // Scatter forces into this thread's slice; without NEWTON_BOND only
       // atoms with index below nlocal receive a contribution.
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
           f[i2].y += f2y;
           f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
           f[i4].y += f4y;
           f[i4].z += f4z;
         }
       }
     } // for n
     // Fold this thread's partial sums into the reduction variables.
     if (EVFLAG) {
       if (EFLAG) oedihedral += sedihedral;
       if (vflag) {
         ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
       }
     }
   } // omp parallel
 
   // Add the reduced totals to the style's energy and virial members.
   if (EVFLAG) {
     if (EFLAG) energy += oedihedral;
     if (vflag) {
       virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
       virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
     }
   }
 
   // NOTE(review): presumably tells the fix that the per-thread force
   // slices still need to be reduced -- confirm against FixIntel.
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralOPLSIntel::init_style()
 {
   DihedralOPLS::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
                                          IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Copy the style's k1..k4 coefficients into the flt_t-typed table of fc.
   // The table has one entry per dihedral type plus one, and index 0 is
   // copied as well. The buffers argument is not referenced here.
   const int ntable = atom->ndihedraltypes + 1;
   fc.set_ntypes(ntable,memory);
 
   for (int itype = 0; itype < ntable; ++itype) {
     fc.bp[itype].k4 = k4[itype];
     fc.bp[itype].k3 = k3[itype];
     fc.bp[itype].k2 = k2[itype];
     fc.bp[itype].k1 = k1[itype];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the packed per-type table bp held by this ForceConst.
 //   nbondtypes  new length of bp (<= 0 means no table)
 //   memory      Memory object used for the new allocation
 // Nothing is reallocated when the size is unchanged. An existing table is
 // released through the stored _memory pointer, i.e. the one recorded by
 // the previous call.
 template <class flt_t>
 void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
                                                       Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(bp);
 
     if (nbondtypes > 0) {
       // _memory is only assigned at the bottom of this function, so it does
       // not yet refer to this call's Memory object (and, unless a
       // constructor not shown here sets it, is not valid at all on the
       // first call). Allocate through the argument, falling back to the
       // stored pointer when the caller passes a null pointer.
       Memory * const allocator = memory ? memory : _memory;
       // NOTE(review): the allocation label names the charmm style; it is
       // kept unchanged here.
       allocator->create(bp,nbondtypes,"dihedralcharmmintel.bp");
     }
   }
   _nbondtypes = nbondtypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/fix_npt_intel.cpp b/src/USER-INTEL/fix_npt_intel.cpp
index 2bfbd5e73..56df3bba4 100644
--- a/src/USER-INTEL/fix_npt_intel.cpp
+++ b/src/USER-INTEL/fix_npt_intel.cpp
@@ -1,68 +1,68 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#include "string.h"
+#include <string.h>
 #include "fix_npt_intel.h"
 #include "modify.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 /* ---------------------------------------------------------------------- */
 
 // Construct the fix on top of FixNHIntel. Both tstat_flag and pstat_flag
 // must be set after the base constructor has run; otherwise an error is
 // raised. Two computes are then registered with Modify: "<id>_temp"
 // (style temp) and "<id>_press" (style pressure), both on group all.
 FixNPTIntel::FixNPTIntel(LAMMPS *lmp, int narg, char **arg) :
   FixNHIntel(lmp, narg, arg)
 {
   if (!tstat_flag)
     error->all(FLERR,"Temperature control must be used with fix npt/intel");
   if (!pstat_flag)
     error->all(FLERR,"Pressure control must be used with fix npt/intel");
 
   // create a new compute temp style
   // id = fix-ID + temp
   // compute group = all since pressure is always global (group all)
   // and thus its KE/temperature contribution should use group all
 
   int n = strlen(id) + 6;            // "_temp" plus terminating NUL
   id_temp = new char[n];
   strcpy(id_temp,id);
   strcat(id_temp,"_temp");
 
   // the argument vector is only needed for the duration of the call,
   // so it lives on the stack
   char *temp_args[3];
   temp_args[0] = id_temp;
   temp_args[1] = (char *) "all";
   temp_args[2] = (char *) "temp";
 
   modify->add_compute(3,temp_args);
   tcomputeflag = 1;
 
   // create a new compute pressure style
   // id = fix-ID + press, compute group = all
   // pass id_temp as 4th arg to pressure constructor
 
   n = strlen(id) + 7;                // "_press" plus terminating NUL
   id_press = new char[n];
   strcpy(id_press,id);
   strcat(id_press,"_press");
 
   char *press_args[4];
   press_args[0] = id_press;
   press_args[1] = (char *) "all";
   press_args[2] = (char *) "pressure";
   press_args[3] = id_temp;
   modify->add_compute(4,press_args);
   pcomputeflag = 1;
 }
diff --git a/src/USER-INTEL/fix_nve_asphere_intel.cpp b/src/USER-INTEL/fix_nve_asphere_intel.cpp
index f43519663..656316545 100644
--- a/src/USER-INTEL/fix_nve_asphere_intel.cpp
+++ b/src/USER-INTEL/fix_nve_asphere_intel.cpp
@@ -1,238 +1,238 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
-#include "math.h"
-#include "stdio.h"
-#include "string.h"
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
 #include "fix_nve_asphere_intel.h"
 #include "math_extra_intel.h"
 #include "atom.h"
 #include "atom_vec_ellipsoid.h"
 #include "force.h"
 #include "neighbor.h"
 #include "update.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 #define INERTIA 0.2          // moment of inertia prefactor for ellipsoid
 
 /* ---------------------------------------------------------------------- */
 
 // Construct the fix with empty caches: the cached arrays and their size
 // bookkeeping all start at zero and are filled in by reset_dt().
 FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) :
   FixNVE(lmp, narg, arg)
 {
   // size bookkeeping
   _nlocal_max = 0;
   _nlocal3 = 0;
 
   // cached arrays
   _dtfm = 0;
   _inertia0 = 0;
   _inertia1 = 0;
   _inertia2 = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixNVEAsphereIntel::init()
 {
   avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid");
   if (!avec)
     error->all(FLERR,"Compute nve/asphere requires atom style ellipsoid");
 
   // check that all particles are finite-size ellipsoids
   // no point particles allowed, spherical is OK
 
   int *ellipsoid = atom->ellipsoid;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit)
       if (ellipsoid[i] < 0)
         error->one(FLERR,"Fix nve/asphere requires extended particles");
 
   FixNVE::init();
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Run the base-class setup, then rebuild the cached per-atom factors
 // through reset_dt().
 void FixNVEAsphereIntel::setup(int vflag)
 {
   FixNVE::setup(vflag);
 
   reset_dt();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixNVEAsphereIntel::initial_integrate(int vflag)
 {
   double dtfm;
   double inertia[3],omega[3];
   double *shape,*quat;
 
   AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   int *ellipsoid = atom->ellipsoid;
   double * _noalias const x = atom->x[0];
   double * _noalias const v = atom->v[0];
   const double * _noalias const f = atom->f[0];
   int *mask = atom->mask;
 
   double **angmom = atom->angmom;
   double **torque = atom->torque;
   double *rmass = atom->rmass;
   int nlocal = atom->nlocal;
   if (igroup == atom->firstgroup) nlocal = atom->nfirst;
 
   // set timestep here since dt may have changed or come via rRESPA
 
   dtq = 0.5 * dtv;
 
   #if defined(LMP_SIMD_COMPILER)
   #pragma vector aligned
   #pragma simd
   #endif
   for (int i = 0; i < _nlocal3; i++) {
     v[i] += _dtfm[i] * f[i];
     x[i] += dtv * v[i];
   }
 
   // update angular momentum by 1/2 step
   if (igroup == 0) {
     #if defined(LMP_SIMD_COMPILER)
     #pragma vector aligned
     #pragma simd
     #endif
     for (int i = 0; i < nlocal; i++) {
       double *quat = bonus[ellipsoid[i]].quat;
       ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
                           _inertia1[i], _inertia2[i]);
     }
   } else {
     #if defined(LMP_SIMD_COMPILER)
     #pragma vector aligned
     #pragma simd
     #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
 	double *quat = bonus[ellipsoid[i]].quat;
 	ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
 			    _inertia1[i], _inertia2[i]);
       }
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixNVEAsphereIntel::final_integrate()
 {
   if (neighbor->ago == 0) reset_dt();
 
   double dtfm;
 
   double * _noalias const v = atom->v[0];
   const double * _noalias const f = atom->f[0];
   double * _noalias const angmom = atom->angmom[0];
   const double * _noalias const torque = atom->torque[0];
 
   #if defined(LMP_SIMD_COMPILER)
   #pragma vector aligned
   #pragma simd
   #endif
   for (int i = 0; i < _nlocal3; i++) {
     v[i] += _dtfm[i] * f[i];
     angmom[i] += dtf * torque[i];
   }
 }
 
 // Refresh dtv and dtf from update->dt and rebuild the cached per-atom
 // arrays: _dtfm holds dtf / rmass three times per atom (0.0 for atoms
 // outside the fix group), and _inertia0/1/2 hold the reciprocals of
 // INERTIA * rmass * (sum of two squared shape entries), left at 0 when
 // that product is 0. The arrays are regrown when nlocal exceeds the
 // current capacity.
 void FixNVEAsphereIntel::reset_dt() {
   AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   int *ellipsoid = atom->ellipsoid;
 
   dtv = update->dt;
   dtf = 0.5 * update->dt * force->ftm2v;
 
   const int * const mask = atom->mask;
   const double * const rmass = atom->rmass;
 
   int natoms = atom->nlocal;
   if (igroup == atom->firstgroup) natoms = atom->nfirst;
   const int nlocal = natoms;
 
   // regrow the caches with 20% headroom once the atom count outgrows them
   if (nlocal > _nlocal_max) {
     if (_nlocal_max) {
       memory->destroy(_dtfm);
       memory->destroy(_inertia0);
       memory->destroy(_inertia1);
       memory->destroy(_inertia2);
     }
     _nlocal_max = static_cast<int>(1.20 * nlocal);
     memory->create(_dtfm, _nlocal_max * 3, "fix_nve_intel:dtfm");
     memory->create(_inertia0, _nlocal_max * 3, "fix_nve_intel:inertia0");
     memory->create(_inertia1, _nlocal_max * 3, "fix_nve_intel:inertia1");
     memory->create(_inertia2, _nlocal_max * 3, "fix_nve_intel:inertia2");
   }
 
   _nlocal3 = nlocal * 3;
 
   // igroup == 0 takes every atom without consulting the mask
   const bool every_atom = (igroup == 0);
 
   for (int i = 0; i < nlocal; i++) {
     const int n = 3 * i;
 
     if (!every_atom && !(mask[i] & groupbit)) {
       // atom outside the group: zero factor, inertia entries untouched
       _dtfm[n] = 0.0;
       _dtfm[n+1] = 0.0;
       _dtfm[n+2] = 0.0;
       continue;
     }
 
     _dtfm[n] = dtf / rmass[i];
     _dtfm[n+1] = dtf / rmass[i];
     _dtfm[n+2] = dtf / rmass[i];
 
     double *shape = bonus[ellipsoid[i]].shape;
     double inv;
 
     inv = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
     if (inv != 0.0) inv = 1.0 / inv;
     _inertia0[i] = inv;
 
     inv = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
     if (inv != 0.0) inv = 1.0 / inv;
     _inertia1[i] = inv;
 
     inv = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
     if (inv != 0.0) inv = 1.0 / inv;
     _inertia2[i] = inv;
   }
 }
 // Report the base-class usage plus the four cached arrays, each created
 // with 3 * _nlocal_max entries in reset_dt() (12 doubles per slot).
 double FixNVEAsphereIntel::memory_usage()
 {
   double bytes = FixNVE::memory_usage();
   bytes += _nlocal_max * 12 * sizeof(double);
   return bytes;
 }
 
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
index b9d60d8ea..4f34a484c 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@@ -1,556 +1,556 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Rodrigo Canales (RWTH Aachen University)
 ------------------------------------------------------------------------- */
 
-#include "math.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include "pair_buck_coul_cut_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "kspace.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 #include "suffix.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 #define C_CUT_T typename ForceConst<flt_t>::c_cut_t
 
 // Construct on top of PairBuckCoulCut and add Suffix::INTEL to the
 // style's suffix flags.
 PairBuckCoulCutIntel::PairBuckCoulCutIntel(LAMMPS *lmp)
   : PairBuckCoulCut(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 // Nothing to release in the destructor body itself.
 PairBuckCoulCutIntel::~PairBuckCoulCutIntel() {}
 
 // Entry point: pick the templated compute() that matches the precision
 // mode reported by the fix, then stamp the balance timer and clear
 // vflag_fdotr.
 void PairBuckCoulCutIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
     // float data, double accumulation
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   } else {
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
   }
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver: set up the energy/virial flags, repack the
 // atom data when the neighbor list is reused and the buffers are not
 // separate, then call eval<EVFLAG,EFLAG,NEWTON_PAIR>() twice per step:
 // first with offload = 1 on [0, offload_end), then with offload = 0 on
 // [host_start, inum).
 template <class flt_t, class acc_t>
 void PairBuckCoulCutIntel::compute(int eflag, int vflag,
                                    IntelBuffers<flt_t,acc_t> *buffers,
                                    const ForceConst<flt_t> &fc)
 {
   if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
   else ev_setup(eflag,vflag);
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
     #if defined(_OPENMP)
     #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 nthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // no tallying requested: plain force evaluation
   if (!(evflag || vflag_fdotr)) {
     if (force->newton_pair) {
       eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
       eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
     } else {
       eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
       eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
     }
     return;
   }
 
   // virial mode handed to eval: 2 when vflag_fdotr is set, else 1 when
   // vflag is set, else 0
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
 
   if (eflag && force->newton_pair) {
     eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
     eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
   } else if (eflag) {
     eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
     eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
   } else if (force->newton_pair) {
     eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
     eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
   } else {
     eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
     eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Evaluate Buckingham plus cutoff-Coulomb pair forces for the neighbor
 // list entries [astart, aend).
 //   EVFLAG       compile-time switch for energy/virial tallying
 //   EFLAG        compile-time switch for energy accumulation
 //   NEWTON_PAIR  when set, the reaction force is applied to every
 //                neighbor j; otherwise only to j < nlocal
 //   offload      forwarded to the buffer accessors and selects which
 //                timer is stopped at the end
 //   vflag        1 tallies the virial per pair; 2 triggers the
 //                IP_PRE_fdotr_acc_force pass
 //   buffers, fc  packed atom data and packed force constants
 // Results are handed to the fix through add_result_array().
 template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
                                 IntelBuffers<flt_t,acc_t> *buffers,
                                 const ForceConst<flt_t> &fc,
                                 const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
 
   // flattened views of the ntypes x ntypes coefficient tables
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
   const C_CUT_T * _noalias const c_cut = fc.c_cut[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
   const int ncoulmask = this->ncoulmask;
   const int ncoulshiftbits = this->ncoulshiftbits;
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy, c_cut:length(0) alloc_if(0) free_if(0))      \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EVFLAG) {
       oevdwl = oecoul = (acc_t)0;
       if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
     }
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel default(none)        \
       shared(f_start,f_stride,nlocal,nall,minlocal) \
       reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
       IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       // each thread accumulates into its own slice of f_start, zeroed here
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; ++i) {
         const int itype = x[i].w;
 
         // row of each coefficient table for atom type itype
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EVFLAG) {
           if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
         }
 
         // secoul is accumulated inside this loop exactly like sevdwl, so it
         // has to be listed in the reduction clause as well
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
           forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
 
           // the upper bits of the neighbor entry carry the special-bond
           // index, the masked part is the atom index
           const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r = sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           // Coulomb term: with INTEL_VMASK the cutoff is a branch, otherwise
           // the term is computed unconditionally and zeroed past the cutoff
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cut_coulsq) {
           #endif
             forcecoul = qqrd2e * qtmp*q[j]/r;
             if (EFLAG)
               ecoul = forcecoul;
             if (sbindex){
               const flt_t factor_coul = special_coul[sbindex];
               forcecoul *= factor_coul;
               if(EFLAG)
                 ecoul *= factor_coul;
 
             }
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq >= c_cuti[jtype].cut_coulsq)
             { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
           #endif
 
           // Buckingham term, same cutoff handling as above
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
             if (EFLAG)
               evdwl = rexp * c_energyi[jtype].a -
                 r6inv * c_energyi[jtype].c -
                 c_energyi[jtype].offset;
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
               if (EFLAG)
                 evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq >= c_cuti[jtype].cut_ljsq)
             { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cutsq) {
           #endif
             const flt_t fpair = (forcecoul + forcebuck) * r2inv;
             fxtmp += delx * fpair;
             fytmp += dely * fpair;
             fztmp += delz * fpair;
             if (NEWTON_PAIR || j < nlocal) {
               f[j].x -= delx * fpair;
               f[j].y -= dely * fpair;
               f[j].z -= delz * fpair;
             }
 
             if (EVFLAG) {
               // half weight for each of i and j that is counted
               flt_t ev_pre = (flt_t)0;
               if (NEWTON_PAIR || i < nlocal)
                 ev_pre += (flt_t)0.5;
               if (NEWTON_PAIR || j < nlocal)
                 ev_pre += (flt_t)0.5;
 
               if (EFLAG) {
                 sevdwl += ev_pre * evdwl;
                 secoul += ev_pre * ecoul;
                 if (eatom) {
                   if (NEWTON_PAIR || i < nlocal)
                     fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
                   if (NEWTON_PAIR || j < nlocal)
                     f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
                 }
               }
               IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
             }
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
 
         f[i].x += fxtmp;
         f[i].y += fytmp;
         f[i].z += fztmp;
 
         IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       // in offload builds this pass always runs; otherwise only for vflag 2
       #ifndef _LMP_INTEL_OFFLOAD
       if (vflag == 2)
       #endif
       {
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
         IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
                                nlocal, minlocal, nthreads, f_start, f_stride,
                                x, offload);
       }
     } // end of omp parallel region
     if (EVFLAG) {
       if (EFLAG) {
         ev_global[0] = oevdwl;
         ev_global[1] = oecoul;
       }
       if (vflag) {
         ev_global[2] = ov0;
         ev_global[3] = ov1;
         ev_global[4] = ov2;
         ev_global[5] = ov3;
         ev_global[6] = ov4;
         ev_global[7] = ov5;
       }
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EVFLAG)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulCutIntel::init_style()
 {
   PairBuckCoulCut::init_style();
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
   
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 
 }
 
 // Fill `fc` and the buffers' cutneighsq table from the pair coefficients:
 // size the tables, recompute cutsq/cutneighsq through init_one(), copy the
 // special-bond factors (entry 0 forced to 1.0), copy the per-type-pair
 // cutoffs, force and energy coefficients, and in offload builds push the
 // tables to the coprocessor.
 template <class flt_t, class acc_t>
 void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   int tp1 = atom->ntypes + 1;
 
   // ntable = 2^ncoultablebits
   int ntable = 1;
   if (ncoultablebits) {
     for (int bit = 0; bit < ncoultablebits; bit++) ntable *= 2;
   }
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       // skip pairs that are neither set directly nor mixable
       if (setflag[i][j] == 0 && (setflag[i][i] == 0 || setflag[j][j] == 0))
         continue;
 
       const double cut = init_one(i, j);
       const double cutneigh = cut + neighbor->skin;
       cutsq[j][i] = cut*cut;
       cutsq[i][j] = cutsq[j][i];
       cutneighsq[j][i] = cutneigh * cutneigh;
       cutneighsq[i][j] = cutneighsq[j][i];
     }
   }
 
   for (int k = 0; k < 4; k++) {
     fc.special_lj[k] = force->special_lj[k];
     fc.special_coul[k] = force->special_coul[k];
   }
   // entry 0 is always 1.0, whatever force-> holds
   fc.special_coul[0] = 1.0;
   fc.special_lj[0] = 1.0;
 
   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
       // cutoffs
       fc.c_cut[i][j].cutsq = cutsq[i][j];
       fc.c_cut[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_cut[i][j].cut_coulsq = cut_coulsq[i][j];
       // force coefficients
       fc.c_force[i][j].buck1 = buck1[i][j];
       fc.c_force[i][j].buck2 = buck2[i][j];
       fc.c_force[i][j].rhoinv = rhoinv[i][j];
       // energy coefficients
       fc.c_energy[i][j].a = a[i][j];
       fc.c_energy[i][j].c = c[i][j];
       fc.c_energy[i][j].offset = offset[i][j];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   C_CUT_T * c_cut = fc.c_cut[0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0))   \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the coefficient tables c_force, c_energy and c_cut to
 // ntypes x ntypes.
 //   ntypes  new table dimension; 0 releases the tables
 //   ntable  recorded in _ntable; a change also triggers reallocation
 //   memory  allocator for the new tables, remembered for the next call
 //   cop     coprocessor number for the new tables; negative skips the
 //           device-side allocation
 // In offload builds the matching device buffers are released and
 // allocated alongside the host tables.
 template <class flt_t>
 void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                           const int ntable,
                                                           Memory *memory,
                                                           const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       c_cut_t * oc_cut = c_cut[0];
 
       // Release on the device the buffers were allocated on: that is the
       // stored _cop, which is also what the guard tests. The `cop`
       // argument describes the new allocation and may differ.
       if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
           oc_energy != NULL && ospecial_coul != NULL &&
           _cop >= 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1))        \
           nocopy(oc_cut: alloc_if(0) free_if(1))
       }
       #endif
 
       // host tables go back through the allocator stored by the last call
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
       _memory->destroy(c_cut);
 
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
       memory->create(c_cut,ntypes,ntypes,"fc.c_cut");
 
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       c_cut_t * oc_cut = c_cut[0];
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
           oc_energy != NULL && ospecial_coul != NULL &&
           cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_cut: length(tp1sq) alloc_if(1) free_if(0))
 
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _ntable=ntable;
   _memory=memory;
 }
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
index 5450d6ff3..9319f531e 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@@ -1,657 +1,657 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Rodrigo Canales (RWTH Aachen University)
 ------------------------------------------------------------------------- */
 
-#include "math.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include "pair_buck_coul_long_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "kspace.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 #include "suffix.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // Shorthand for the parameter struct types nested in ForceConst<flt_t>.
 // Each expansion names flt_t, so these are only meaningful inside code
 // where flt_t is a type in scope (the templates below).
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 #define TABLE_T typename ForceConst<flt_t>::table_t
 
 // Build on the plain PairBuckCoulLong style and add the INTEL bit to the
 // style's suffix flags.
 PairBuckCoulLongIntel::PairBuckCoulLongIntel(LAMMPS *lmp)
   : PairBuckCoulLong(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 // The destructor body is empty; nothing is released explicitly here.
 PairBuckCoulLongIntel::~PairBuckCoulLongIntel()
 {
 }
 
 // Public entry point: pick the <flt_t,acc_t> instantiation of the templated
 // compute() that matches the precision mode reported by the intel fix,
 // then stamp the load balancer and clear vflag_fdotr.
 void PairBuckCoulLongIntel::compute(int eflag, int vflag)
 {
   const bool use_mixed = (fix->precision() == FixIntel::PREC_MODE_MIXED);
   // Only queried when the mixed test failed, as in an if / else-if chain.
   const bool use_double =
     !use_mixed && (fix->precision() == FixIntel::PREC_MODE_DOUBLE);
 
   if (use_mixed) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else if (use_double) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   } else {
     // Any other mode falls back to single precision.
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
   }
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver.
 //  1. Set up (or clear) the energy/virial flags.
 //  2. When _lrt is zero, this is not a reneighboring step (ago != 0) and the
 //     fix does not use separate buffers, pack atom data with all threads.
 //  3. Call eval<EVFLAG,EFLAG,NEWTON_PAIR> twice: first for the offload
 //     range [0, offload_end), then for the host range [host_start, inum).
 template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::compute(int eflag, int vflag,
                                     IntelBuffers<flt_t,acc_t> *buffers,
                                     const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag)
     ev_setup(eflag,vflag);
   else
     evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   const bool pack_now =
     (_lrt == 0) && (ago != 0) && (fix->separate_buffers() == 0);
   if (pack_now) {
     fix->start_watch(TIME_PACK);
     #if defined(_OPENMP)
     #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 nthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // Fast path: neither energy nor virial requested.
   if (!evflag && !vflag_fdotr) {
     if (force->newton_pair) {
       eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
       eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
     } else {
       eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
       eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
     }
     return;
   }
 
   // Virial mode handed to eval: 2 when vflag_fdotr is set, else 1 when
   // vflag is set, else 0.
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
 
   if (eflag) {
     if (force->newton_pair) {
       eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
       eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Force/energy kernel for atoms i in [astart, aend).
 //
 // Compile-time flags:
 //   EVFLAG      - nonzero compiles in the energy/virial accumulation paths
 //   EFLAG       - nonzero compiles in the energy accumulation
 //   NEWTON_PAIR - nonzero applies the reaction force (and energy share) to
 //                 every neighbor j; otherwise only to j < nlocal
 // Arguments:
 //   offload - nonzero selects the offload buffers and activates the
 //             "#pragma offload" region; zero runs the block on the host
 //   vflag   - virial mode: sv0..sv5 are zeroed per atom only for 1; in
 //             non-offload builds IP_PRE_fdotr_acc_force runs only for 2
 //   buffers - per-precision atom/force/neighbor buffers
 //   fc      - packed pair coefficients and Coulomb tables
 //   astart, aend - half-open range of atom indices handled by this call
 //
 // Results are handed to the fix through add_result_array().
 template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc,
                                  const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;  // nothing assigned to this side (host/offload)
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
 
   // Flat views of the ntypes x ntypes coefficient tables (row = itype).
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
   const flt_t * _noalias const rho_inv = fc.rho_inv[0];
   const TABLE_T * _noalias const table = fc.table;
   const flt_t * _noalias const etable = fc.etable;
   const flt_t * _noalias const detable = fc.detable;
   const flt_t * _noalias const ctable = fc.ctable;
   const flt_t * _noalias const dctable = fc.dctable;
   const flt_t g_ewald = fc.g_ewald;
   const flt_t tabinnersq = fc.tabinnersq;
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
   const int ncoultablebits = this->ncoultablebits;
   const int ncoulmask = this->ncoulmask;
   const int ncoulshiftbits = this->ncoulshiftbits;
   #ifdef INTEL_ALLOW_TABLE
   #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
                     in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
                     in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
   #else
   #define ITABLE_IN
   #endif
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
     in(rho_inv:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     ITABLE_IN signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     // Totals over all threads (OpenMP reduction targets).
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EVFLAG) {
       oevdwl = oecoul = (acc_t)0;
       if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
     }
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel default(none)        \
       shared(f_start,f_stride,nlocal,nall,minlocal) \
       reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
       IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       // Each thread writes into its own f_stride-sized slice of the force
       // buffer, zeroed here before accumulation.
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; ++i) {
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
 
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         // Per-atom accumulators, reduced across SIMD lanes below.
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EVFLAG) {
           if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
         }
 
         // secoul is accumulated in the loop body just like sevdwl, so it
         // has to appear in the reduction clause as well.
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
           forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
 
           // Neighbor entry packs the special-bond index in the bits above
           // SBBITS and the atom index in the NEIGHMASK bits.
           const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r2inv = (flt_t)1.0 / rsq;
           const flt_t r = (flt_t)1.0 / sqrt(r2inv);
 
           // ---- Coulomb part ----
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cutsq) {
           #endif
             #ifdef INTEL_ALLOW_TABLE
             if (!ncoultablebits || rsq <= tabinnersq) {
             #endif
               // Analytic branch: erfc below is a polynomial in t times
               // exp(-grij^2).
               const flt_t A1 =  0.254829592;
               const flt_t A2 = -0.284496736;
               const flt_t A3 =  1.421413741;
               const flt_t A4 = -1.453152027;
               const flt_t A5 =  1.061405429;
               const flt_t EWALD_F = 1.12837917;
               const flt_t INV_EWALD_P = 1.0 / 0.3275911;
 
               const flt_t grij = g_ewald * r;
               const flt_t expm2 = exp(-grij * grij);
               const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
               const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
               const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
               forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
               if (EFLAG) ecoul = prefactor * erfc;
 
               // Applied unconditionally: pack_force_const() stores 1.0 in
               // special_coul[0], so adjust is zero when sbindex == 0.
               const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
                 prefactor;
               forcecoul -= adjust;
               if (EFLAG) ecoul -= adjust;
 
             #ifdef INTEL_ALLOW_TABLE
             } else {
               // Tabulated branch: the table index is taken from the bit
               // pattern of the single-precision rsq.
               float rsq_lookup = rsq;
               const int itable = (__intel_castf32_u32(rsq_lookup) &
                                   ncoulmask) >> ncoulshiftbits;
               const flt_t fraction = (rsq_lookup - table[itable].r) *
                 table[itable].dr;
 
               const flt_t tablet = table[itable].f +
                 fraction * table[itable].df;
               forcecoul = qtmp * q[j] * tablet;
               if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
                                                 fraction * detable[itable]);
               if (sbindex) {
                 const flt_t table2 = ctable[itable] +
                   fraction * dctable[itable];
                 const flt_t prefactor = qtmp * q[j] * table2;
                 const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
                   prefactor;
                 forcecoul -= adjust;
                 if (EFLAG) ecoul -= adjust;
               }
             }
             #endif
             #ifdef INTEL_VMASK
           }
           #endif
 
           // ---- Buckingham part ----
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             flt_t rexp = exp(-r * rho_invi[jtype]);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
             if (EFLAG) evdwl = rexp * c_energyi[jtype].a -
                          r6inv * c_energyi[jtype].c -
                          c_energyi[jtype].offset;
 
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
           }
           #else
           // Without INTEL_VMASK everything was computed unconditionally;
           // discard the contributions that lie beyond the cutoffs.
           if (rsq > c_forcei[jtype].cutsq)
             { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
           if (rsq > c_forcei[jtype].cut_ljsq)
             { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
           // ---- accumulate force, energy and virial ----
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cutsq) {
           #endif
             const flt_t fpair = (forcecoul + forcebuck) * r2inv;
             fxtmp += delx * fpair;
             fytmp += dely * fpair;
             fztmp += delz * fpair;
             if (NEWTON_PAIR || j < nlocal) {
               f[j].x -= delx * fpair;
               f[j].y -= dely * fpair;
               f[j].z -= delz * fpair;
             }
 
             if (EVFLAG) {
               // Weight is 0.5 for each of i and j that this call tallies.
               flt_t ev_pre = (flt_t)0;
               if (NEWTON_PAIR || i < nlocal)
                 ev_pre += (flt_t)0.5;
               if (NEWTON_PAIR || j < nlocal)
                 ev_pre += (flt_t)0.5;
 
               if (EFLAG) {
                 sevdwl += ev_pre * evdwl;
                 secoul += ev_pre * ecoul;
                 if (eatom) {
                   // Per-atom energy: half of the pair energy to each atom.
                   if (NEWTON_PAIR || i < nlocal)
                     fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
                   if (NEWTON_PAIR || j < nlocal)
                     f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
                 }
               }
               IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
             }
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
 
         f[i].x += fxtmp;
         f[i].y += fytmp;
         f[i].z += fztmp;
         IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       // In non-offload builds this block runs only for vflag == 2; in
       // offload builds it always runs.
       #ifndef _LMP_INTEL_OFFLOAD
       if (vflag == 2)
       #endif
       {
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
         IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
                                nlocal, minlocal, nthreads, f_start, f_stride,
                                x, offload);
       }
     } // end of omp parallel region
     // Layout of ev_global: [0]=evdwl, [1]=ecoul, [2..7]=virial components.
     if (EVFLAG) {
       if (EFLAG) {
         ev_global[0] = oevdwl;
         ev_global[1] = oecoul;
       }
       if (vflag) {
         ev_global[2] = ov0;
         ev_global[3] = ov1;
         ev_global[4] = ov2;
         ev_global[5] = ov3;
         ev_global[6] = ov4;
         ev_global[7] = ov5;
       }
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EVFLAG)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulLongIntel::init_style()
 {
   PairBuckCoulLong::init_style();
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
   
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 
   _lrt = fix->lrt();
 }
 
 // Fill fc with the pair coefficients converted to flt_t: special-bond
 // factors, per-type-pair Buckingham parameters and, when Coulomb tables
 // are enabled (ncoultablebits != 0), the tabulated values. Also refreshes
 // cutsq and the buffers' cutneighsq. In offload builds with a valid
 // coprocessor the packed arrays are then copied to the device.
 template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   int tp1 = atom->ntypes + 1;
 
   // ntable = 2^ncoultablebits (stays 1 when tables are disabled)
   int ntable = 1;
   for (int bit = 0; bit < ncoultablebits; ++bit) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   for (int itype = 1; itype <= atom->ntypes; ++itype) {
     for (int jtype = itype; jtype <= atom->ntypes; ++jtype) {
       // Skip pairs that are neither set explicitly nor derivable from
       // both diagonal entries.
       if (setflag[itype][jtype] == 0 &&
           (setflag[itype][itype] == 0 || setflag[jtype][jtype] == 0))
         continue;
 
       const double cut = init_one(itype, jtype);
       const double cutneigh = cut + neighbor->skin;
       cutsq[jtype][itype] = cut*cut;
       cutsq[itype][jtype] = cutsq[jtype][itype];
       cutneighsq[jtype][itype] = cutneigh * cutneigh;
       cutneighsq[itype][jtype] = cutneighsq[jtype][itype];
     }
   }
 
   fc.g_ewald = force->kspace->g_ewald;
   fc.tabinnersq = tabinnersq;
 
   for (int k = 0; k < 4; ++k) {
     fc.special_lj[k] = force->special_lj[k];
     fc.special_coul[k] = force->special_coul[k];
   }
   // Entry 0 is always forced to 1.0 regardless of the values in force.
   fc.special_coul[0] = 1.0;
   fc.special_lj[0] = 1.0;
 
   for (int itype = 0; itype < tp1; ++itype) {
     for (int jtype = 0; jtype < tp1; ++jtype) {
       C_FORCE_T &force_ij = fc.c_force[itype][jtype];
       force_ij.cutsq = cutsq[itype][jtype];
       force_ij.cut_ljsq = cut_ljsq[itype][jtype];
       force_ij.buck1 = buck1[itype][jtype];
       force_ij.buck2 = buck2[itype][jtype];
 
       fc.rho_inv[itype][jtype] = rhoinv[itype][jtype];
 
       C_ENERGY_T &energy_ij = fc.c_energy[itype][jtype];
       energy_ij.a = a[itype][jtype];
       energy_ij.c = c[itype][jtype];
       energy_ij.offset = offset[itype][jtype];
       energy_ij.pad = rhoinv[itype][jtype];
     }
   }
 
   if (ncoultablebits) {
     for (int k = 0; k < ntable; ++k) {
       TABLE_T &entry = fc.table[k];
       entry.r = rtable[k];
       entry.dr = drtable[k];
       entry.f = ftable[k];
       entry.df = dftable[k];
       fc.etable[k] = etable[k];
       fc.detable[k] = detable[k];
       fc.ctable[k] = ctable[k];
       fc.dctable[k] = dctable[k];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   // Local aliases: the offload pragma below refers to these names.
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   TABLE_T * table = fc.table;
   flt_t * rho_inv = fc.rho_inv[0];
   flt_t * etable = fc.etable;
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
     in(rho_inv: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
     in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)size the coefficient storage held by this ForceConst.
 //
 //   ntypes - extent of each dimension of the square per-type-pair arrays
 //   ntable - number of entries in the Coulomb tables
 //   memory - LAMMPS allocator used for the new arrays (remembered in
 //            _memory and used for the matching destroy later)
 //   cop    - coprocessor number for the new allocation; negative means no
 //            device copies are created
 //
 // Nothing is reallocated when both sizes are unchanged. Otherwise any
 // existing arrays (and, in offload builds, their device copies) are
 // released first, then new ones are created if ntypes > 0.
 template <class flt_t>
 void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                           const int ntable,
                                                           Memory *memory,
                                                           const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       table_t * otable = table;
       flt_t * orho_inv = rho_inv[0];
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       if (ospecial_lj != NULL && oc_force != NULL && orho_inv != NULL &&
           oc_energy != NULL && otable != NULL && oetable != NULL &&
           odetable != NULL && octable != NULL && odctable != NULL &&
           ospecial_coul != NULL && _cop >= 0) {
         // Release on _cop, the device the old buffers were allocated on
         // (and the one the guard above checks), not on the incoming cop.
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
           nocopy(orho_inv: alloc_if(0) free_if(1)) \
           nocopy(otable: alloc_if(0) free_if(1)) \
           nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
 
       // Host arrays are released through the allocator remembered from the
       // previous call.
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
       _memory->destroy(table);
       _memory->destroy(rho_inv);
       _memory->destroy(etable);
       _memory->destroy(detable);
       _memory->destroy(ctable);
       _memory->destroy(dctable);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
       memory->create(rho_inv,ntypes,ntypes,"fc.rho_inv");
       memory->create(table,ntable,"pair:fc.table");
       memory->create(etable,ntable,"pair:fc.etable");
       memory->create(detable,ntable,"pair:fc.detable");
       memory->create(ctable,ntable,"pair:fc.ctable");
       memory->create(dctable,ntable,"pair:fc.dctable");
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       table_t * otable = table;
       flt_t * orho_inv = rho_inv[0];
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL && orho_inv != NULL &&
           oc_energy != NULL && otable !=NULL && oetable != NULL &&
           odetable != NULL && octable != NULL && odctable != NULL &&
           ospecial_coul != NULL && cop >= 0) {
         // Allocate device storage only (nocopy); pack_force_const()
         // transfers the contents afterwards.
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(orho_inv: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _ntable=ntable;
   _memory=memory;
 }
 
 
diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp
index 5411661c5..4815d1e02 100644
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@@ -1,498 +1,498 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Rodrigo Canales (RWTH Aachen University)
 ------------------------------------------------------------------------- */
 
-#include "math.h"
+#include <math.h>
 #include "pair_buck_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "math_const.h"
 #include "memory.h"
 #include "suffix.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // Shorthand for the parameter struct types nested in ForceConst<flt_t>.
 // Each expansion names flt_t, so these are only meaningful inside code
 // where flt_t is a type in scope (the templates below).
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 
 // Build on the plain PairBuck style and add the INTEL bit to the style's
 // suffix flags.
 PairBuckIntel::PairBuckIntel(LAMMPS *lmp)
   : PairBuck(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 // The destructor body is empty; nothing is released explicitly here.
 PairBuckIntel::~PairBuckIntel()
 {
 }
 
 // Public entry point: pick the <flt_t,acc_t> instantiation of the templated
 // compute() that matches the precision mode reported by the intel fix,
 // then stamp the load balancer and clear vflag_fdotr.
 void PairBuckIntel::compute(int eflag, int vflag)
 {
   const bool use_mixed = (fix->precision() == FixIntel::PREC_MODE_MIXED);
   // Only queried when the mixed test failed, as in an if / else-if chain.
   const bool use_double =
     !use_mixed && (fix->precision() == FixIntel::PREC_MODE_DOUBLE);
 
   if (use_mixed) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else if (use_double) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   } else {
     // Any other mode falls back to single precision.
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
   }
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver.
 //  1. Set up (or clear) the energy/virial flags.
 //  2. When this is not a reneighboring step (ago != 0) and the fix does
 //     not use separate buffers, pack atom data with all threads.
 //  3. Call eval<EVFLAG,EFLAG,NEWTON_PAIR> twice: first for the offload
 //     range [0, offload_end), then for the host range [host_start, inum).
 template <class flt_t, class acc_t>
 void PairBuckIntel::compute(int eflag, int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag)
     ev_setup(eflag,vflag);
   else
     evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   const bool pack_now = (ago != 0) && (fix->separate_buffers() == 0);
   if (pack_now) {
     fix->start_watch(TIME_PACK);
     #if defined(_OPENMP)
     #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 nthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // Fast path: neither energy nor virial requested.
   if (!evflag && !vflag_fdotr) {
     if (force->newton_pair) {
       eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
       eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
     } else {
       eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
       eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
     }
     return;
   }
 
   // Virial mode handed to eval: 2 when vflag_fdotr is set, else 1 when
   // vflag is set, else 0.
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
 
   if (eflag) {
     if (force->newton_pair) {
       eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
       eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckIntel::eval(const int offload, const int vflag,
 				     IntelBuffers<flt_t,acc_t> *buffers,
 				     const ForceConst<flt_t> &fc,
 				     const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_lj = fc.special_lj;
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
  
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0))      \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
 			      f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EVFLAG) {
       oevdwl =  (acc_t)0;
       if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
     }
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel default(none)        \
       shared(f_start,f_stride,nlocal,nall,minlocal)	\
       reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
       IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; ++i) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
 	acc_t sevdwl,  sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EVFLAG) {
           if (EFLAG) fwtmp = sevdwl =  (acc_t)0;
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
         }
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
 	                       sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           
           flt_t  forcebuck, evdwl;
           forcebuck = evdwl =  (flt_t)0.0;
 
           const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r = sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
 	  
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cutsq) {
           #endif
             const flt_t r6inv = r2inv * r2inv * r2inv;
             const flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
 
             #ifndef INTEL_VMASK
             if (rsq > c_forcei[jtype].cutsq)
               forcebuck =(flt_t)0.0;
             #endif 
             if (EFLAG) {
               evdwl = rexp * c_energyi[jtype].a -
                 r6inv * c_energyi[jtype].c -
                 c_energyi[jtype].offset;
 
               #ifndef INTEL_VMASK
               if (rsq > c_forcei[jtype].cutsq)
                 evdwl =(flt_t)0.0;
               #endif
             }
 
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
               if (EFLAG) 
                 evdwl *= factor_lj;
             }
             const flt_t fpair =  forcebuck * r2inv;
             fxtmp += delx * fpair;
             fytmp += dely * fpair;
             fztmp += delz * fpair;
             if (NEWTON_PAIR || j < nlocal) {
               f[j].x -= delx * fpair;
               f[j].y -= dely * fpair;
               f[j].z -= delz * fpair;
             }
             
             if (EVFLAG) {
               flt_t ev_pre = (flt_t)0;
               if (NEWTON_PAIR || i < nlocal)
                 ev_pre += (flt_t)0.5;
               if (NEWTON_PAIR || j < nlocal)
                 ev_pre += (flt_t)0.5;
               
               if (EFLAG) {
                 sevdwl += ev_pre * evdwl;
                 if (eatom) {
                   if (NEWTON_PAIR || i < nlocal)
                     fwtmp += (flt_t)0.5 * evdwl;
                   if (NEWTON_PAIR || j < nlocal) 
                     f[j].w += (flt_t)0.5 * evdwl;
                 }
               }
               IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
             }
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
 
         f[i].x += fxtmp;
         f[i].y += fytmp;
         f[i].z += fztmp;
         IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       #ifndef _LMP_INTEL_OFFLOAD
       if (vflag == 2)
       #endif
       {
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
         IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
 			       nlocal, minlocal, nthreads, f_start, f_stride, 
 			       x, offload);
       }
     } // end of omp parallel region
     if (EVFLAG) {
       if (EFLAG) {
         ev_global[0] = oevdwl;
         ev_global[1] = (acc_t)0;
       }
       if (vflag) {
         ev_global[2] = ov0;
         ev_global[3] = ov1;
         ev_global[4] = ov2;
         ev_global[5] = ov3;
         ev_global[6] = ov4;
         ev_global[7] = ov5;
       }
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EVFLAG)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 void PairBuckIntel::init_style()
 {
   PairBuck::init_style();
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
   
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ----------------------------------------------------------------------
    Fill the constant tables in fc, and the neighbor cutoffs held by buffers,
    from the current pair coefficients.  In offload builds with _cop >= 0 the
    tables are then transferred to coprocessor _cop.
 
    fc      - force-constant container for precision flt_t (resized here)
    buffers - buffers whose cutneighsq table is refreshed
 ------------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
 void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   // tables are indexed directly by atom type, so they need ntypes+1 slots
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   // Slot 0 is always 1.0; only slots 1-3 are taken from force->special_lj.
   // (The assignment of slot 0 used to be repeated on every loop iteration.)
   fc.special_lj[0] = 1.0;
   for (int i = 1; i < 4; i++)
     fc.special_lj[i] = force->special_lj[i];
 
   // split the coefficients into a force table and an energy table
   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
       fc.c_force[i][j].buck1 = buck1[i][j];
       fc.c_force[i][j].buck2 = buck2[i][j];
       fc.c_force[i][j].rhoinv = rhoinv[i][j];
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_energy[i][j].a = a[i][j];
       fc.c_energy[i][j].c = c[i][j];
       fc.c_energy[i][j].offset = offset[i][j];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   // plain pointers to the first element of each table for the transfer
   flt_t * special_lj = fc.special_lj;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0))   \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Resize the per-type-pair tables c_force and c_energy to ntypes x ntypes.
    Nothing is reallocated when ntypes equals the stored _ntypes.  In offload
    builds the matching coprocessor storage is released / allocated as well.
 
    ntypes - new table dimension (0 only releases the existing tables)
    memory - allocator used for the new tables; remembered for later release
    cop    - coprocessor number used for the new device allocation
 ------------------------------------------------------------------------- */
 template <class flt_t>
 void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, 
                                                   Memory *memory,
                                                   const int cop) {
   if ( (ntypes != _ntypes ) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
 
       if (ospecial_lj != NULL && oc_force != NULL && 
           oc_energy != NULL  && 
           _cop >= 0) {
         // Release on _cop, the device recorded when these tables were
         // allocated (and the one the guard above tests), rather than on
         // the incoming cop argument.
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1))
       }
       #endif
 
       // host tables are released with the allocator stored by the
       // previous call
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
 
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
 
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL && 
           oc_energy != NULL &&  
           cop >= 0) {
         // allocate (without copying) device storage for the new tables
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0))
 
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _memory=memory;
 }
 
 
diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp
index 0c07be463..88354ec4d 100644
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@@ -1,1504 +1,1504 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Markus Höhnerbach (RWTH)
 ------------------------------------------------------------------------- */
 
-#include "math.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include "pair_tersoff_intel.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "force.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
 
 // Currently Intel compiler is required for this pair style.
 // For convenience, base class routines are called if not using Intel compiler.
 #ifndef __INTEL_COMPILER
 using namespace LAMMPS_NS;
 
 // Fallback constructor used when __INTEL_COMPILER is not defined: it only
 // forwards lmp to the PairTersoff base class.
 PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
 {
 }
 
 // Fallback compute used when __INTEL_COMPILER is not defined: delegates
 // unchanged to the base-class PairTersoff::compute().
 void PairTersoffIntel::compute(int eflag, int vflag)
 {
   PairTersoff::compute(eflag, vflag);
 }
 
 void PairTersoffIntel::init_style()
 {
   if (comm->me == 0) {
     error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
 		   "Using MANYBODY version.");
   }
   PairTersoff::init_style();
 }
 
 #else
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
 
 #include "intel_intrinsics.h"
 #include "math_const.h"
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
 #include "group.h"
 #include "kspace.h"
 #include "modify.h"
 #include "suffix.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 /* ---------------------------------------------------------------------- */
 
 // Intel-compiler constructor: clears respa_enable and adds the INTEL bit
 // to suffix_flag on top of the PairTersoff base construction.
 PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
 {
   respa_enable = 0;
   suffix_flag |= Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Dispatch the requested precision
 void PairTersoffIntel::compute(int eflag, int vflag)
 {
   // Pick the <flt_t,acc_t> instantiation from the precision mode reported
   // by the fix; anything that is neither MIXED nor DOUBLE runs in single.
   const bool use_mixed = (fix->precision() == FixIntel::PREC_MODE_MIXED);
   const bool use_double =
     !use_mixed && (fix->precision() == FixIntel::PREC_MODE_DOUBLE);
 
   if (use_mixed)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (use_double)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Dispatch the extent of computation:
 //  do we need to calculate energy/virial
 template <class flt_t, class acc_t>
 void PairTersoffIntel::compute(int eflag, int vflag,
                                IntelBuffers<flt_t,acc_t> *buffers,
                                const ForceConst<flt_t> &fc)
 {
   // Set up the energy/virial bookkeeping, or clear it when neither is asked
   if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
   else ev_setup(eflag,vflag);
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   // Threaded pack of local + ghost atoms into the buffers; skipped when
   // ago == 0 or when the fix reports separate buffers
   const bool pack_here = (ago != 0) && (fix->separate_buffers() == 0);
   if (pack_here) {
     fix->start_watch(TIME_PACK);
     #if defined(_OPENMP)
     #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
                                 nthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // Every variant below calls eval() twice: first with offload = 1 for
   // atoms [0, offload_end), then with offload = 0 for [host_start, inum).
 
   // No energy/virial requested: cheapest instantiation
   if (!(evflag || vflag_fdotr)) {
     eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
     eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
     return;
   }
 
   // vflag value handed to eval(): 2 when vflag_fdotr is set, else 1 when
   // vflag is set, else 0
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
 
   if (eflag) {
     eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
     eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
   } else {
     eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
     eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
   }
 }
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push, target(mic))
 #endif
 
 // The complete Tersoff computation kernel is encapsulated here
 //  everything is static, the class just serves as a unit of organization
 // Template parameters:
 //   flt_t  - floating-point type of the packed force constants
 //            (ForceConst<flt_t>) and of the vector routines
 //   acc_t  - type of the scalar accumulators passed to kernel()
 //   mic    - lmp_intel::CalculationMode selecting the vector_routines
 //            implementation this struct derives from
 //   pack_i - algorithm selector; presumably chooses between the
 //            kernel_step (variable i) and kernel_step_const_i (fixed i)
 //            paths -- confirm against the definitions
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> {
   // instantiate the vector library and import the types
   typedef typename lmp_intel::vector_routines<flt_t, acc_t, mic> v;
   typedef typename v::fvec fvec;
   typedef typename v::ivec ivec;
   typedef typename v::bvec bvec;
   typedef typename v::avec avec;
   typedef typename v::iarr iarr;
   typedef typename v::farr farr;
   typedef typename v::aarr aarr;
   // parameter record types of the c_inner / c_outer tables in ForceConst
   typedef typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t c_inner_t;
   typedef typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t c_outer_t;
 
   // for descriptions of these methods, please have a look at the original code
   // what's done in here is that they are inlined and vectorized
   // attractive() also provides an option to compute zeta as well
   static fvec zeta_vector(
       const c_inner_t * param, 
       ivec xjw, bvec mask, 
       fvec vrij, fvec rsq2, 
       fvec vdijx, fvec vdijy, fvec vdijz, 
       fvec dikx, fvec diky, fvec dikz
   );
   // results are returned through the vfpair, vprefactor and vevdwl pointers
   static void force_zeta_vector(
       const c_outer_t * param, 
       ivec xjw,
       bvec mask,
       fvec vrijsq, fvec vzeta_ij,
       fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
       bvec vmask_repulsive
   );
   // ZETA presumably selects whether *zeta is computed as well (see the note
   // above about attractive()) -- confirm against the definition
   template<bool ZETA>
   static void attractive_vector(
       const c_inner_t * param,
       ivec xjw,
       bvec mask,
       fvec vprefactor,
       fvec vrijsq, fvec rsq2,
       fvec vdijx, fvec vdijy, fvec vdijz,
       fvec dikx, fvec diky, fvec dikz,
       fvec *fix, fvec *fiy, fvec *fiz,
       fvec *fjx, fvec *fjy, fvec *fjz,
       fvec *fkx, fvec *fky, fvec *fkz,
       fvec *zeta
   );
 
   // perform the actual computation
   // NOTE: the first two parameters are (iito, iifrom), in that order; the
   //  ARGS macro in PairTersoffIntel::eval passes them the same way.
   // NOTE(review): the template parameters here are bool, while the
   //  kernel_step variants below declare them as int.
   template<bool EVFLAG, bool EFLAG>
   static void kernel(
       int iito, int iifrom, int eatom, int vflag, 
       const int * _noalias const numneigh,
       const int * _noalias const numneighhalf,
       const int * _noalias const cnumneigh, 
       const int * _noalias const firstneigh, int ntypes, 
       typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
       const c_inner_t * _noalias const c_inner, 
       const c_outer_t * _noalias const c_outer, 
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
       acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
   );
 
   // perform one step of calculation, pass in i-j pairs of atoms (is, js)
   template<int EVFLAG, int EFLAG>
   static void kernel_step(
       int eatom, int vflag, 
       const int * _noalias const numneigh,
       const int * _noalias const cnumneigh, 
       const int * _noalias const firstneigh, 
       int ntypes,
       typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
       const c_inner_t * _noalias const c_inner, 
       const c_outer_t * _noalias const c_outer, 
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
       avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5,
       int compress_idx, iarr is, iarr js, bvec vmask_repulsive
   );
 
   // perform one step of calculation, as opposed to the previous method now
   //  with fixed i and a number of js
   template<int EVFLAG, int EFLAG>
   static void kernel_step_const_i(
     int eatom, int vflag, 
     const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
     const int * _noalias const firstneigh, int ntypes, 
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
     const c_inner_t * _noalias const c_inner, 
     const c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
     int compress_idx, int i, iarr js, bvec vmask_repulsive
   );
 };
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 // Dispatch to the correct kernel instantiation and perform all the work
 //  necessary for offloading. In this routine we enter the Phi.
 // This method is nearly identical to what happens in the other /intel styles
 // Evaluate the atoms with list indices [astart, aend), inside the offload
 // region when offload is nonzero and on the host otherwise.
 //   EVFLAG, EFLAG - compile-time switches gating the energy/virial
 //                   accumulation and the writes to ev_global
 //   NEWTON_PAIR   - forwarded to the IP_PRE_* helper macros
 //   offload       - selects which buffers are fetched and which timer is
 //                   stopped
 //   vflag         - 1 adds the per-thread virial sums to the reduction;
 //                   2 enables the IP_PRE_fdotr_acc_force step in builds
 //                   without _LMP_INTEL_OFFLOAD
 //   buffers, fc   - packed atom data and packed force constants
 // The forces (and ev_global when EVFLAG is set) are handed to
 // fix->add_result_array() at the end.
 template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairTersoffIntel::eval(const int offload, const int vflag,
 				     IntelBuffers<flt_t,acc_t> *buffers,
 				     const ForceConst<flt_t> &fc,
 				     const int astart, const int aend)
 {
   const int inum = aend - astart;
   // empty range: nothing to evaluate
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   // NOTE(review): tag and q are not referenced by name again in this
   // function -- confirm whether one of the IP_PRE_* macros needs them
   ATOM_T * _noalias const x = buffers->get_x(offload);
   tagint * _noalias tag = this->atom->tag;
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   // flat pointers to the first element of the packed parameter tables
   typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
   typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
   typedef typename ForceConst<flt_t>::c_cutoff_t c_cutoff_t;
   const c_outer_t * _noalias const c_outer = fc.c_outer[0];
   const c_inner_t * _noalias const c_inner = fc.c_inner[0][0];
   const c_cutoff_t * _noalias const c_inner_cutoff = fc.c_cutoff_inner[0][0];
 
   // table dimension: number of atom types plus one
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
 
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(c_inner, c_outer :length(0) alloc_if(0) free_if(0)) \
     in(c_inner_cutoff :length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #ifdef _LMP_INTEL_OFFLOAD
     #ifdef __MIC__
     *timer_compute = MIC_Wtime();
     #endif
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
 			       f_stride, x, 0);
 
     // totals filled by the OpenMP reduction below
     // NOTE(review): oecoul is zeroed and reduced but never added to or read
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EVFLAG) {
       oevdwl = oecoul = (acc_t)0;
       if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
     }
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel default(none) \
       shared(f_start,f_stride,nlocal,nall,minlocal)	\
       reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
       IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
       // shift the thread-local range so it starts at astart
       iifrom += astart;
       iito += astart;
 
       // each thread writes to its own f_stride-long slice of the force
       // buffer, zeroed here before use
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       {
         // per-thread sums written by the kernel through the pointers in ARGS
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
         sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
         #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
           firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
         // Pick the variable i algorithm under specific conditions
         // do use scalar algorithm with very short vectors
         // (VL < 4: scalar kernel; VL >= 8 with integer/gather support:
         //  pack_i kernel; otherwise the vector kernel without pack_i)
         int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
         bool pack_i = VL >= 8 && 
           lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
         bool use_scalar = VL < 4;
         if (use_scalar) {
           IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
         } else if (pack_i) {
           IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
         } else {
           IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
         }
 	if (EVFLAG) {
           if (EFLAG) oevdwl += sevdwl;
           if (vflag == 1) {
             ov0 += sv0;
             ov1 += sv1;
             ov2 += sv2;
             ov3 += sv3;
             ov4 += sv4;
             ov5 += sv5;
           }
         }
       }
 
       // without _LMP_INTEL_OFFLOAD this step only runs for vflag == 2;
       // with it, the block below always runs
       #ifndef _LMP_INTEL_OFFLOAD
       if (vflag == 2)
       #endif
       {
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
         IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
 	  		       nlocal, minlocal, nthreads, f_start, f_stride, 
                                x, offload);
       }
     } // end of omp parallel region
     // ev_global layout: [0] = evdwl, [1] = 0.0, [2..7] = ov0..ov5
     if (EVFLAG) {
       if (EFLAG) {
         ev_global[0] = oevdwl;
         ev_global[1] = 0.0;
       }
       if (vflag) {
         ev_global[2] = ov0;
         ev_global[3] = ov1;
         ev_global[4] = ov2;
         ev_global[5] = ov3;
         ev_global[6] = ov4;
         ev_global[7] = ov5;
       }
     }
 
     #ifdef _LMP_INTEL_OFFLOAD
     #ifdef __MIC__
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EVFLAG)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 // As in any other /intel pair style
 void PairTersoffIntel::init_style()
 {
   if (atom->tag_enable == 0)
     error->all(FLERR,"Pair style Tersoff requires atom IDs");
   if (force->newton_pair == 0)
     error->all(FLERR,"Pair style Tersoff requires newton pair on");
 
   // need a full neighbor list
 
   int irequest = neighbor->request(this);
   neighbor->requests[irequest]->half = 0;
   neighbor->requests[irequest]->full = 1;
   neighbor->requests[irequest]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
   
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     pack_force_const(force_const_single, fix->get_mixed_buffers());
     fix->get_mixed_buffers()->need_tag(1);
   } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     fix->get_double_buffers()->need_tag(1);
     pack_force_const(force_const_double, fix->get_double_buffers());
   } else {
     pack_force_const(force_const_single, fix->get_single_buffers());
     fix->get_single_buffers()->need_tag(1);
   }
 }
 
 // As in any other /intel pair style
 // Copy the Tersoff parameters into the flt_t tables of fc and refresh the
 // neighbor cutoffs held by buffers; in offload builds with _cop >= 0 the
 // tables are then transferred to coprocessor _cop.
 //   fc      - force-constant container (resized to ntypes+1 here)
 //   buffers - buffers whose cutneighsq table is refreshed
 template <class flt_t, class acc_t>
 void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   // tables are indexed directly by atom type, hence ntypes+1 slots
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   for (int i = 1; i < tp1; i++) {
     for (int j = 1; j < tp1; j++) {
       // entries with one zero index only get d2 = 1.0
       // NOTE(review): presumably keeps a later division by d2 well defined
       // for the unused type-0 slots -- confirm against the kernel
       fc.c_inner_loop[i][j][0].d2 = 1.0;
       fc.c_inner_loop[i][0][j].d2 = 1.0;
       fc.c_inner_loop[0][i][j].d2 = 1.0;
       for (int k = 1; k < tp1; k++) {
         // three-body parameters for the type triple (i,j,k)
         Param * param = &params[elem2param[map[i]][map[j]][map[k]]];
         // note the [i][k][j] index order here, unlike the other tables
         fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
 	fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
         fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
         fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
         fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
         fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
         fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
         fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
         fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);  
 
         // c_inner holds the same values plus cutsq in a single record
         fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
 	fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
         fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
         fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
         fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
         fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
         fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
         fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
         fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);  
  
       }
       // pair tables use the parameter set of the triple (i,j,j)
       Param * param = &params[elem2param[map[i]][map[j]][map[j]]];
       fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
       fc.c_first_loop[i][j].bigr = static_cast<flt_t>(param->bigr);
       fc.c_first_loop[i][j].bigd = static_cast<flt_t>(param->bigd);
       fc.c_first_loop[i][j].lam1 = static_cast<flt_t>(param->lam1);
       fc.c_first_loop[i][j].biga = static_cast<flt_t>(param->biga);
       fc.c_second_loop[i][j].lam2 = static_cast<flt_t>(param->lam2);
       fc.c_second_loop[i][j].beta = static_cast<flt_t>(param->beta);
       fc.c_second_loop[i][j].bigb = static_cast<flt_t>(param->bigb);
       fc.c_second_loop[i][j].powern = static_cast<flt_t>(param->powern);
       fc.c_second_loop[i][j].c1 = static_cast<flt_t>(param->c1);
       fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
       fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
       fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
      
       // c_outer combines the c_first_loop and c_second_loop fields with cutsq
       fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
       fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
       fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
       fc.c_outer[i][j].lam1 = static_cast<flt_t>(param->lam1);
       fc.c_outer[i][j].biga = static_cast<flt_t>(param->biga);
       fc.c_outer[i][j].lam2 = static_cast<flt_t>(param->lam2);
       fc.c_outer[i][j].beta = static_cast<flt_t>(param->beta);
       fc.c_outer[i][j].bigb = static_cast<flt_t>(param->bigb);
       fc.c_outer[i][j].powern = static_cast<flt_t>(param->powern);
       fc.c_outer[i][j].c1 = static_cast<flt_t>(param->c1);
       fc.c_outer[i][j].c2 = static_cast<flt_t>(param->c2);
       fc.c_outer[i][j].c3 = static_cast<flt_t>(param->c3);
       fc.c_outer[i][j].c4 = static_cast<flt_t>(param->c4);
 
     }
     // zero cutoff for the (i, type 0) pair entry
     fc.c_outer[i][0].cutsq = 0.;
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   // flat pointers to the first element of each table for the transfer
   // NOTE(review): c_inner_loop is declared here but does not appear in the
   // offload_transfer clause below -- confirm that this is intended
   typename ForceConst<flt_t>::c_first_loop_t * c_first_loop = fc.c_first_loop[0];
   typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_outer = fc.c_cutoff_outer[0];
   typename ForceConst<flt_t>::c_outer_t * c_outer = fc.c_outer[0];
   typename ForceConst<flt_t>::c_second_loop_t * c_second_loop = fc.c_second_loop[0];
   typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
   typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
   typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
   flt_t * ocutneighsq = cutneighsq[0];
   // VL = number of flt_t values in 512 bits; ntypes_pad is ntypes raised to
   // the next multiple of VL (a full VL is added when already a multiple)
   size_t VL = 512 / 8 / sizeof(flt_t);
   int ntypes = tp1;
   int ntypes_pad = ntypes + VL - ntypes % VL;
   int tp1sq = tp1 * tp1;
   int tp1cb = tp1 * tp1 * tp1;
   int tp1cb_pad = tp1 * tp1 * ntypes_pad;
   #pragma offload_transfer target(mic:_cop) \
     in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
     in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
     in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // As in any other /intel pair style:
 // (re)allocates the per-type coefficient tables whenever the number of atom
 // types changes and, in offload builds, mirrors the allocation on the
 // coprocessor.
 //
 //   ntypes  number of types to size the tables for; a value <= 0 only
 //           releases the current tables
 //   memory  allocator used to create the new tables; it is remembered in
 //           _memory and used to destroy them on the next resize
 //   cop     coprocessor index for the new device allocation; nothing is
 //           allocated on a device when it is negative
 template <class flt_t>
 void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                      Memory *memory,
                                                      const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       // Only touch the row pointers once the tables are known to exist.
       if (c_first_loop != NULL && c_second_loop != NULL &&
           c_inner_loop != NULL && _cop >= 0) {
         c_first_loop_t * oc_first_loop = c_first_loop[0];
         c_second_loop_t * oc_second_loop = c_second_loop[0];
         c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
         c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
         c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
         c_inner_t * oc_inner = c_inner[0][0];
         c_outer_t * oc_outer = c_outer[0];
 
         // Release every device buffer on the device it was allocated on
         // (_cop, recorded at allocation time), including oc_inner/oc_outer.
         #pragma offload_transfer target(mic:_cop) \
           nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
           nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
           nocopy(oc_inner, oc_outer: alloc_if(0) free_if(1))
       }
       #endif
       // Host copies are released through the allocator that created them.
       _memory->destroy(c_first_loop);
       _memory->destroy(c_second_loop);
       _memory->destroy(c_inner_loop);
       _memory->destroy(c_cutoff_outer);
       _memory->destroy(c_cutoff_inner);
       _memory->destroy(c_inner);
       _memory->destroy(c_outer);
     }
     if (ntypes > 0) {
       _cop = cop;
       // The innermost dimension of c_cutoff_inner is padded: VL is the number
       // of flt_t values in 512 bits, and ntypes is rounded up past the next
       // multiple of VL (a full extra VL is added when ntypes is already a
       // multiple).
       size_t VL = 512 / 8 / sizeof(flt_t);
       int ntypes_pad = ntypes + VL - ntypes % VL;
       memory->create(c_first_loop,ntypes,ntypes,"fc.c_first_loop");
       memory->create(c_second_loop,ntypes,ntypes,"fc.c_second_loop");
       memory->create(c_cutoff_outer,ntypes,ntypes,"fc.c_cutoff_outer");
       memory->create(c_inner_loop,ntypes,ntypes,ntypes,"fc.c_inner_loop");
       memory->create(c_cutoff_inner,ntypes,ntypes,ntypes_pad,"fc.c_cutoff_inner");
       memory->create(c_inner,ntypes,ntypes,ntypes,"fc.c_inner");
       memory->create(c_outer,ntypes,ntypes,"fc.c_outer");
       #ifdef _LMP_INTEL_OFFLOAD
       c_first_loop_t * oc_first_loop = c_first_loop[0];
       c_second_loop_t * oc_second_loop = c_second_loop[0];
       c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
       c_inner_loop_t * oc_inner_loop = c_inner_loop[0][0];
       c_cutoff_t * oc_cutoff_inner = c_cutoff_inner[0][0];
       c_inner_t * oc_inner = c_inner[0][0];
       c_outer_t * oc_outer = c_outer[0];
       int tp1sq = ntypes * ntypes;
       int tp1cb = ntypes * ntypes * ntypes;
       int tp1cb_pad = ntypes * ntypes * ntypes_pad;
       if (oc_first_loop != NULL && oc_second_loop != NULL &&
           oc_inner_loop != NULL && cop >= 0) {
         // Allocate (without copying) matching buffers on the coprocessor.
         #pragma offload_transfer target(mic:cop) \
           nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_second_loop: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_cutoff_outer: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_outer: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_inner_loop: length(tp1cb) alloc_if(1) free_if(0)) \
           nocopy(oc_inner: length(tp1cb) alloc_if(1) free_if(0)) \
           nocopy(oc_cutoff_inner: length(tp1cb_pad) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _memory=memory;
 }
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
 
 // Capacity of the per-call force cache used by kernel_step and
 // kernel_step_const_i: at most N_CACHE k-neighbor force contributions are
 // stored while zeta is accumulated; any further ones are recomputed in a
 // second pass once the prefactor is known.
 static const int N_CACHE = 8;
 
 // Vector kernel for one batch of (i,j) pairs in which every lane may have a
 // different central atom i.
 //
 //   eatom, vflag      runtime switches: per-atom energy tally (written to the
 //                     .w component of f) and virial tally (only vflag == 1)
 //   numneigh          neighbor count per atom
 //   cnumneigh         offset of each atom's neighbor list inside firstneigh
 //   firstneigh        flat neighbor array; entries are masked with NEIGHMASK
 //   ntypes            row length used to index c_inner / c_outer by type
 //   x                 atom positions (x,y,z) and type (w)
 //   c_inner, c_outer  three-body and two-body parameter tables
 //   f                 force accumulator, updated for atoms i, j and k
 //   vsevdwl           per-lane energy accumulator
 //   vsv0..vsv5        per-lane accumulators for the six virial components
 //   compress_idx      number of valid entries in is/js
 //   is, js            atom indices of the pairs in this batch
 //   vmask_repulsive   forwarded unchanged to force_zeta_vector
 //
 // EVFLAG / EFLAG are compile-time switches guarding the energy/virial code.
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<int EVFLAG, int EFLAG>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     int eatom, int vflag, 
     const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
     const int * _noalias const firstneigh, int ntypes, 
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
     const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, 
     const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl, 
     avec *vsv0, 
     avec *vsv1, 
     avec *vsv2, 
     avec* vsv3, 
     avec *vsv4, 
     avec *vsv5,
     int compress_idx, 
     iarr is,
     iarr js,
     bvec vmask_repulsive
 ) {
   // Scale factor applied to atom/type indices before gathering: the size of
   // four scalars (presumably the byte stride of one table entry -- confirm).
   ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
   ivec v_i1(1);
   // NOTE(review): v_2 is never used and is initialised to 0, not 2.
   fvec v_2(0.);
   fvec v_0_5(0.5);
   ivec v_i0(0);
   ivec v_i_ntypes(ntypes);
   ivec v_i_NEIGHMASK(NEIGHMASK);
   
   // Scratch arrays used to spill vectors for the scalar scatter loops.
   farr fx, fy, fz, fw;
   // Cache of unscaled k-atom force contributions collected in the first loop.
   int cache_idx = 0;
   fvec vfkx_cache[N_CACHE];
   fvec vfky_cache[N_CACHE];
   fvec vfkz_cache[N_CACHE];
   ivec vks_cache[N_CACHE];
   bvec vmask_cache[N_CACHE];
   // Loop state at the point where the cache filled up (or the loop ended),
   // so the second force pass can resume from there.
   ivec vkks_final_cache;
   bvec vmask_final_cache;
   iarr ts; 
   // compute all the stuff we know from i and j
   // TODO: We could extract this from the driver routine
   ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
   ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
   // Lane mask derived from compress_idx (presumably enables the first
   // compress_idx lanes -- confirm against v::mask_enable_lower).
   bvec vmask = v::mask_enable_lower(compress_idx);
   fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
   ivec vw_i = v_i0;
   v::gather_x(vis, vmask, x, &vx_i, &vy_i, &vz_i, &vw_i);
   fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
   ivec vw_j = v_i0;
   v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
   fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
   fvec vrijsq = vdx_ij * vdx_ij + vdy_ij *  vdy_ij + vdz_ij * vdz_ij;
   fvec vrij = sqrt(vrijsq);
   // Per-lane neighbor list offset and length of atom i.
   ivec vis_orig = v::int_load_vl(is);
   ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
   ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
   // Scaled index (w_i * ntypes + w_j) of the pair's parameter set.
   ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
 
   fvec vzeta = v::zero();
   fvec vfxtmp = v::zero(), vfytmp = v::zero(), vfztmp = v::zero();
   fvec vfjxtmp = v::zero(), vfjytmp = v::zero(), vfjztmp = v::zero();
   // This piece of code facilitates the traversal of the k loop assuming
   //  nothing about i. As such, it uses masking to avoid superfluous loads
   //  and fast-forwards each lane until work is available.
   // This is useful because we can not make assumptions as to where in the
   //  neighbor list the atoms within the cutoff might be.
   // We also implement the caching in here, i.e. collect force contributions
   //  due to zeta.
   // This means that you will see four loops:
   // 1. the loop that does zeta calculation and caches the force contributions
   // 2. the loop that processes the remaining zeta calculations
   // 3. the loop that updates the force based on the cached force contributions
   // 4. the loop that computes force contributions for the remainder
   {
     // vkks: per-lane position in the neighbor list of i.
     ivec vkks = v_i0;
     // vactive_mask: lanes that still have neighbors left to visit.
     bvec vactive_mask = vmask;
     // veff_old_mask: lanes that already hold a usable k from the previous
     // iteration and therefore must not be re-gathered.
     bvec veff_old_mask(0);
     ivec vks, vw_k;
     fvec vx_k, vy_k, vz_k, vcutsq;
     // Loop 1: accumulate zeta and cache the unscaled k forces.
     while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) {
       bvec vnew_mask = vactive_mask & ~ veff_old_mask;
       vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
           v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
       v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
       fvec vdx_ik = (vx_k - vx_i);
       fvec vdy_ik = (vy_k - vy_i);
       fvec vdz_ik = (vz_k - vz_i);
       fvec vrsq = vdx_ik * vdx_ik + vdy_ik *  vdy_ik + vdz_ik * vdz_ik;
       // Scaled index ((w_i * ntypes + w_j) * ntypes + w_k) into c_inner.
       ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
       vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
       bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
       // Despite its name, this mask is set where k differs from j.
       bvec vsame_mask = v::int_cmpneq(vjs, vks);
       bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
       // Compute only when no active lane is left without an effective k
       // (assuming mask_testz reports an all-zero mask -- confirm).
       if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
         fvec vzeta_contrib;
         fvec vfix, vfiy, vfiz;
         fvec vfjx, vfjy, vfjz;
         fvec vfkx, vfky, vfkz;
 
         // Prefactor 1: the results are scaled by vprefactor once it is known.
         attractive_vector<true>(c_inner,vc_idx,veff_mask,fvec(1.),
             vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
             &vfix,&vfiy,&vfiz,
             &vfjx,&vfjy,&vfjz,
             &vfkx,&vfky,&vfkz,
 	    &vzeta_contrib);
         vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
         vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
         vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
         vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
         vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
         vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
 
         vfkx_cache[cache_idx] = vfkx;
         vfky_cache[cache_idx] = vfky;
         vfkz_cache[cache_idx] = vfkz;
 	vks_cache[cache_idx] = vks;
 	vmask_cache[cache_idx] = veff_mask;
 	cache_idx += 1;
 
         vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
         vkks = vkks + v_i1;
         veff_old_mask = bvec(0);
       } else {
         // Advance only the lanes without work; lanes in veff_mask keep
         // their current k until all active lanes are ready.
         vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
         veff_old_mask = veff_mask;
       }
       vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
     }
     vkks_final_cache = vkks;
     vmask_final_cache = vactive_mask;
     // Loop 2: zeta only, for the neighbors that did not fit into the cache.
     while (! v::mask_testz(vactive_mask)) {
       bvec vnew_mask = vactive_mask & ~ veff_old_mask;
       vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
           v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
       v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
       fvec vdx_ik = (vx_k - vx_i);
       fvec vdy_ik = (vy_k - vy_i);
       fvec vdz_ik = (vz_k - vz_i);
       fvec vrsq = vdx_ik * vdx_ik + vdy_ik *  vdy_ik + vdz_ik * vdz_ik;
       ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
       vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
       bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
       bvec vsame_mask = v::int_cmpneq(vjs, vks);
       bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
       if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
         fvec vzeta_contrib;
         vzeta_contrib = zeta_vector(c_inner,vc_idx,veff_mask,vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
         vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
         vkks = vkks + v_i1;
         veff_old_mask = bvec(0);
       } else {
         vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
         veff_old_mask = veff_mask;
       }
       vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
     }
   }
   // Two-body part: pair force, three-body prefactor and (optionally) energy.
   // NOTE(review): vfwtmp and vfjwtmp are declared but never used here.
   fvec vfpair, vevdwl, vprefactor, vfwtmp, vfjwtmp;
   force_zeta_vector(c_outer, vc_idx_ij, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
   // Scale the unscaled three-body sums from loop 1 and add the pair force.
   vfxtmp = vfxtmp * vprefactor + vdx_ij * vfpair;
   vfytmp = vfytmp * vprefactor + vdy_ij * vfpair;
   vfztmp = vfztmp * vprefactor + vdz_ij * vfpair;
   vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
   vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
   vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
  
   if (EVFLAG) {
     if (EFLAG) {
       *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
       if (eatom) {
         // Half of the pair energy; added to both i and j further below.
         v::store(fw, (v_0_5 * vevdwl));
       }
     }
     if (vflag == 1) {				
       *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
       *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
       *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
       *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
       *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
       *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
     }						
   }
   {
     // Loop 3: apply the cached k forces, now scaled by the prefactor.
     while (cache_idx-- > 0) {
       fvec vfkx = vprefactor * vfkx_cache[cache_idx];
       fvec vfky = vprefactor * vfky_cache[cache_idx];
       fvec vfkz = vprefactor * vfkz_cache[cache_idx];
       ivec vks = vks_cache[cache_idx];
       bvec veff_mask = vmask_cache[cache_idx];
       v::store(fx, vfkx);
       v::store(fy, vfky);
       v::store(fz, vfkz);
       v::int_store(ts, vks);
       for (int t = 0; t < v::VL; t++) {
         if (v::mask_test_at(veff_mask, t)) {
           // Undo the index scaling to recover the atom index of k.
           int t_ = ts[t] / (4 * sizeof(typename v::fscal));
           f[t_].x += fx[t];
           f[t_].y += fy[t];
           f[t_].z += fz[t];
         }
       }
     }
     // Loop 4: recompute the forces for the neighbors that were not cached,
     // resuming from the state saved after loop 1.
     ivec vkks = vkks_final_cache;
     bvec vactive_mask = vmask_final_cache;
     bvec veff_old_mask(0);
     ivec vks, vw_k;
     fvec vx_k, vy_k, vz_k, vcutsq;
     while (! v::mask_testz(vactive_mask)) {
       bvec vnew_mask = vactive_mask & ~ veff_old_mask;
       vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & 
           v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
       v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
       fvec vdx_ik = vx_k - vx_i;
       fvec vdy_ik = vy_k - vy_i;
       fvec vdz_ik = vz_k - vz_i;
       fvec vrsq = vdx_ik * vdx_ik + vdy_ik *  vdy_ik + vdz_ik * vdz_ik;
       ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij);
       vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner);
       bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
       bvec vsame_mask = v::int_cmpneq(vjs, vks);
       bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask;
       if (v::mask_testz(~(veff_mask | ~vactive_mask))) {
         fvec vfix, vfiy, vfiz;
         fvec vfjx, vfjy, vfjz;
         fvec vfkx, vfky, vfkz;
 
         // The prefactor is known now, so it is passed in directly and no
         // zeta output is requested.
         attractive_vector<false>(c_inner,vc_idx,veff_mask,vprefactor,
             vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
             &vfix,&vfiy,&vfiz,
             &vfjx,&vfjy,&vfjz,
             &vfkx,&vfky,&vfkz,
 	    0);
         vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
         vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
         vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
         vfjxtmp = v::mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
         vfjytmp = v::mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
         vfjztmp = v::mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
         v::store(fx, vfkx);
         v::store(fy, vfky);
         v::store(fz, vfkz);
         v::int_store(ts, vks);
         for (int t = 0; t < v::VL; t++) {
           if (v::mask_test_at(veff_mask, t)) {
             int t_ = ts[t] / (4 * sizeof(typename v::fscal));
             f[t_].x += fx[t];
             f[t_].y += fy[t];
             f[t_].z += fz[t];
           }
         }
         vkks = vkks + v_i1;
         veff_old_mask = bvec(0);
       } else {
         vkks = v::int_mask_add(vkks, ~veff_mask, vkks, v_i1);
         veff_old_mask = veff_mask;
       }
       vactive_mask &= v::int_cmplt(vkks, vnumneigh_i);
     } // while (vactive_mask != 0)
   } // section k
   // We can not make any assumptions regarding conflicts.
   // So we sequentialize this.
   // TODO: Once AVX-512 is around check out VPCONFLICT
   // Scatter the accumulated forces (and half energies) onto the j atoms.
   v::store(fx, vfjxtmp);
   v::store(fy, vfjytmp);
   v::store(fz, vfjztmp);
   for (int t = 0; t < compress_idx; t++) {
     int t_ = js[t];
     f[t_].x += fx[t];
     f[t_].y += fy[t];
     f[t_].z += fz[t];
     if (EVFLAG && EFLAG && eatom) {
       f[t_].w += fw[t];
     }
   }
   // Same for the i atoms.
   v::store(fx, vfxtmp);
   v::store(fy, vfytmp);
   v::store(fz, vfztmp);
   for (int t = 0; t < compress_idx; t++) {
     int t_ = is[t];
     f[t_].x += fx[t];
     f[t_].y += fy[t];
     f[t_].z += fz[t];
     if (EVFLAG && EFLAG && eatom) {
       f[t_].w += fw[t];
     }
   }
 }
 
 // Specialized kernel step for fixed i, means that we don't have to use the
 //  convoluted iteration scheme above, as the loop variables are uniform.
 //
 // Processes one batch of up to compress_idx neighbors j (indices in js) of a
 // single central atom i. Because i is the same in every lane, the k loop is
 // a plain scalar loop over i's neighbor list with k broadcast to all lanes.
 //
 //   eatom, vflag      runtime switches: per-atom energy tally (written to the
 //                     .w component of f) and virial tally (only vflag == 1)
 //   numneigh          neighbor count per atom
 //   cnumneigh         offset of each atom's neighbor list inside firstneigh
 //   firstneigh        flat neighbor array; entries are masked with NEIGHMASK
 //   ntypes            row length used to index c_inner / c_outer by type
 //   x                 atom positions (x,y,z) and type (w)
 //   c_inner, c_outer  three-body and two-body parameter tables
 //   f                 force accumulator, updated for atoms i, j and k
 //   vsevdwl           per-lane energy accumulator
 //   vsv0..vsv5        per-lane accumulators for the six virial components
 //   compress_idx      number of valid entries in js
 //   i                 index of the central atom
 //   js                indices of the j atoms in this batch
 //   vmask_repulsive   forwarded unchanged to force_zeta_vector
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<int EVFLAG, int EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     int eatom, int vflag, 
     const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
     const int * _noalias const firstneigh, int ntypes, 
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
     const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, 
     const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl, 
     avec *vsv0, 
     avec *vsv1, 
     avec *vsv2, 
     avec* vsv3, 
     avec *vsv4, 
     avec *vsv5,
     int compress_idx, 
     int i,
     iarr js,
     bvec vmask_repulsive
 ) {
   typedef typename v::fvec fvec;
   typedef typename v::ivec ivec;
   typedef typename v::bvec bvec;
   typedef typename v::farr farr;
   typedef typename v::iarr iarr;
   typedef typename v::avec avec;
   typedef typename v::aarr aarr;
 
   // Scale factor applied to atom/type indices before gathering: the size of
   // four scalars.
   ivec v_i4floats((int) (4 * sizeof(typename v::fscal)));
   ivec v_i1(1), v_i0(0), v_i_ntypes(ntypes), v_i_NEIGHMASK(NEIGHMASK);
   fvec v_0_5(0.5);
 
   // Cache of unscaled k-atom force contributions from the first k loop.
   int cache_idx = 0;
   fvec vfkx_cache[N_CACHE];
   fvec vfky_cache[N_CACHE];
   fvec vfkz_cache[N_CACHE];
   int k_cache[N_CACHE];
   bvec vmask_cache[N_CACHE];
   // First neighbor position that was not handled by the caching loop.
   int kk_final_cache;
 
   aarr fx, fy, fz, fw;
   iarr ts; 
 
   bvec vmask = v::mask_enable_lower(compress_idx);
   // Atom i is uniform across lanes, so broadcast its data.
   fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
   int w_i = x[i].w;
 
   ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
   fvec vx_j = v::zero(), vy_j = v::zero(), vz_j = v::zero();
   ivec vw_j = v_i0;
   v::gather_x(vjs, vmask, x, &vx_j, &vy_j, &vz_j, &vw_j);
 
   fvec vdx_ij = vx_j - vx_i, vdy_ij = vy_j - vy_i, vdz_ij = vz_j - vz_i;
   fvec vrijsq = vdx_ij * vdx_ij + vdy_ij *  vdy_ij + vdz_ij * vdz_ij;
   fvec vrij = sqrt(vrijsq);
 
   int cnumneigh_i = cnumneigh[i];
   int numneigh_i = numneigh[i];
   // Scaled type of j, and the same multiplied by ntypes; the w_i and w_k
   // parts of the table index are folded into the base pointer instead.
   ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
   ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
 
   avec vzeta = v::acc_zero();
   avec vfxtmp = v::acc_zero(), vfytmp = v::acc_zero(), vfztmp = v::acc_zero();
   avec vfjxtmp = v::acc_zero(), vfjytmp = v::acc_zero(), vfjztmp = v::acc_zero();
 
   // Same structure as kernel_step, just simpler as the loops all iterate over
   //  the same k
   // Loop 1: accumulate zeta and cache the unscaled k forces.
   int kk = 0;
   for (; kk < numneigh_i && cache_idx < N_CACHE; kk++) {
     int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
     fvec vx_k(x[k].x);
     fvec vy_k(x[k].y);
     fvec vz_k(x[k].z);
     int w_k = x[k].w;
     fvec vdx_ik = vx_k - vx_i;
     fvec vdy_ik = vy_k - vy_i;
     fvec vdz_ik = vz_k - vz_i;
     fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
     fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
     bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
     // Despite its name, this mask is set where k differs from j.
     bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
     bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
     if (! v::mask_testz(veff_mask)) {
       fvec vzeta_contrib;
       fvec vfix, vfiy, vfiz;
       fvec vfjx, vfjy, vfjz;
       fvec vfkx, vfky, vfkz;
       
       // Prefactor 1: the results are scaled by vprefactor once it is known.
       attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
           vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
           &vfix,&vfiy,&vfiz,
           &vfjx,&vfjy,&vfjz,
           &vfkx,&vfky,&vfkz,
           &vzeta_contrib);
       vfxtmp  = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
       vfytmp  = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
       vfztmp  = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
       vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
       vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
       vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
       
       vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
       vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
       vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
       vmask_cache[cache_idx] = veff_mask;
       k_cache[cache_idx] = k;
       cache_idx += 1;
 
       vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
     }
   }
   kk_final_cache = kk;
   // Loop 2: zeta only, for the neighbors that did not fit into the cache.
   for (; kk < numneigh_i; kk++) {
     int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
     fvec vx_k(x[k].x);
     fvec vy_k(x[k].y);
     fvec vz_k(x[k].z);
     int w_k = x[k].w;
     fvec vdx_ik = vx_k - vx_i;
     fvec vdy_ik = vy_k - vy_i;
     fvec vdz_ik = vz_k - vz_i;
     fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
     fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]);
     bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
     bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
     bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
     if (! v::mask_testz(veff_mask)) {
       fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, 
           vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
       vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
     }
   }
   // Two-body part: pair force, three-body prefactor and (optionally) energy.
   fvec vfpair, vevdwl, vprefactor, vfwtmp;
   force_zeta_vector(&c_outer[ntypes * w_i], vc_idx_j, vmask, vrij, vzeta, &vfpair, &vprefactor, EFLAG, &vevdwl, vmask_repulsive);
   avec vaprefactor(vprefactor);
   // Scale the unscaled three-body sums from loop 1 and add the pair force.
   vfxtmp  = vfxtmp  * vaprefactor + avec(vdx_ij * vfpair);
   vfytmp  = vfytmp  * vaprefactor + avec(vdy_ij * vfpair);
   vfztmp  = vfztmp  * vaprefactor + avec(vdz_ij * vfpair);
   vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
   vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
   vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
  
   if (EVFLAG) {
     if (EFLAG) {
       *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
       if (eatom) {
         // Half of the pair energy; added to both i and j further below.
         vfwtmp = v_0_5 * vevdwl;
         v::store(fw, vfwtmp);
       }
     }
     if (vflag == 1) {
       *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
       *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
       *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
       *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
       *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
       *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
     }
   }
   // Loop 3: apply the cached k forces, now scaled by the prefactor. Each
   // cache entry belongs to one atom k, so the lanes are summed.
   while (cache_idx-- > 0) {
     fvec vfkx = vprefactor * vfkx_cache[cache_idx];
     fvec vfky = vprefactor * vfky_cache[cache_idx];
     fvec vfkz = vprefactor * vfkz_cache[cache_idx];
     int k = k_cache[cache_idx];
     bvec veff_mask = vmask_cache[cache_idx];
     f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
     f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
     f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
   }
   // Loop 4: recompute the forces for the neighbors that were not cached.
   for (int kk = kk_final_cache; kk < numneigh_i; kk++) {
     int k = firstneigh[kk + cnumneigh_i] & NEIGHMASK;
     fvec vx_k(x[k].x);
     fvec vy_k(x[k].y);
     fvec vz_k(x[k].z);
     int w_k = x[k].w;
     fvec vdx_ik = vx_k - vx_i;
     fvec vdy_ik = vy_k - vy_i;
     fvec vdz_ik = vz_k - vz_i;
     fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik;
     fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq);
     bvec vcutoff_mask = v::cmplt(vrsq, vcutsq);
     bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
     bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
     if (! v::mask_testz(veff_mask)) {
        fvec vfix, vfiy, vfiz;
        fvec vfjx, vfjy, vfjz;
        fvec vfkx, vfky, vfkz;
 
        // The prefactor is known now, so it is passed in directly and no
        // zeta output is requested.
        attractive_vector<false>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,vprefactor,
            vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
            &vfix,&vfiy,&vfiz,
            &vfjx,&vfjy,&vfjz,
            &vfkx,&vfky,&vfkz,
            0);
        vfxtmp  = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
        vfytmp  = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
        vfztmp  = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
        vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
        vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
        vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
        f[k].x += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkx, v::zero()));
        f[k].y += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfky, v::zero()));
        f[k].z += v::reduce_add(v::mask_add(v::zero(), veff_mask, vfkz, v::zero()));
     }
   }
   // TODO: This could be a scatter
   // Add the accumulated forces (and half energies) onto the j atoms.
   v::acc_store(fx, vfjxtmp);
   v::acc_store(fy, vfjytmp);
   v::acc_store(fz, vfjztmp);
   for (int t = 0; t < compress_idx; t++) {
     int t_ = js[t];
     f[t_].x += fx[t];
     f[t_].y += fy[t];
     f[t_].z += fz[t];
     if (EVFLAG && EFLAG && eatom) {
       f[t_].w += fw[t];
     }
   }
   // All lanes share atom i, so its totals are horizontal sums.
   f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
   f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
   f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
   if (EVFLAG && EFLAG && eatom) {
     // Per-atom energy goes into the .w slot, matching the j atoms above;
     // it must not be added to the z force component.
     f[i].w += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
   }
 }
 
 // Driver for the vector kernels: walks the atoms ii in [iifrom, iito),
 // keeps every neighbor j that lies within the outer cutoff, and collects
 // the resulting (i,j) pairs into batches of up to v::VL entries.
 //  - pack_i == true : batches may mix different i and are handed to
 //    kernel_step once full; a final partial batch is flushed after the loop.
 //  - pack_i == false: batches never span two i; they are handed to
 //    kernel_step_const_i when full or when i's neighbor list ends.
 // Energy and virial sums are reduced into evdwl / ov0..ov5 at the end.
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<bool EVFLAG, bool EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     int iito, int iifrom, int eatom, int vflag, 
     const int * _noalias const numneigh, 
     const int * _noalias const numneighhalf, 
     const int * _noalias const cnumneigh, 
     const int * _noalias const firstneigh, int ntypes, 
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
     const c_inner_t * _noalias const c_inner, 
     const c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
 ) {
   // Pending batch: atom indices of the collected pairs and their count.
   int npending = 0;
   iarr is, js;
   // Per-lane accumulators for the energy and the six virial components.
   avec vsevdwl = v::acc_zero();
   avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
   avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
   ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
   ivec vj, v_NEIGHMASK(NEIGHMASK);
   // A pair is flagged when j sits in the first numneighhalf[i] entries of
   // i's neighbor list; the flags are turned into a lane mask on flush.
   bvec vmask_repulsive(0);
   iarr repulsive_flag = {0};
   // If you want to get the very most out of this, please uncomment.
   // Consider getting a coffee or doing something else.
   // Also good for heating.
   //#pragma forceinline recursive
   for (int ii = iifrom; ii < iito; ++ii) {
     // Right now this loop is scalar, to allow for the compiler to do
     //  its prefetching magic.
     const int i = ii;
     const int itype = x[i].w;
     const flt_t xtmp = x[i].x;
     const flt_t ytmp = x[i].y;
     const flt_t ztmp = x[i].z;
     const int joffset = cnumneigh[i];
     const int jnum = numneigh[ii];
     for (int jj = 0; jj < jnum; ++jj) {
       const int j = firstneigh[joffset + jj] & NEIGHMASK;
       const int jtype = x[j].w;
       const flt_t delx = x[j].x - xtmp;
       const flt_t dely = x[j].y - ytmp;
       const flt_t delz = x[j].z - ztmp;
       const flt_t rsq = delx*delx + dely*dely + delz*delz;
       const flt_t cut_outer_sq = c_outer[itype * ntypes + jtype].cutsq;
       if (rsq < cut_outer_sq) {
         is[npending] = ii;
         js[npending] = j;
         if (jj < numneighhalf[i])
           repulsive_flag[npending] = 1;
         ++npending;
       }
 
       // Decide whether the pending batch has to be processed now.
       const bool batch_full = (npending == v::VL);
       if (pack_i) {
         if (!batch_full) continue;
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
         kernel_step<EVFLAG,EFLAG>(
             eatom, vflag,
             numneigh, cnumneigh, firstneigh, ntypes,
             x, c_inner, c_outer, f,
             &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, npending,
             is, js, vmask_repulsive
         );
       } else {
         const bool list_done = (npending > 0 && jj == jnum-1);
         if (!batch_full && !list_done) continue;
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
         kernel_step_const_i<EVFLAG,EFLAG>(
             eatom, vflag,
             numneigh, cnumneigh, firstneigh, ntypes,
             x, c_inner, c_outer, f,
             &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, npending,
             i, js, vmask_repulsive
         );
       }
       // The batch has been consumed; start a fresh one.
       npending = 0;
       v::int_clear_arr(repulsive_flag);
     }
   }
   // Flush whatever is left over from the packed traversal.
   if (npending > 0) {
     vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
     IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
         eatom, vflag,
         numneigh, cnumneigh, firstneigh, ntypes,
         x, c_inner, c_outer, f,
         &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, npending,
         is, js, vmask_repulsive
     );
   }
   // Collapse the per-lane sums into the caller's scalars.
   if (EVFLAG) {
     if (EFLAG) {
       *evdwl += v::acc_reduce_add(vsevdwl);
     }
     if (vflag == 1) {
       *ov0 += v::acc_reduce_add(vsv0);
       *ov1 += v::acc_reduce_add(vsv1);
       *ov2 += v::acc_reduce_add(vsv2);
       *ov3 += v::acc_reduce_add(vsv3);
       *ov4 += v::acc_reduce_add(vsv4);
       *ov5 += v::acc_reduce_add(vsv5);
     }
   }
 }
 
 
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
     const c_inner_t * param, 
     ivec xjw, bvec mask, 
     fvec vrij, fvec rsq2, 
     fvec vdijx, fvec vdijy, fvec vdijz, 
     fvec dikx, fvec diky, fvec dikz
 ) {
   fvec v_1_0(1.0);
   fvec v_0_5(0.5);
   fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = v::zero(), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = v::zero();
   // TDO: Specialize on number of species
   v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
   fvec vrik = sqrt(rsq2);
   fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
   fvec vhcth = vph - vcostheta;
   fvec vgijk_a = vhcth * vhcth;
   fvec vgijk = vpgamma * (v_1_0 + vpc2 * vgijk_a * v::recip(vpd2 * (vpd2 + vgijk_a)));
   fvec varg1 = vplam3 * (vrij - vrik);
   fvec varg3 = varg1 * varg1 * varg1;
   bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
   fvec varg  = v::blend(mask_ex, varg1, varg3);
   fvec vex_delr = v::min(fvec(1.e30), exp(varg));
   bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
   fvec vfc = v_1_0;
   // Its kind of important to check the mask.
   // Some simulations never/rarely invoke this branch.
   if (! v::mask_testz(vmask_need_sine)) {
     vfc = v::blend(vmask_need_sine, vfc, 
         v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
   }
   return vgijk * vex_delr * vfc;
 }
 
 // Vectorized pair ("force_zeta") part of the Tersoff kernel.
 //
 // Per-lane parameters are fetched from `param` with v::gather_8 / v::gather_4,
 // using `xjw` as the index vector and `mask` as the lane mask (the exact
 // gather semantics live in the vector backend and are not shown here).
 //
 // Inputs:
 //   vrij            - pair distance per lane
 //   vzeta_ij        - accumulated zeta per lane
 //   vmask_repulsive - lanes for which the repulsive term is added
 //                     (presumably via v::mask_add; confirm in the backend)
 // Outputs:
 //   *vfpair     - (0.5*bij*fa_d [+ repulsive force term]) * 1/rij
 //   *vprefactor - -0.5 * fa * bij_d
 //   *vevdwl     - written only when EVDWL is nonzero:
 //                 0.5*fa*bij [+ repulsive energy term]
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
     const c_outer_t * param, 
     ivec xjw,
     bvec mask,
     fvec vrij, fvec vzeta_ij,
     fvec *vfpair, fvec *vprefactor, int EVDWL, fvec *vevdwl,
     bvec vmask_repulsive
 ) {
   // Broadcast constants used throughout.
   fvec v_0_0(0.0);
   fvec v_0_5(0.5);
   fvec v_m0_5(-0.5);
   fvec v_1_0(1.0);
   fvec v_m1_0(-1.0);
   fvec v_2_0(2.0);
   fvec vpbigr = v::zero(), vpbigd = v::zero(), vplam1 = v::zero(), vpbiga = v::zero(), vplam2 = v::zero(), vpbeta = v::zero(), vpbigb = v::zero(), vppowern = v::zero();
   v::gather_8(xjw, mask, &param[0].bigr, &vpbigr, &vpbigd, &vplam1, &vpbiga, &vplam2, &vpbeta, &vpbigb, &vppowern);
   fvec vfccos;
 
   // This is pretty much a literal translation.
   // Cutoff function fc and its derivative fc_d: 1 and 0 by default; active
   // lanes where rij is not <= bigr - bigd get the sine-based expressions.
   bvec vmask_need_sine = v::cmpnle(vrij, vpbigr - vpbigd) & mask;
   fvec vfc = v_1_0;
   fvec vfc_d = v_0_0;
   if (! v::mask_testz(vmask_need_sine)) {
     fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
     // v::sincos presumably returns the sine and stores the cosine in vfccos.
     vfc = v::blend(vmask_need_sine, vfc,
         v_0_5 * (v_1_0 - v::sincos(&vfccos, vtmp * (vrij - vpbigr))));
     vfc_d = v::blend(vmask_need_sine, vfc_d, v_m0_5 * vtmp * vfccos);
   }
   fvec vpminus_lam2 =  - vplam2;
 
   // Attractive term fa = -bigb * exp(-lam2 * rij) * fc and its derivative.
   fvec vpminus_bigb = -vpbigb;
   fvec vexp = exp(vpminus_lam2 * vrij);
   fvec vfa = vpminus_bigb * vexp * vfc;
   fvec vfa_d = vpminus_lam2 * vfa + vpminus_bigb * vexp * vfc_d;
 
   // bij and bij_d are chosen piecewise from tmp = beta * zeta_ij using the
   // thresholds c1..c4. The masks vmc1..vmc5 are mutually exclusive, so each
   // active lane is assigned by exactly one of the branches below.
   fvec vpc1 = v::zero(), vpc2 = v::zero(), vpc3 = v::zero(), vpc4 = v::zero();
   v::gather_4(xjw, mask, &param[0].c1, &vpc1, &vpc2, &vpc3, &vpc4);
   fvec vpminus_powern = - vppowern;
   fvec vbij(0.), vbij_d(0.);
   fvec vtmp = vpbeta * vzeta_ij;
   // Case 1: c1 <= tmp. Written to all lanes without a blend; the later cases
   // overwrite their own lanes through v::blend.
   bvec vmc1 = v::cmple(vpc1, vtmp) & mask;
   if (! v::mask_testz(vmc1)) {
     vbij = v::invsqrt(vtmp);
     vbij_d = vpbeta * v_m0_5 * vbij * v::recip(vtmp);
   }
   // Case 2: c2 <= tmp (and not case 1).
   bvec vmc2 = v::cmple(vpc2, vtmp) & ~ vmc1 & mask;
   if (! v::mask_testz(vmc2)) {
     fvec vpowminus_powern = pow(vtmp, vpminus_powern);
     fvec vinvsqrt = v::invsqrt(vtmp);
     fvec vrcp2powern = v::recip(v_2_0 * vppowern);
     fvec va = (v_1_0 - vpowminus_powern * vrcp2powern) * vinvsqrt;
     fvec va_d = vpbeta * v_m0_5 * vinvsqrt * v::recip(vtmp) *
             (v_1_0 + v_m0_5 * vpowminus_powern * (v_1_0 + vrcp2powern));
     vbij = v::blend(vmc2, vbij, va);
     vbij_d = v::blend(vmc2, vbij_d, va_d);
   }
   // Case 3: tmp < c4 -> bij = 1, bij_d = 0.
   bvec vmc3 = v::cmplt(vtmp, vpc4) & ~vmc2 & ~vmc1 & mask;
   if (! v::mask_testz(vmc3)) {
     vbij = v::blend(vmc3, vbij, v_1_0);
     vbij_d = v::blend(vmc3, vbij_d, v_0_0);
   }
   // Case 4: tmp <= c3 (and none of the cases above).
   bvec vmc4 = v::cmple(vtmp, vpc3) & ~vmc3 & ~vmc2 & ~ vmc1 & mask;
   if (! v::mask_testz(vmc4)) {
     fvec vpowm1 = pow(vtmp, vppowern - v_1_0);
     fvec vrcp2powern = v::recip(v_2_0 * vppowern);
     fvec va = v_1_0 - vtmp * vrcp2powern * vpowm1;
     fvec va_d = v_m0_5 * vpbeta * vpowm1;
     vbij = v::blend(vmc4, vbij, va);
     vbij_d = v::blend(vmc4, vbij_d, va_d);
   }
   // Case 5: every remaining active lane uses the general expression.
   bvec vmc5 = mask & ~vmc1 & ~vmc2 & ~vmc3 & ~vmc4;
   if (! v::mask_testz(vmc5)) {
     fvec vtmp_n = pow(vtmp, vppowern);
     fvec vpow2 = pow(v_1_0 + vtmp_n, v_m1_0 - v::recip(v_2_0 * vppowern));
     fvec va = (v_1_0 + vtmp_n) * vpow2;
     fvec va_d = v_m0_5 * vpow2 * vtmp_n * v::recip(vzeta_ij);
     vbij = v::blend(vmc5, vbij, va);
     vbij_d = v::blend(vmc5, vbij_d, va_d);
   }
   // Repulsive force term and the bond-order force term.
   fvec vtmp_exp = exp(-vplam1 * vrij);
   fvec vrep_fforce = vpbiga * vtmp_exp * (vfc_d - vfc * vplam1);
   fvec vfz_fforce = v_0_5 * vbij * vfa_d;
 
   *vfpair = v::mask_add(vfz_fforce, vmask_repulsive, vfz_fforce, vrep_fforce) * v::recip(vrij);
   *vprefactor = v_m0_5 * vfa * vbij_d;
   if (EVDWL) {
     // The energy is only computed when the caller asks for it.
     fvec vrep_eng = vfc * vpbiga * vtmp_exp;
     fvec vfz_eng = v_0_5 * vfa * vbij;
     *vevdwl = v::mask_add(vfz_eng, vmask_repulsive, vfz_eng, vrep_eng);
   }
 }
 
 // Vectorized three-body ("attractive") part of the Tersoff kernel.
 //
 // Per-lane parameters are gathered from `param` (indexed by `xjw`, masked by
 // `mask`). From the i-j vector (vdij*, length vrij) and the i-k vector (dik*,
 // squared length rsq2) this computes the cutoff fc(rik), the angular term
 // gijk(cos theta) and the exponential term ex_delr, together with their
 // derivatives, and writes the forces on atoms i, j and k
 // (fi*, fj*, fk*), each scaled by vprefactor.
 //
 // When the template flag ZETA is true, *zeta additionally receives
 // fc * gijk * ex_delr; otherwise `zeta` is not written.
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<bool ZETA>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
     const c_inner_t * param,
     ivec xjw,
     bvec mask,
     fvec vprefactor,
     fvec vrij, fvec rsq2,
     fvec vdijx, fvec vdijy, fvec vdijz,
     fvec dikx, fvec diky, fvec dikz,
     fvec *fix, fvec *fiy, fvec *fiz,
     fvec *fjx, fvec *fjy, fvec *fjz,
     fvec *fkx, fvec *fky, fvec *fkz,
     fvec *zeta
 ) {
   fvec v_1_0 = fvec(1.0);
 
   // vpd2 and vpbigd start at 1.0 rather than 0; both are later passed to
   // v::recip, so this presumably keeps masked-off lanes finite.
   fvec vph = v::zero(), vpc2 = v::zero(), vpd2 = fvec(1.0), vpgamma = v::zero(), vplam3 = v::zero(), vppowermint = v::zero(), vpbigr = v::zero(), vpbigd = fvec(1.0);
   v::gather_8(xjw, mask, &param[0].lam3, &vplam3, &vppowermint, &vpbigr, &vpbigd, &vpc2, &vpd2, &vph, &vpgamma);
   // Unit vectors along i-j and i-k.
   fvec vrijinv = v::recip(vrij);
   fvec vrij_hatx = vrijinv * vdijx;
   fvec vrij_haty = vrijinv * vdijy;
   fvec vrij_hatz = vrijinv * vdijz;
   fvec rikinv = invsqrt(rsq2);
   fvec rik_hatx = rikinv * dikx;
   fvec rik_haty = rikinv * diky;
   fvec rik_hatz = rikinv * dikz;
 
   // Angular term gijk and its derivative with respect to cos(theta).
   fvec vrik = sqrt(rsq2);
   fvec vcostheta = (vdijx * dikx + vdijy * diky + vdijz * dikz) * v::recip(vrij * vrik);
   fvec vhcth = vph - vcostheta;
   fvec vdenominator = v::recip(vpd2 + vhcth * vhcth);
   fvec vgijk = vpgamma * (v_1_0 + vpc2 * v::recip(vpd2) - vpc2 * vdenominator);
   fvec vnumerator = fvec(-2.) * vpc2 * vhcth;
   fvec vgijk_d = vpgamma * vnumerator * vdenominator * vdenominator;
   // Exponential term: the argument is arg1^3 when powermint == 3, otherwise
   // arg1; the result is clamped to 1e30.
   fvec varg1 = vplam3 * (vrij - vrik);
   fvec varg3 = varg1 * varg1 * varg1;
   bvec mask_ex = v::cmpeq(vppowermint, fvec(3.));
   fvec varg  = v::blend(mask_ex, varg1, varg3);
   fvec vex_delr = min(fvec(1.e30), exp(varg));
   fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1);
   fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr;
   // Cutoff fc(rik) and derivative: 1 and 0 by default; active lanes where rik
   // is not <= bigr - bigd get the sine-based expressions.
   bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - vpbigd) & mask;
   fvec vfccos;
   fvec vfc = v_1_0;
   fvec vfc_d = v::zero();
   if (! v::mask_testz(vmask_need_sine)) {
     fvec vtmp = fvec(MY_PI2) * v::recip(vpbigd);
     // v::sincos presumably returns the sine and stores the cosine in vfccos.
     vfc = v::blend(vmask_need_sine, vfc,
         fvec(0.5) * (v_1_0 - v::sincos(&vfccos, vtmp * (vrik - vpbigr))));
     vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
   }
 
   // Product-rule pieces of d(zeta): one factor differentiated at a time.
   fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; 
   fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; 
   fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; 
   if (ZETA) *zeta = vfc * vgijk * vex_delr;
 
   // Gradients of cos(theta) with respect to the positions of j, k and i;
   // the i gradient is minus the sum of the other two.
   fvec vminus_costheta = - vcostheta;
   fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx);
   fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty);
   fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz);
   fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx);
   fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty);
   fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz);
   fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
   fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
   fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
   
   // Forces on i, j and k, all scaled by the caller-supplied prefactor.
   *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
   *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
   *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
   *fjx = vprefactor * (vzeta_d_gijk * vdcosdrjx + vzeta_d_ex_delr * vrij_hatx);
   *fjy = vprefactor * (vzeta_d_gijk * vdcosdrjy + vzeta_d_ex_delr * vrij_haty);
   *fjz = vprefactor * (vzeta_d_gijk * vdcosdrjz + vzeta_d_ex_delr * vrij_hatz);
   *fkx = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatx + vzeta_d_gijk * vdcosdrkx);
   *fky = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_haty + vzeta_d_gijk * vdcosdrky);
   *fkz = vprefactor * ((vzeta_d_fc - vzeta_d_ex_delr) * rik_hatz + vzeta_d_gijk * vdcosdrkz);
 }
 
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
 #endif
diff --git a/src/USER-MANIFOLD/fix_manifoldforce.cpp b/src/USER-MANIFOLD/fix_manifoldforce.cpp
index da794e4a4..c3bbc935a 100644
--- a/src/USER-MANIFOLD/fix_manifoldforce.cpp
+++ b/src/USER-MANIFOLD/fix_manifoldforce.cpp
@@ -1,185 +1,185 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#include "math.h"
-#include "string.h"
-#include "stdlib.h"
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
 #include "atom.h"
 #include "update.h"
 #include "respa.h"
 #include "error.h"
 #include "force.h"
 
 #include "manifold.h"
 #include "fix_manifoldforce.h"  // For stuff
 #include "manifold_factory.h"   // For constructing manifold
 
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 using namespace user_manifold;
 
 
 // Helper functions for parameters/equal style variables in input script
 // True when `arg` begins with the "v_" prefix that marks a variable reference.
 inline bool was_var( const char *arg )
 {
   // strstr finds the first "v_"; it is a prefix only if that hit is arg itself.
   const char *first_hit = strstr( arg, "v_" );
   return first_hit == arg;
 }
 
 // True when the two C strings compare equal under strcmp.
 inline bool str_eq( const char *str1, const char *str2 )
 {
   const int order = strcmp(str1,str2);
   return !order;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Build the fix from
 //   fix <name> <group> manifoldforce manifold_name manifold_args
 //
 // Warns when the minimizer style is neither "hftn" nor "quickmin", creates
 // the manifold named by arg[3] through create_manifold(), then reads exactly
 // ptr_m->nparams() numeric parameters from arg[4...] into a freshly allocated
 // parameter array. Equal-style variables ("v_" prefix) are rejected.
 FixManifoldForce::FixManifoldForce(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg)
 {
   // Check the min-style:
   const bool good_minner = str_eq(update->minimize_style,"hftn") ||
                            str_eq(update->minimize_style,"quickmin");
   if( !good_minner ){
     error->warning(FLERR,"Minimizing with fix manifoldforce without hftn or quickmin is fishy");
   }
 
 
   // Command is given as
   // fix <name> <group> manifoldforce manifold_name manifold_args
   if( narg < 5 ){
     error->all(FLERR,"Illegal fix manifoldforce! No manifold given");
   }
   const char *m_name = arg[3];
   ptr_m = create_manifold(m_name,lmp,narg,arg);
 
   // Construct manifold from factory:
   if( !ptr_m ){
     char msg[2048];
     snprintf(msg,sizeof(msg),"Manifold pointer for manifold '%s' was NULL for some reason", arg[3]);
     error->all(FLERR,msg);
   }
 
 
   // After constructing the manifold, you can safely make
   // room for the parameters
   nvars = ptr_m->nparams();
   if( narg < nvars+4 ){
     char msg[2048];
     // Bounded write: m_name comes straight from the input script.
     snprintf(msg,sizeof(msg),"Manifold %s needs at least %d argument(s)!",
              m_name, nvars);
     error->all(FLERR,msg);
   }
 
   // Validate the slot *before* writing through it; the original checked it
   // only after the dereference, when the check could no longer help.
   double **params_slot = ptr_m->get_params();
   if( params_slot == NULL ){
     error->all(FLERR,"Parameter pointer was NULL!");
   }
   *params_slot = new double[nvars];
 
   // Every manifold argument must be a plain number here: equal-style
   // variables are refused, and force->numeric triggers an error if the
   // argument is not a number.
   double *params = *params_slot;
   for( int i = 0; i < nvars; ++i ){
     if( was_var( arg[i+4] ) )
       error->all(FLERR,"Equal-style variables not allowed with fix manifoldforce");
 
     params[i] = force->numeric(FLERR,arg[i+4]);
   }
 
 
   // Perform any further initialisation for the manifold that depends on params:
   ptr_m->post_param_init();
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Reports which hooks this fix implements: the post-force callbacks for
 // regular, rRESPA and minimizer runs.
 int FixManifoldForce::setmask()
 {
   return POST_FORCE | POST_FORCE_RESPA | MIN_POST_FORCE;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixManifoldForce::setup(int vflag)
 {
   if (strstr(update->integrate_style,"verlet"))
     post_force(vflag);
   else {
     int nlevels_respa = ((Respa *) update->integrate)->nlevels;
     for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
       ((Respa *) update->integrate)->copy_flevel_f(ilevel);
       post_force_respa(vflag,ilevel,0);
       ((Respa *) update->integrate)->copy_f_flevel(ilevel);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Forwards to post_force() with the same vflag.
 void FixManifoldForce::min_setup(int vflag)
 {
   post_force(vflag);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // For every local atom in the fix group, removes the component of the force
 // along the manifold normal at the atom's position:
 //   f -= (f . n) n / |n|^2
 // The normal comes from ptr_m->n() and is not assumed to be normalised.
 void FixManifoldForce::post_force(int vflag)
 {
   double **pos = atom->x;
   double **frc = atom->f;
   int *group_mask = atom->mask;
   const int nlocal = atom->nlocal;
 
   double normal[3];
   for (int i = 0; i < nlocal; i++){
     if (!(group_mask[i] & groupbit)) continue;
 
     // Determine normal of particle:
     ptr_m->n(pos[i],normal);
 
     const double inv_norm2 =
       1.0 / ( normal[0]*normal[0] + normal[1]*normal[1] + normal[2]*normal[2] );
     const double f_dot_n =
       frc[i][0]*normal[0] + frc[i][1]*normal[1] + frc[i][2]*normal[2];
 
     for (int d = 0; d < 3; ++d){
       frc[i][d] -= f_dot_n*normal[d] * inv_norm2;
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Forwards to post_force(); ilevel and iloop are not used.
 void FixManifoldForce::post_force_respa(int vflag, int ilevel, int iloop)
 {
   post_force(vflag);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Forwards to post_force() with the same vflag.
 void FixManifoldForce::min_post_force(int vflag)
 {
   post_force(vflag);
 }
diff --git a/src/USER-MANIFOLD/fix_nve_manifold_rattle.cpp b/src/USER-MANIFOLD/fix_nve_manifold_rattle.cpp
index 246f7cc66..e27762a7d 100644
--- a/src/USER-MANIFOLD/fix_nve_manifold_rattle.cpp
+++ b/src/USER-MANIFOLD/fix_nve_manifold_rattle.cpp
@@ -1,644 +1,644 @@
 /* ----------------------------------------------------------------------
    Lammps - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
    -----------------------------------------------------------------------
 
    This file is a part of the USER-MANIFOLD package.
 
    Copyright (2013-2014) Stefan Paquay, Eindhoven University of Technology.
    License: GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 
    This file is part of the user-manifold package written by
    Stefan Paquay at the Eindhoven University of Technology.
    This module makes it possible to do MD with particles constrained
    to pretty arbitrary manifolds characterised by some constraint function
    g(x,y,z) = 0 and its normal grad(g). The number of manifolds available
    right now is limited but can be extended straightforwardly by making
    a new class that inherits from manifold and implements all pure virtual
    methods.
 
    Thanks to Remy Kusters for beta-testing!
 
 ------------------------------------------------------------------------- */
 
 
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include "atom.h"
 #include "force.h"
 #include "update.h"
 #include "respa.h"
 #include "error.h"
 #include "group.h"
-#include "math.h"
+#include <math.h>
 #include "input.h"
 #include "variable.h"
 #include "citeme.h"
 #include "memory.h"
 #include "comm.h"
 
 
 #include "fix_nve_manifold_rattle.h"
 #include "manifold_factory.h"
 #include "manifold.h"
 
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 using namespace user_manifold;
 
 
 enum { CONST, EQUAL }; // For treating the variables.
 
 
 // BibTeX entry for this fix; the constructor hands it to lmp->citeme->add()
 // when a citeme object is present.
 static const char* cite_fix_nve_manifold_rattle =
   "fix nve/manifold/rattle command:\n\n"
   "@article{paquay-2016,\n"
   "   author        = {Paquay, Stefan and Kusters, Remy},\n"
   "   doi           = {10.1016/j.bpj.2016.02.017},\n"
   "   issn          = {0006-3495},\n"
   "   journal       = {Biophysical Journal},\n"
   "   month         = apr,\n"
   "   number        = {6},\n"
   "   pages         = {1226--1233},\n"
   "   title         = {{A Method for Molecular Dynamics on Curved Surfaces}},\n"
   "   volume        = {110},\n"
   "   year          = {2016}\n"
   "}\n\n";
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Build the fix. Argument layout as read below:
 //   arg[3]               tolerance
 //   arg[4]               max_iter
 //   arg[5]               manifold name (passed to create_manifold)
 //   arg[6 .. 6+nvars-1]  manifold parameters, each a number or "v_<name>"
 //   remaining args       optional keywords; only "every <N>" is handled here
 //
 // error_on_unknown_keyword: when nonzero an unrecognised keyword is an
 // error; when zero it is skipped.
 FixNVEManifoldRattle::FixNVEManifoldRattle( LAMMPS *lmp, int &narg, char **arg,
                                             int error_on_unknown_keyword )
   : Fix(lmp,narg,arg)
 {
   if( lmp->citeme) lmp->citeme->add(cite_fix_nve_manifold_rattle);
   if( narg < 6 ) error->all(FLERR, "Illegal fix nve/manifold/rattle command");
 
   // Set all bits/settings:
   time_integrate = 1;
   dynamic_group_allow = 1;
   size_vector = 0;
   dof_flag = 1;
 
   nevery = 0;
   dtv = dtf = 0;
 
   tolerance = force->numeric( FLERR, arg[3] );
   max_iter  = force->numeric( FLERR, arg[4] );
 
   ptr_m = create_manifold(arg[5], lmp, narg, arg);
   if( !ptr_m ){
     error->all(FLERR,"Error creating manifold pointer");
   }
 
   nvars = ptr_m->nparams();
 
   // The loops below read arg[6 .. 6+nvars-1]; refuse a command line that is
   // too short instead of reading past the end of arg.
   if( narg < 6 + nvars ){
     char msg[2048];
     snprintf(msg,sizeof(msg),"Manifold %s needs at least %d argument(s)!",
              arg[5], nvars);
     error->all(FLERR,msg);
   }
 
   // Value-initialise so every slot holds a defined value (NULL / 0) even if
   // construction stops part-way through.
   tstrs  = new char*[nvars]();
   tvars  = new int[nvars]();
   tstyle = new int[nvars]();
   is_var = new int[nvars]();
 
   if( !tstrs || !tvars || !tstyle || !is_var ){
     error->all(FLERR, "Error creating manifold arg arrays");
   }
 
   // Loop over manifold args:
   for( int i = 0; i < nvars; ++i ){
     int len = 0, offset = 0;
     if( was_var( arg[i+6] ) ){
       len = strlen(arg[i+6]) - 1; // -1 because -2 for v_, +1 for \0.
       is_var[i] = 1;
       offset = 2;
     }else{
       force->numeric(FLERR,arg[i+6]); // Check if legal number.
       len = strlen( arg[i+6] ) + 1; // +1 for \0.
       is_var[i] = 0;
     }
     tstrs[i] = new char[len];
     if( tstrs[i] == NULL ) error->all(FLERR,"Error allocating space for args.");
     // Store the text without the "v_" prefix (offset is 0 for plain numbers).
     strcpy( tstrs[i], arg[i+6] + offset );
   }
 
   *ptr_m->get_params() = new double[nvars];
   if( !(*ptr_m->get_params()) ) error->all(FLERR,"Failed to allocate params!");
   for( int i = 0; i < nvars; ++i ){
     // If param i was variable type, it will be set later...
     (*ptr_m->get_params())[i] = is_var[i] ? 0.0 : force->numeric( FLERR, arg[i+6] );
   }
   ptr_m->post_param_init();
 
 
   // Loop over rest of args:
   int argi = 6 + nvars;
   while( argi < narg ){
     if( strcmp(arg[argi], "every") == 0 ){
       // "every" needs a value; without this check arg[argi+1] could be
       // read past the end of the argument list.
       if( argi+1 >= narg ){
         error->all(FLERR, "Illegal fix nve/manifold/rattle command");
       }
       nevery = force->inumeric(FLERR,arg[argi+1]);
       argi += 2;
     }else if( error_on_unknown_keyword ){
       char msg[2048];
       snprintf(msg,sizeof(msg),"Error parsing arg \"%s\".\n", arg[argi]);
       error->all(FLERR, msg);
     }else{
       argi += 1;
     }
   }
 
 }
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Releases the per-parameter strings and the bookkeeping arrays allocated in
 // the constructor.
 FixNVEManifoldRattle::~FixNVEManifoldRattle()
 {
   if( tstrs != NULL ){
     int i = 0;
     while( i < nvars ){
       delete [] tstrs[i];
       ++i;
     }
     delete [] tstrs;
   }
 
   // delete [] on a null pointer is a no-op, so no guards are needed here.
   delete [] tvars;
   delete [] tstyle;
   delete [] is_var;
 }
 
 
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 void FixNVEManifoldRattle::reset_dt()
 {
   dtv = update->dt;
   dtf = 0.5 * update->dt * force->ftm2v;
 
 
 
 }
 
 // Prints iteration statistics, prefixed with `header`, to `screen` on rank 0.
 //
 // The local x/v iteration counts are first converted to per-atom values
 // (divided by stats.natoms when it is positive), summed over all ranks with
 // MPI_Allreduce, and the local counters are reset to zero. Rank 0 then
 // reports the sums divided by the number of time steps since the last output.
 void FixNVEManifoldRattle::print_stats( const char *header )
 {
   double n = stats.natoms;
   if( n > 0 ){
     stats.x_iters_per_atom += stats.x_iters / n;
     stats.v_iters_per_atom += stats.v_iters / n;
   }
 
   double x_iters = 0, v_iters = 0;
   bigint ntimestep = update->ntimestep;
   int me = -1;
 
 
   MPI_Comm_rank(world,&me);
   MPI_Allreduce(&stats.x_iters_per_atom,&x_iters,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&stats.v_iters_per_atom,&v_iters,1,MPI_DOUBLE,MPI_SUM,world);
 
   // Set iters back to zero:
   stats.x_iters_per_atom = stats.x_iters = 0;
   stats.v_iters_per_atom = stats.v_iters = 0;
 
 
   if( me == 0 ){
     // NOTE(review): this divides by zero if ntimestep equals stats.last_out;
     // confirm callers never print twice on the same step.
     double inv_tdiff = 1.0/( static_cast<double>(ntimestep) - stats.last_out );
     stats.last_out = ntimestep;
 
     fprintf(screen, "%s stats for time step " BIGINT_FORMAT " on %d atoms:\n",
             header, ntimestep, stats.natoms);
     fprintf(screen, "  iters/atom: x = %f, v = %f, dofs removed %d",
             x_iters * inv_tdiff, v_iters * inv_tdiff, stats.dofs_removed);
     fprintf(screen,"\n");
   }
 }
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Returns 1 when `str` is longer than two characters and starts with "v_",
 // i.e. it names a variable; returns 0 otherwise (a bare "v_" does not count).
 int FixNVEManifoldRattle::was_var( const char *str )
 {
   if( strlen(str) <= 2 ) return 0;
   return (str[0] == 'v') && (str[1] == '_');
 }
 
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Reports the hooks this fix implements: both integration half-steps always,
 // plus END_OF_STEP when statistics output is enabled (nevery > 0).
 int FixNVEManifoldRattle::setmask()
 {
   int mask = INITIAL_INTEGRATE | FINAL_INTEGRATE;
   if( nevery > 0 ){
     mask |= END_OF_STEP;
   }
   return mask;
 }
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Refreshes the variable-driven manifold parameters via update_var_params()
 // and the cached time-step factors via reset_dt().
 void FixNVEManifoldRattle::init()
 {
   // Makes sure the manifold params are set initially.
 
   update_var_params();
   reset_dt();
 }
 
 
 // Re-evaluates every manifold parameter that was given as an equal-style
 // variable and stores the new value in the manifold's parameter array.
 // When statistics are enabled (nevery > 0) the iteration counters are reset
 // first. Errors out if a variable no longer exists or is not equal-style.
 void FixNVEManifoldRattle::update_var_params()
 {
   if( nevery > 0 ){
     stats.x_iters = 0;
     stats.v_iters = 0;
     stats.natoms  = 0;
     stats.x_iters_per_atom = 0.0;
     stats.v_iters_per_atom = 0.0;
   }
 
   // get_params() yields the address of the manifold's parameter pointer, so
   // the array itself is *get_params() -- the same access pattern the
   // constructor uses when it allocates and fills the array.
   double *params = *(ptr_m->get_params());
   for( int i = 0; i < nvars; ++i ){
     if( is_var[i] ){
       tvars[i] = input->variable->find(tstrs[i]);
       if( tvars[i] < 0 ){
         error->all(FLERR,
                    "Variable name for fix nve/manifold/rattle does not exist");
       }
       if( input->variable->equalstyle(tvars[i]) ){
         tstyle[i] = EQUAL;
         double new_val = input->variable->compute_equal(tvars[i]);
         // Index the parameter array. The original wrote *(ptr_params[i]),
         // which treats the double** as an array of pointers and is only
         // valid for i == 0.
         params[i] = new_val;
       }else{
         error->all(FLERR,
                    "Variable for fix nve/manifold/rattle is invalid style");
       }
     }
   }
 }
 
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Returns the number of degrees of freedom this fix removes: one per atom in
 // the fix group, summed over all ranks. The result is also recorded in
 // stats.dofs_removed.
 int FixNVEManifoldRattle::dof(int igroup)
 {
   const int *group_mask = atom->mask;
   const int nlocal = atom->nlocal;
 
   int local_count = 0;
   for( int i = 0; i < nlocal; ++i ){
     if( group_mask[i] & groupbit ) local_count += 1;
   }
 
   int global_count;
   MPI_Allreduce( &local_count, &global_count, 1, MPI_INT, MPI_SUM, world );
 
   // Make sure that, if there is just no or one atom, no dofs are subtracted,
   // since for the first atom already 3 dofs are subtracted because of the
   // centre of mass corrections:
   const int removed = ( global_count <= 1 ) ? 0 : global_count;
   stats.dofs_removed = removed;
 
   return removed;
 }
 
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Estimates the memory used by this fix, in bytes: the statistics struct,
 // the manifold object and its pointer, the parameter array and its pointer,
 // and the per-parameter bookkeeping (one char* and three ints each).
 double FixNVEManifoldRattle::memory_usage()
 {
   const size_t stats_bytes    = sizeof(statistics);
   const size_t manifold_bytes = sizeof(*ptr_m) + sizeof(ptr_m);
   const size_t param_bytes    = nvars*sizeof(double) + sizeof(double*);
   const size_t arg_bytes      = nvars*( sizeof(char*) + 3*sizeof(int) );
 
   return 0.0 + stats_bytes + manifold_bytes + param_bytes + arg_bytes;
 }
 
 
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // First half of the step: refreshes variable-driven manifold parameters,
 // then runs the constrained position update for this fix's group.
 void FixNVEManifoldRattle::initial_integrate(int vflag)
 {
   update_var_params();
   nve_x_rattle(igroup, groupbit);
 }
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Second half of the step: runs the constrained velocity update for this
 // fix's group.
 void FixNVEManifoldRattle::final_integrate()
 {
   nve_v_rattle(igroup, groupbit);
 }
 
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Prints the iteration statistics under the "nve/manifold/rattle" header.
 void FixNVEManifoldRattle::end_of_step()
 {
   print_stats( "nve/manifold/rattle" );
 }
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Runs rattle_manifold_x() on every local atom selected by `groupbit`,
 // using the per-atom mass when atom->rmass is set and the per-type mass
 // otherwise. When statistics are enabled (nevery > 0) the number of atoms
 // handled is summed over all ranks into stats.natoms.
 void FixNVEManifoldRattle::nve_x_rattle(int igroup, int groupbit)
 {
   // update v and x of atoms in group
   double **x = atom->x;
   double **v = atom->v;
   double **f = atom->f;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   int *type = atom->type;
   int *mask = atom->mask;
 
   // Only the leading atom->nfirst atoms are visited for the "first" group.
   const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
 
   int natoms = 0;
   for (int i = 0; i < nlocal; i++){
     if (!(mask[i] & groupbit)) continue;
 
     natoms++;
     const double atom_mass = rmass ? rmass[i] : mass[type[i]];
     const double dtfm = dtf / atom_mass;
     rattle_manifold_x( x[i], v[i], f[i], dtv, dtfm, atom->tag[i] );
   }
 
   if( nevery > 0 ){
     // Count ALL atoms this fix works on:
     MPI_Allreduce(&natoms,&stats.natoms,1,MPI_INT,MPI_SUM,world);
   }
 }
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Runs rattle_manifold_v() on every local atom selected by `groupbit`,
 // using the per-atom mass when atom->rmass is set and the per-type mass
 // otherwise.
 void FixNVEManifoldRattle::nve_v_rattle(int igroup, int groupbit)
 {
   // update v of atoms in group
   double **x = atom->x;
   double **v = atom->v;
   double **f = atom->f;
   double *rmass = atom->rmass;
   double *mass = atom->mass;
   int *type = atom->type;
   int *mask = atom->mask;
 
   // Only the leading atom->nfirst atoms are visited for the "first" group.
   const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
 
   for (int i = 0; i < nlocal; i++) {
     if (!(mask[i] & groupbit)) continue;
 
     const double atom_mass = rmass ? rmass[i] : mass[type[i]];
     const double dtfm = dtf / atom_mass;
     rattle_manifold_v( v[i], f[i], x[i], dtfm, atom->tag[i] );
   }
 }
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 // Constrained position update for a single atom.
 //
 //   x, v   position and velocity, updated in place
 //   f      force on the atom (read only)
 //   dtv    time step
 //   dtfm   dtf divided by the atom's mass (as computed by the caller)
 //   tagi   atom tag, used only in the failure message
 //
 // Iterates until the residual drops below `tolerance` or `max_iter`
 // iterations have been done; aborts through error->one() if the iteration
 // limit is hit without reaching the tolerance. The iteration count is added
 // to stats.x_iters.
 void FixNVEManifoldRattle::rattle_manifold_x(double *x, double *v,
                                              double *f, double dtv,
                                              double dtfm, tagint tagi )
 {
     /*
     A RATTLE update for the position constraint.
     Original update is x += dtv * v_1/2
     Now you do
     v_1/2(lambda) = v_0 + dtfm * ( f + lambda*n_old )
     and solve
     xold - xnew + dtv * v_1/2(lambda) = 0
     g(xnew) = 0
     for x and lambda. The lambda you find then gives v_1/2 as well.
   */
   double xo[3];      // Previous position to update from.
   double vo[3];      // Previous velocity to update from.
   double l = 0;      // Lagrangian multiplier for constraint forces.
   double R[4];       // System that is 0.
   double dx[4];      // Update that follows from Newton iteration.
   double no[3];      // Normal at xo.
   double nn[3];      // Normal at x, the new position.
   double res;        // Residual.
   int iters = 0;     // Iterations used
 
   double c  = dtfm*dtv; // Used for iterating in the Newton loop:
   double no_nn, nn_R;
 
   vo[0] = v[0];
   vo[1] = v[1];
   vo[2] = v[2];
 
   xo[0] = x[0];
   xo[1] = x[1];
   xo[2] = x[2];
 
   // Constraint value and normal at the starting position; the normal at the
   // new position starts out equal to it.
   double gg = ptr_m->g_and_n(x,no);
   nn[0] = no[0];
   nn[1] = no[1];
   nn[2] = no[2];
 
   // Unconstrained half-step velocity and the scaled old normal; both stay
   // fixed during the iteration.
   double vt[3];
   vt[0] = vo[0] + dtfm*f[0];
   vt[1] = vo[1] + dtfm*f[1];
   vt[2] = vo[2] + dtfm*f[2];
   double no_dt[3];
   no_dt[0] = dtfm*no[0];
   no_dt[1] = dtfm*no[1];
   no_dt[2] = dtfm*no[2];
 
   // Assume that no_nn is roughly constant during iteration:
 
   const double c_inv = 1.0 / c;
 
 
   while ( 1 ) {
     v[0] = vt[0] - l*no_dt[0];
     v[1] = vt[1] - l*no_dt[1];
     v[2] = vt[2] - l*no_dt[2];
 
     R[0] = xo[0] - x[0] + dtv * v[0];
     R[1] = xo[1] - x[1] + dtv * v[1];
     R[2] = xo[2] - x[2] + dtv * v[2];
     R[3] = gg;
 
     // Analytic solution to system J*(dx,dy,dz,dl)^T = R
     // no_nn = no[0]*nn[0] + no[1]*nn[1] + no[2]*nn[2];
     nn_R  = nn[0]*R[0]  + nn[1]*R[1]  + nn[2]*R[2];
     no_nn = no[0]*nn[0] + no[1]*nn[1] + no[2]*nn[2];
     double n_inv = 1.0 / no_nn;
 
     // fprintf( screen, "nn_R = %f, no_nn = %f\n", nn_R, no_nn );
 
     dx[3] = -nn_R - R[3];
     dx[3] *= n_inv;
     dx[0] = -R[0] - no[0]*dx[3];
     dx[1] = -R[1] - no[1]*dx[3];
     dx[2] = -R[2] - no[2]*dx[3];
 
     dx[3] *= c_inv;
 
 
     x[0] -= dx[0];
     x[1] -= dx[1];
     x[2] -= dx[2];
     l    -= dx[3];
 
     // The residual is measured on R as computed before this update.
     res = infnorm<4>(R);
     ++iters;
 
     if( (res < tolerance) || (iters >= max_iter) ) break;
 
     // Update nn and g.
     gg = ptr_m->g(x);
     ptr_m->n(x,nn);
     // gg = ptr_m->g(x);
   }
 
   if( iters >= max_iter && res > tolerance ){
     char msg[2048];
     // tagint can be wider than int, so print it through long long; the
     // write is bounded by the buffer size.
     snprintf(msg,sizeof(msg),"Failed to constrain atom %lld (x = (%f, %f, %f)! res = %e, iters = %d\n",
              static_cast<long long>(tagi), x[0], x[1], x[2], res, iters);
     error->one(FLERR,msg);
   }
 
   // "sync" x and v:
   v[0] = vt[0] - l*no_dt[0];
   v[1] = vt[1] - l*no_dt[1];
   v[2] = vt[2] - l*no_dt[2];
 
   stats.x_iters += iters;
 }
 
 
 /* -----------------------------------------------------------------------------
    ---------------------------------------------------------------------------*/
 void FixNVEManifoldRattle::rattle_manifold_v(double *v, double *f,
                                              double *x, double dtfm,
                                              tagint tagi )
 {
   /*
     The original update was
     v[i][0] += dtfm * f[i][0];
     v[i][1] += dtfm * f[i][1];
     v[i][2] += dtfm * f[i][2];
 
     Now you add the rattle-like update:
     vold - vnew + dtfm * F + mu * n_new = 0
     dot( vnew, n_new ) = 0
   */
   double vo[3];      // V at t + 1/2 dt
   double l = 0;      // Lagrangian multiplier for constraint forces.
   double R[4];       // System that is 0.
   double dv[4];      // Update that follows from Newton iteration.
   double n[3];       // Normal.
   double res;        // Residual.
   int iters = 0;     // Iterations used
 
   double c  = dtfm; // Used for iterating in the Newton loop:
   double nn2, nn_R;
 
   vo[0] = v[0];
   vo[1] = v[1];
   vo[2] = v[2];
 
   // Initial guess is unconstrained update:
   v[0] += dtfm*f[0];
   v[1] += dtfm*f[1];
   v[2] += dtfm*f[2];
 
   ptr_m->n(x,n);
 
   double vt[3];
   vt[0] = vo[0] + dtfm*f[0];
   vt[1] = vo[1] + dtfm*f[1];
   vt[2] = vo[2] + dtfm*f[2];
   double no_dt[3];
   no_dt[0] = dtfm*n[0];
   no_dt[1] = dtfm*n[1];
   no_dt[2] = dtfm*n[2];
 
   nn2 = n[0]*n[0] + n[1]*n[1] + n[2]*n[2];
 
   const double n_inv = 1.0 / nn2;
   const double c_inv = 1.0 / c;
 
   do{
     R[0] = vt[0] - v[0]  - l * no_dt[0];
     R[1] = vt[1] - v[1]  - l * no_dt[1];
     R[2] = vt[2] - v[2]  - l * no_dt[2];
     R[3] = v[0]*n[0] + v[1]*n[1] + v[2]*n[2];
 
     // Analytic solution to system J*(dx,dy,dz,dl)^T = R
     nn_R = n[0]*R[0] + n[1]*R[1] + n[2]*R[2];
 
     dv[3] = -nn_R - R[3];
     dv[3] *= n_inv;
     dv[0] = -n[0]*dv[3] - R[0];
     dv[1] = -n[1]*dv[3] - R[1];
     dv[2] = -n[2]*dv[3] - R[2];
     dv[3] *= c_inv;
 
     v[0] -= dv[0];
     v[1] -= dv[1];
     v[2] -= dv[2];
     l    -= dv[3];
 
     res = infnorm<4>(R);
     ++iters;
   }while( (res > tolerance) && (iters < max_iter) );
 
   if( iters >= max_iter && res >= tolerance ){
           char msg[2048];
           sprintf(msg,"Failed to constrain atom %d (x = (%f, %f, %f)! res = %e, iters = %d\n",
                   tagi, x[0], x[1], x[2], res, iters);
           error->all(FLERR,msg);
   }
 
   stats.v_iters += iters;
 }
diff --git a/src/USER-MANIFOLD/fix_nvt_manifold_rattle.cpp b/src/USER-MANIFOLD/fix_nvt_manifold_rattle.cpp
index b0109d16f..38e8c6a26 100644
--- a/src/USER-MANIFOLD/fix_nvt_manifold_rattle.cpp
+++ b/src/USER-MANIFOLD/fix_nvt_manifold_rattle.cpp
@@ -1,416 +1,416 @@
 /* ----------------------------------------------------------------------
    Lammps - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
    -----------------------------------------------------------------------
 
    This file is a part of the USER-MANIFOLD package.
 
    Copyright (2013-2014) Stefan Paquay, Eindhoven University of Technology.
    License: GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 
    This file is part of the user-manifold package written by
    Stefan Paquay at the Eindhoven University of Technology.
    This module makes it possible to do MD with particles constrained
    to pretty arbitrary manifolds characterised by some constraint function
    g(x,y,z) = 0 and its normal grad(g). The number of manifolds available
    right now is limited but can be extended straightforwardly by making
    a new class that inherits from manifold and implements all pure virtual
    methods.
 
    Thanks to Remy Kusters for beta-testing!
 
 ------------------------------------------------------------------------- */
 
 
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 #include "atom.h"
 #include "force.h"
 #include "update.h"
 #include "respa.h"
 #include "error.h"
 #include "group.h"
-#include "math.h"
+#include <math.h>
 #include "input.h"
 #include "variable.h"
 #include "citeme.h"
 #include "memory.h"
 #include "comm.h"
 #include "modify.h"
 #include "compute.h"
 
 #include "fix_nvt_manifold_rattle.h"
 #include "manifold.h"
 
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 using namespace user_manifold;
 
 // NOTE(review): CONSTANT/EQUAL are not referenced in the part of this file
 // shown here -- confirm whether they are still needed.
 enum {CONSTANT,EQUAL};
 // Values stored in 'which': whether the temperature compute reports a
 // velocity bias (temperature->tempbias) that must be removed before scaling.
 enum {NOBIAS,BIAS};
 
 
 
 
 // BibTeX entry handed to lmp->citeme in the constructor.
 static const char* cite_fix_nvt_manifold_rattle =
   "fix nvt/manifold/rattle command:\n\n"
   "@article{paquay-2016,\n"
   "   author        = {Paquay, Stefan and Kusters, Remy},\n"
   "   doi           = {10.1016/j.bpj.2016.02.017},\n"
   "   issn          = {0006-3495},\n"
   "   journal       = {Biophysical Journal},\n"
   "   month         = apr,\n"
   "   number        = {6},\n"
   "   pages         = {1226--1233},\n"
   "   title         = {{A Method for Molecular Dynamics on Curved Surfaces}},\n"
   "   volume        = {110},\n"
   "   year          = {2016}\n"
   "}\n\n";
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Parse the thermostat keywords ('temp' Tstart Tstop Tperiod, 'tchain' N),
    create the "<fix-ID>_temp" temperature compute for the fix group and
    allocate the Nose-Hoover chain arrays.
 
    error_on_unknown_keyword: if non-zero, an unrecognised keyword is a
    fatal error; otherwise it is skipped one argument at a time.
 ------------------------------------------------------------------------- */
 FixNVTManifoldRattle::FixNVTManifoldRattle(LAMMPS *lmp, int narg, char **arg,
                                            int error_on_unknown_keyword )
   : FixNVEManifoldRattle(lmp,narg,arg, 0)
 {
   // NOTE(review): the base class is handed 0 as its fourth argument,
   // presumably so it tolerates the thermostat keywords parsed below -- confirm.
   if (lmp->citeme) lmp->citeme->add(cite_fix_nvt_manifold_rattle);
 
   if( narg < 6 ) error->all(FLERR,"Illegal fix nvt/manifold/rattle command");
 
   // Set all bits/settings:
   dof_flag = 1;
   dthalf = dt4 = dt8 = 0;
 
   t_start = t_stop = t_period = t_current = t_target = ke_target = 0.0;
   t_freq = drag = tdrag_factor = 0;
 
   boltz  = force->boltz;
   nktv2p = force->nktv2p;
   tdof = 0;
   mtchain = 3;
   factor_eta = 0.0;
   which = got_temp = 0;
 
   // Keyword parsing starts after the six leading arguments and the
   // manifold parameters.
   int argi = 6 + ptr_m->nparams();
   while( argi < narg )
   {
     if( strcmp( arg[argi], "temp") == 0 ){
       if( argi+3 >= narg )
         error->all(FLERR,"Keyword 'temp' needs 3 arguments");
 
       t_start  = force->numeric(FLERR, arg[argi+1]);
       t_stop   = force->numeric(FLERR, arg[argi+2]);
       t_period = force->numeric(FLERR, arg[argi+3]);
       t_target = t_start;
       got_temp = 1;
 
       argi += 4;
     }else if( strcmp( arg[argi], "tchain" ) == 0 ){
       if( argi+1 >= narg )
         error->all(FLERR,"Keyword 'tchain' needs 1 argument");
 
       mtchain = force->inumeric(FLERR, arg[argi+1]);
       argi += 2;
     }else if( error_on_unknown_keyword ){
       char msg[2048];
       sprintf(msg,"Error parsing arg \"%s\".\n", arg[argi]);
       error->all(FLERR, msg);
     }else{
       argi += 1;
     }
   }
 
   reset_dt();
 
   if( !got_temp ) error->all(FLERR,"Fix nvt/manifold/rattle needs 'temp'!");
 
   // t_period is inverted below, so zero must be rejected as well as
   // negative values (the message already promises "> 0.0").
   if( t_period <= 0.0 ){
     error->all(FLERR,"Fix nvt/manifold/rattle damping parameter must be > 0.0");
   }
 
   // The chain arrays are indexed from 0 unconditionally (eta_mass[0],
   // eta_dot[1], ...), so at least one thermostat is required.
   if( mtchain < 1 ){
     error->all(FLERR,"Fix nvt/manifold/rattle tchain must be >= 1");
   }
 
   // Create temperature compute named "<fix-ID>_temp"
   // (+6 = strlen("_temp") plus the terminating NUL):
   const char *fix_id = arg[1];
   int n = strlen(fix_id)+6;
   id_temp = new char[n];
   strcpy(id_temp,fix_id);
   strcat(id_temp,"_temp");
   char **newarg = new char*[3];
   newarg[0] = id_temp;
   newarg[1] = group->names[igroup];
   newarg[2] = (char*) "temp";
 
 
   modify->add_compute(3,newarg);
   delete [] newarg;
   int icompute = modify->find_compute(id_temp);
   if( icompute < 0 ){
     error->all(FLERR,"Temperature ID for fix nvt/manifold/rattle "
                "does not exist");
   }
   temperature = modify->compute[icompute];
   if( temperature->tempbias ) which = BIAS;
   else                        which = NOBIAS;
 
   // Set t_freq from t_period
   t_freq = 1.0 / t_period;
 
   // Init Nosé-Hoover chain. eta_dot has one extra trailing slot that stays
   // zero so that eta_dot[ich+1] is readable for the last chain element.
   eta        = new double[mtchain];
   eta_dot    = new double[mtchain+1];
   eta_dotdot = new double[mtchain];
   eta_mass   = new double[mtchain];
 
   eta_dot[mtchain] = 0.0;
   for( int ich = 0; ich < mtchain; ++ich ){
     eta[ich] = eta_dot[ich] = eta_dotdot[ich] = eta_mass[ich] = 0.0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixNVTManifoldRattle::~FixNVTManifoldRattle()
 {
   // Release the chain arrays; delete[] on a null pointer is a no-op,
   // so no explicit null checks are needed.
   delete[] eta;
   delete[] eta_dot;
   delete[] eta_dotdot;
   delete[] eta_mass;
 
   // Remove the temperature compute created in the constructor, then
   // free the buffer holding its ID.
   modify->delete_compute(id_temp);
   delete[] id_temp;
 }
 
 
 
 
 int FixNVTManifoldRattle::setmask()
 {
   // Both integration half-steps are always requested; the end-of-step
   // hook only when nevery is positive.
   int flags = INITIAL_INTEGRATE | FINAL_INTEGRATE;
   if( nevery > 0 ){
     flags |= END_OF_STEP;
   }
   return flags;
 }
 
 
 /* --------------------------------------------------------------------------
    Refresh the manifold parameters and re-resolve the temperature compute
    by its ID, updating the bias mode accordingly.
 ------------------------------------------------------------------------- */
 void FixNVTManifoldRattle::init()
 {
   // Makes sure the manifold params are set initially.
   update_var_params();
 
   // Look the compute up again by ID rather than trusting the cached pointer.
   const int idx = modify->find_compute(id_temp);
   if( idx < 0 ){
     error->all(FLERR,"Temperature ID for fix nvt/manifold/rattle "
                "does not exist");
   }
   temperature = modify->compute[idx];
   which = temperature->tempbias ? BIAS : NOBIAS;
 }
 
 
 
 // Initialise the thermostat state: target temperature, current
 // temperature/dof, chain masses and the accelerations of the chain
 // elements above the first. vflag is not used.
 void FixNVTManifoldRattle::setup(int vflag)
 {
   compute_temp_target();
 
   t_current = temperature->compute_scalar();
   tdof = temperature->dof;
 
   // Compute/set eta-masses; the first element carries the extra tdof factor.
   const double inv_freq_sq = 1.0 / (t_freq*t_freq);
   eta_mass[0] = tdof * boltz * t_target * inv_freq_sq;
 
   const double kT         = boltz * t_target;
   const double chain_mass = boltz * t_target * inv_freq_sq;
   for( int k = 1; k < mtchain; ++k ){
     eta_mass[k]   = chain_mass;
     // eta_mass[k-1] was already assigned (k == 1: above, k > 1: previous pass).
     eta_dotdot[k] = (eta_mass[k-1]*eta_dot[k-1]*eta_dot[k-1] -
                      kT ) / eta_mass[k];
   }
 }
 
 // Update t_current and tdof from the temperature compute, then set
 // t_target by linear interpolation between t_start and t_stop over the
 // run, and derive ke_target from it.
 void FixNVTManifoldRattle::compute_temp_target()
 {
   t_current = temperature->compute_scalar();
   tdof      = temperature->dof;
 
   // Fraction of the run completed; left at exactly 0 on the first step so
   // that no division is performed there.
   double frac = update->ntimestep - update->beginstep;
   if( frac != 0.0 ) frac /= update->endstep - update->beginstep;
 
   t_target  = t_start + frac * (t_stop-t_start);
   ke_target = tdof * boltz * t_target;
 }
 
 // Advance the Nose-Hoover chain by half a timestep and rescale the
 // velocities of the group by factor_eta (via nh_v_temp()).
 //
 // Sequence: refresh targets and chain masses, propagate eta_dot from the
 // top of the chain down to element 0, scale velocities, then propagate
 // eta and eta_dot back up the chain with the updated kinetic energy.
 void FixNVTManifoldRattle::nhc_temp_integrate()
 {
   // Refreshes t_current, tdof, t_target and ke_target.
   compute_temp_target();
 
   double expfac, kecurrent = tdof * boltz * t_current;
   double inv_t_freq2 = 1.0 / (t_freq*t_freq);
   eta_mass[0] = tdof * boltz * t_target * inv_t_freq2;
   for( int ich = 1; ich < mtchain; ++ich ){
     eta_mass[ich] = boltz * t_target * inv_t_freq2;
   }
 
   // Guard against a zero chain mass (e.g. zero target temperature or dof).
   if( eta_mass[0] > 0.0 ){
     eta_dotdot[0] = (kecurrent - ke_target)/eta_mass[0];
   }else{
     eta_dotdot[0] = 0;
   }
 
   // First sweep: top of the chain down to element 1. eta_dot[mtchain] is
   // the extra slot that is kept at zero.
   for( int ich = mtchain-1; ich > 0; --ich ){
     expfac = exp(-dt8*eta_dot[ich+1]);
     eta_dot[ich] *= expfac;
     eta_dot[ich] += eta_dotdot[ich] * dt4;
     eta_dot[ich] *= tdrag_factor * expfac;
 
   }
 
   expfac = exp(-dt8*eta_dot[1]);
   eta_dot[0] *= expfac;
   eta_dot[0] += eta_dotdot[0] * dt4;
   eta_dot[0] *= tdrag_factor * expfac;
 
   factor_eta = exp(-dthalf*eta_dot[0]);
 
   if( factor_eta == 0 ){
     char msg[2048];
     sprintf(msg, "WTF, factor_eta is 0! dthalf = %f, eta_dot[0] = %f",
             dthalf, eta_dot[0]);
     error->all(FLERR,msg);
   }
 
   nh_v_temp();
 
   // Velocities were scaled by factor_eta, so the temperature scales by its
   // square; avoids recomputing it.
   t_current *= factor_eta*factor_eta;
   kecurrent = tdof * boltz * t_current;
 
   if( eta_mass[0] > 0.0 ){
     eta_dotdot[0] = (kecurrent - ke_target) / eta_mass[0];
   }else{
     eta_dotdot[0] = 0.0;
   }
 
   // Advance every thermostat position, including the first one
   // (the loop previously started at 1 and left eta[0] untouched).
   for( int ich = 0; ich < mtchain; ++ich ){
     eta[ich] += dthalf*eta_dot[ich];
   }
   // expfac still holds exp(-dt8*eta_dot[1]) from the first sweep.
   eta_dot[0] *= expfac;
   eta_dot[0] += eta_dotdot[0]*dt4;
   eta_dot[0] *= expfac;
 
   // Second sweep: element 1 up to the top of the chain.
   for( int ich = 1; ich < mtchain; ++ich ){
     expfac = exp(-dt8*eta_dot[ich+1]);
     eta_dot[ich] *= expfac;
     eta_dotdot[ich] = (eta_mass[ich-1]*eta_dot[ich-1]*eta_dot[ich-1]
                        - boltz*t_target) / eta_mass[ich];
     // Additive kick, mirroring the first sweep; the previous '*=' multiplied
     // the thermostat velocity by its acceleration instead of integrating it.
     eta_dot[ich] += eta_dotdot[ich] * dt4;
     eta_dot[ich] *= expfac;
   }
 
 }
 
 // Multiply the velocity of every atom in the fix group by factor_eta.
 // In BIAS mode the temperature compute's bias is removed before scaling
 // and restored afterwards.
 void FixNVTManifoldRattle::nh_v_temp()
 {
   // Any mode other than NOBIAS/BIAS leaves the velocities untouched.
   if( which != NOBIAS && which != BIAS ) return;
   const bool with_bias = (which == BIAS);
 
   double **vel = atom->v;
   int *mask    = atom->mask;
   int natoms   = atom->nlocal;
   if( igroup == atom->firstgroup ) natoms = atom->nfirst;
 
   for( int a = 0; a < natoms; ++a ){
     if( !(mask[a] & groupbit) ) continue;
 
     if( with_bias ) temperature->remove_bias(a,vel[a]);
     vel[a][0] *= factor_eta;
     vel[a][1] *= factor_eta;
     vel[a][2] *= factor_eta;
     if( with_bias ) temperature->restore_bias(a,vel[a]);
   }
 }
 
 
 
 
 // Most of this logic is based on fix_nh:
 // First half of the timestep: refresh manifold parameters, apply the
 // thermostat half-step, then run the position update (nve_x_rattle).
 // vflag is not used.
 void FixNVTManifoldRattle::initial_integrate(int vflag)
 {
 
   update_var_params();
 
   // nhc_temp_integrate() calls compute_temp_target() again itself.
   compute_temp_target();
   nhc_temp_integrate();
 
   nve_x_rattle(igroup, groupbit);
 }
 
 // Second half of the timestep: velocity update (nve_v_rattle) first,
 // followed by the thermostat half-step -- the reverse order of
 // initial_integrate().
 void FixNVTManifoldRattle::final_integrate()
 {
   nve_v_rattle(igroup, groupbit);
 
   nhc_temp_integrate();
 }
 
 
 
 /* ---------------------------------------------------------------------- */
 void FixNVTManifoldRattle::reset_dt()
 {
   FixNVEManifoldRattle::reset_dt();
 
   dthalf = 0.5 * update->dt;
   dt4 = 0.25 * update->dt;
   dt8 = 0.125 * update->dt;
   tdrag_factor = 1.0 - (update->dt * t_freq * drag);
 
 }
 
 
 
 
 
 // Base-class usage plus the four chain arrays: eta, eta_dotdot and
 // eta_mass hold mtchain doubles each, eta_dot holds mtchain+1.
 double FixNVTManifoldRattle::memory_usage()
 {
   double total = FixNVEManifoldRattle::memory_usage();
   total += (4*mtchain+1)*sizeof(double);
   return total;
 }
diff --git a/src/compute_dipole_chunk.cpp b/src/compute_dipole_chunk.cpp
index 4bdf23e27..74d66e7c1 100644
--- a/src/compute_dipole_chunk.cpp
+++ b/src/compute_dipole_chunk.cpp
@@ -1,294 +1,294 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#include "string.h"
+#include <string.h>
 #include "compute_dipole_chunk.h"
 #include "atom.h"
 #include "update.h"
 #include "modify.h"
 #include "compute_chunk_atom.h"
 #include "domain.h"
 #include "memory.h"
 #include "error.h"
 #include "math_special.h"
 
 using namespace LAMMPS_NS;
 using namespace MathSpecial;
 
 enum { MASSCENTER, GEOMCENTER };
 
 /* ---------------------------------------------------------------------- */
 
 ComputeDipoleChunk::ComputeDipoleChunk(LAMMPS *lmp, int narg, char **arg) :
   Compute(lmp, narg, arg),
   idchunk(NULL), massproc(NULL), masstotal(NULL), chrgproc(NULL),
   chrgtotal(NULL), com(NULL), comall(NULL), dipole(NULL), dipoleall(NULL)
 {
   // exactly 4 or 5 arguments are accepted
 
   if (narg < 4 || narg > 5)
     error->all(FLERR,"Illegal compute dipole/chunk command");
 
   // output is a global array with 4 columns and a variable number of rows
 
   array_flag = 1;
   size_array_cols = 4;
   size_array_rows = 0;
   size_array_rows_variable = 1;
   extarray = 0;
 
   // ID of compute chunk/atom
 
   const int len = strlen(arg[3]) + 1;
   idchunk = new char[len];
   strcpy(idchunk,arg[3]);
 
   // optional 5th argument selects the reference point
   // any word starting with "geom" selects the geometric center
 
   usecenter = MASSCENTER;
 
   if (narg == 5) {
     const char *center = arg[4];
     if (strncmp(center,"geom",4) == 0) {
       usecenter = GEOMCENTER;
     } else if (strcmp(center,"mass") == 0) {
       usecenter = MASSCENTER;
     } else {
       error->all(FLERR,"Illegal compute dipole/chunk command");
     }
   }
 
   init();
 
   // chunk-based data
 
   nchunk = 1;
   maxchunk = 0;
   allocate();
 }
 
 /* ---------------------------------------------------------------------- */
 
 ComputeDipoleChunk::~ComputeDipoleChunk()
 {
   // ID string allocated with new[] in the constructor
 
   delete [] idchunk;
 
   // per-chunk scalars
 
   memory->destroy(massproc);
   memory->destroy(masstotal);
   memory->destroy(chrgproc);
   memory->destroy(chrgtotal);
 
   // per-chunk vectors
 
   memory->destroy(com);
   memory->destroy(comall);
   memory->destroy(dipole);
   memory->destroy(dipoleall);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::init()
 {
   // resolve the chunk/atom compute by ID and check its style
 
   const int which = modify->find_compute(idchunk);
   if (which < 0) {
     error->all(FLERR,"Chunk/atom compute does not exist for "
                "compute dipole/chunk");
   }
 
   cchunk = (ComputeChunkAtom *) modify->compute[which];
 
   const bool is_chunk_atom = (strcmp(cchunk->style,"chunk/atom") == 0);
   if (!is_chunk_atom)
     error->all(FLERR,"Compute dipole/chunk does not use chunk/atom compute");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::compute_array()
 {
   double unwrap[3];
 
   invoked_array = update->ntimestep;
 
   // compute chunk/atom assigns atoms to chunk IDs
   // extract ichunk index vector from compute
   // ichunk = 1 to Nchunk for included atoms, 0 for excluded atoms
 
   nchunk = cchunk->setup_chunks();
   cchunk->compute_ichunk();
   int *ichunk = cchunk->ichunk;
 
   if (nchunk > maxchunk) allocate();
   size_array_rows = nchunk;
 
   // zero local per-chunk values
 
   for (int m = 0; m < nchunk; m++) {
     massproc[m] = 0.0;
     chrgproc[m] = 0.0;
     for (int d = 0; d < 3; d++) com[m][d] = 0.0;
     for (int d = 0; d < 4; d++) dipole[m][d] = 0.0;
   }
 
   // per-atom data
 
   double **x = atom->x;
   int *mask = atom->mask;
   int *type = atom->type;
   imageint *image = atom->image;
   double *mass = atom->mass;
   double *rmass = atom->rmass;
   double *q = atom->q;
   double **mu = atom->mu;
 
   const int nlocal = atom->nlocal;
 
   // accumulate weight, charge and weighted position for each chunk
   // weight = atom mass for MASSCENTER, 1.0 for GEOMCENTER
 
   for (int a = 0; a < nlocal; a++) {
     if (!(mask[a] & groupbit)) continue;
     const int m = ichunk[a]-1;
     if (m < 0) continue;
 
     double weight = 1.0;
     if (usecenter == MASSCENTER)
       weight = rmass ? rmass[a] : mass[type[a]];
 
     domain->unmap(x[a],image[a],unwrap);
     massproc[m] += weight;
     if (atom->q_flag) chrgproc[m] += q[a];
     com[m][0] += unwrap[0] * weight;
     com[m][1] += unwrap[1] * weight;
     com[m][2] += unwrap[2] * weight;
   }
 
   MPI_Allreduce(massproc,masstotal,nchunk,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(chrgproc,chrgtotal,nchunk,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&com[0][0],&comall[0][0],3*nchunk,MPI_DOUBLE,MPI_SUM,world);
 
   // normalize to get the center; chunks with no weight are left as summed
 
   for (int m = 0; m < nchunk; m++) {
     if (!(masstotal[m] > 0.0)) continue;
     comall[m][0] /= masstotal[m];
     comall[m][1] /= masstotal[m];
     comall[m][2] /= masstotal[m];
   }
 
   // compute dipole for each chunk
   // point charges contribute q*r, point dipoles contribute mu
 
   for (int a = 0; a < nlocal; a++) {
     if (!(mask[a] & groupbit)) continue;
     const int m = ichunk[a]-1;
     if (m < 0) continue;
 
     domain->unmap(x[a],image[a],unwrap);
     if (atom->q_flag) {
       dipole[m][0] += q[a]*unwrap[0];
       dipole[m][1] += q[a]*unwrap[1];
       dipole[m][2] += q[a]*unwrap[2];
     }
     if (atom->mu_flag) {
       dipole[m][0] += mu[a][0];
       dipole[m][1] += mu[a][1];
       dipole[m][2] += mu[a][2];
     }
   }
 
   MPI_Allreduce(&dipole[0][0],&dipoleall[0][0],4*nchunk,
                 MPI_DOUBLE,MPI_SUM,world);
 
   for (int m = 0; m < nchunk; m++) {
     double *d = dipoleall[m];
     // correct for position dependence with charged chunks
     d[0] -= chrgtotal[m]*comall[m][0];
     d[1] -= chrgtotal[m]*comall[m][1];
     d[2] -= chrgtotal[m]*comall[m][2];
     // compute total dipole moment
     d[3] = sqrt(square(d[0])
                 + square(d[1])
                 + square(d[2]));
   }
 }
 
 /* ----------------------------------------------------------------------
    lock methods: called by fix ave/time
    these methods ensure vector/array size is locked for Nfreq epoch
      by passing lock info along to compute chunk/atom
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    increment lock counter
 ------------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::lock_enable()
 {
   // one more holder of the lock on compute chunk/atom
   ++cchunk->lockcount;
 }
 
 /* ----------------------------------------------------------------------
    decrement lock counter in compute chunk/atom, if it still exists
 ------------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::lock_disable()
 {
   // look the compute up again by ID; nothing to do if it is not found
   const int which = modify->find_compute(idchunk);
   if (which < 0) return;
 
   cchunk = (ComputeChunkAtom *) modify->compute[which];
   --cchunk->lockcount;
 }
 
 /* ----------------------------------------------------------------------
    calculate and return # of chunks = length of vector/array
 ------------------------------------------------------------------------- */
 
 int ComputeDipoleChunk::lock_length()
 {
   // refresh the cached chunk count and report it
   return nchunk = cchunk->setup_chunks();
 }
 
 /* ----------------------------------------------------------------------
    set the lock from startstep to stopstep
 ------------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::lock(Fix *fixptr, bigint startstep, bigint stopstep)
 {
   // forwarded unchanged to compute chunk/atom
   cchunk->lock(fixptr,startstep,stopstep);
 }
 
 /* ----------------------------------------------------------------------
    unset the lock
 ------------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::unlock(Fix *fixptr)
 {
   // forwarded unchanged to compute chunk/atom
   cchunk->unlock(fixptr);
 }
 
 /* ----------------------------------------------------------------------
    free and reallocate per-chunk arrays
 ------------------------------------------------------------------------- */
 
 void ComputeDipoleChunk::allocate()
 {
   // release the old arrays

   memory->destroy(massproc);
   memory->destroy(masstotal);
   memory->destroy(chrgproc);
   memory->destroy(chrgtotal);
   memory->destroy(com);
   memory->destroy(comall);
   memory->destroy(dipole);
   memory->destroy(dipoleall);
 
   // size everything for the current number of chunks
 
   maxchunk = nchunk;
 
   // per-chunk scalars: local and summed weight and charge
 
   memory->create(massproc,maxchunk,"dipole/chunk:massproc");
   memory->create(masstotal,maxchunk,"dipole/chunk:masstotal");
   memory->create(chrgproc,maxchunk,"dipole/chunk:chrgproc");
   memory->create(chrgtotal,maxchunk,"dipole/chunk:chrgtotal");
 
   // per-chunk vectors: center (3 columns) and dipole (x,y,z,magnitude)
 
   memory->create(com,maxchunk,3,"dipole/chunk:com");
   memory->create(comall,maxchunk,3,"dipole/chunk:comall");
   memory->create(dipole,maxchunk,4,"dipole/chunk:dipole");
   memory->create(dipoleall,maxchunk,4,"dipole/chunk:dipoleall");
 
   // the summed dipole array is what the compute exposes
 
   array = dipoleall;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local data
 ------------------------------------------------------------------------- */
 
 double ComputeDipoleChunk::memory_usage()
 {
   // four 1d arrays: massproc, masstotal, chrgproc, chrgtotal
   // (previously only two of them were counted)
   double bytes = (bigint) maxchunk * 4 * sizeof(double);
   // two maxchunk x 3 arrays: com, comall
   bytes += (bigint) maxchunk * 2*3 * sizeof(double);
   // two maxchunk x 4 arrays: dipole, dipoleall
   bytes += (bigint) maxchunk * 2*4 * sizeof(double);
   return bytes;
 }