diff --git a/src/USER-OMP/fix_omp.cpp b/src/USER-OMP/fix_omp.cpp
index e2aaab594..bac001795 100644
--- a/src/USER-OMP/fix_omp.cpp
+++ b/src/USER-OMP/fix_omp.cpp
@@ -1,347 +1,342 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    OpenMP based threading support for LAMMPS
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #include "atom.h"
 #include "comm.h"
 #include "error.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_request.h"
 #include "universe.h"
 #include "update.h"
 #include "integrate.h"
 #include "min.h"
 
 #include "fix_omp.h"
 #include "thr_data.h"
 #include "thr_omp.h"
 
 #include "pair_hybrid.h"
 #include "bond_hybrid.h"
 #include "angle_hybrid.h"
 #include "dihedral_hybrid.h"
 #include "improper_hybrid.h"
 #include "kspace.h"
 
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 
 #include "suffix.h"
 
 #if defined(LMP_USER_CUDA)
 #include "cuda_modify_flags.h"
 #endif
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 #if defined(LMP_USER_CUDA)
 using namespace FixConstCuda;
 #endif
 
 static int get_tid()
 {
   int tid = 0;
 #if defined(_OPENMP)
   tid = omp_get_thread_num();
 #endif
   return tid;
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) 
   :  Fix(lmp, narg, arg),
      thr(NULL), last_omp_style(NULL), last_pair_hybrid(NULL),
-     _nthr(-1), _neighbor(true), _mixed(false)
+     _nthr(-1), _neighbor(true), _mixed(false), _reduced(true)
 {
   if ((narg < 4) || (narg > 7)) error->all(FLERR,"Illegal package omp command");
   if (strcmp(arg[1],"all") != 0) error->all(FLERR,"fix OMP has to operate on group 'all'");
 
   int nthreads = 1;
   if (narg > 3) {
 #if defined(_OPENMP)
     if (strcmp(arg[3],"*") == 0)
 #pragma omp parallel default(none) shared(nthreads)
       nthreads = omp_get_num_threads();
     else
       nthreads = atoi(arg[3]);
 #endif
   }
 
   if (nthreads < 1)
     error->all(FLERR,"Illegal number of OpenMP threads requested");
 
+  int reset_thr = 0;
   if (nthreads != comm->nthreads) {
 #if defined(_OPENMP)
+    reset_thr = 1;
     omp_set_num_threads(nthreads);
 #endif
     comm->nthreads = nthreads;
   }
 
   int iarg = 4;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"force/neigh") == 0)
       _neighbor = true;
     else if (strcmp(arg[iarg],"force") == 0)
       _neighbor = false;
     else if (strcmp(arg[iarg],"mixed") == 0)
       _mixed = true;
     else if (strcmp(arg[iarg],"double") == 0)
       _mixed = false;
     else
       error->all(FLERR,"Illegal package omp mode requested");
   }
 
   // print summary of settings
   if (comm->me == 0) {
-    const char * const nmode = _neighbor ? "OpenMP capable" : "serial";
+    const char * const nmode = _neighbor ? "multi-threaded" : "serial";
     const char * const kmode = _mixed ? "mixed" : "double";
 
     if (screen) {
-      fprintf(screen,"  reset %d OpenMP thread(s) per MPI task\n", nthreads);
-      fprintf(screen,"  using %s neighbor list subroutines\n", nmode);
-      if (_mixed)
-        fputs("  using mixed precision OpenMP force kernels where available\n", screen);
-      else
-        fputs("  using double precision OpenMP force kernels\n", screen);
+      if (reset_thr)
+	fprintf(screen,"set %d OpenMP thread(s) per MPI task\n", nthreads);
+      fprintf(screen,"using %s neighbor list subroutines\n", nmode);
+      fprintf(screen,"prefer %s precision OpenMP force kernels\n", kmode);
     }
     
     if (logfile) {
-      fprintf(logfile,"  reset %d OpenMP thread(s) per MPI task\n", nthreads);
-      fprintf(logfile,"  using %s neighbor list subroutines\n", nmode);
-      if (_mixed)
-        fputs("  using mixed precision OpenMP force kernels where available\n", logfile);
-      else
-        fputs("  using double precision OpenMP force kernels\n", logfile);
+      if (reset_thr)
+	fprintf(logfile,"set %d OpenMP thread(s) per MPI task\n", nthreads);
+      fprintf(logfile,"using %s neighbor list subroutines\n", nmode);
+      fprintf(logfile,"prefer %s precision OpenMP force kernels\n", kmode);
     }
   }
 
   // allocate list for per thread accumulator manager class instances
   // and then have each thread create an instance of this class to
   // encourage the OS to use storage that is "close" to each thread's CPU.
   thr = new ThrData *[nthreads];
   _nthr = nthreads;
-  _clearforce = true;
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     const int tid = get_tid();
     thr[tid] = new ThrData(tid);
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixOMP::~FixOMP()
 {
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
     const int tid = get_tid();
     delete thr[tid];
   }
   delete[] thr;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixOMP::setmask()
 {
   // compatibility with USER-CUDA
   // our fix doesn't need any data transfer.
 #if defined(LMP_USER_CUDA)
   if (lmp->cuda) {
     int mask = 0;
     mask |= PRE_FORCE_CUDA;
     mask |= PRE_FORCE_RESPA;
     mask |= MIN_PRE_FORCE;
     return mask;
   }
 #endif
 
   int mask = 0;
   mask |= PRE_FORCE;
   mask |= PRE_FORCE_RESPA;
   mask |= MIN_PRE_FORCE;
   return mask;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixOMP::init()
 {
-  if (strstr(update->integrate_style,"respa") != NULL)
-    error->all(FLERR,"Cannot use r-RESPA with /omp styles");
+  if ((strstr(update->integrate_style,"respa") != NULL)
+      && (strstr(update->integrate_style,"respa/omp") == NULL))
+    error->all(FLERR,"Need to use respa/omp for r-RESPA with /omp styles");
 
   int check_hybrid, kspace_split;
   last_pair_hybrid = NULL;
   last_omp_style = NULL;
   const char *last_omp_name = NULL;
   const char *last_hybrid_name = NULL;
   const char *last_force_name = NULL;
 
   // support for verlet/split operation.
   // kspace_split == 0 : regular processing
   // kspace_split < 0  : master partition, does not do kspace
   // kspace_split > 0  : slave partition, only does kspace
   if (strstr(update->integrate_style,"verlet/split") != NULL) {
     if (universe->iworld == 0) kspace_split = -1;
     else kspace_split = 1;
   } else {
     kspace_split = 0;
   }
 
 // determine which is the last force style with OpenMP
 // support as this is the one that has to reduce the forces
 
 #define CheckStyleForOMP(name)						\
   check_hybrid = 0;							\
   if (force->name) {							\
     if ( (strcmp(force->name ## _style,"hybrid") == 0) ||		\
          (strcmp(force->name ## _style,"hybrid/overlay") == 0) )	\
       check_hybrid=1;							\
     if (force->name->suffix_flag & Suffix::OMP) {			\
       last_force_name = (const char *) #name;				\
       last_omp_name = force->name ## _style;				\
       last_omp_style = (void *) force->name;				\
     }									\
   }
 
 #define CheckHybridForOMP(name,Class) \
   if (check_hybrid) {					      \
     Class ## Hybrid *style = (Class ## Hybrid *) force->name; \
     for (int i=0; i < style->nstyles; i++) {		      \
       if (style->styles[i]->suffix_flag & Suffix::OMP) {      \
         last_force_name = (const char *) #name;		      \
         last_omp_name = style->keywords[i];		      \
         last_omp_style = style->styles[i];		      \
       }							      \
     }							      \
   }
 
   if (kspace_split <= 0) {
     CheckStyleForOMP(pair);
     CheckHybridForOMP(pair,Pair);
     if (check_hybrid) {
       last_pair_hybrid = last_omp_style;
       last_hybrid_name = last_omp_name;
     }
 
     CheckStyleForOMP(bond);
     CheckHybridForOMP(bond,Bond);
 
     CheckStyleForOMP(angle);
     CheckHybridForOMP(angle,Angle);
 
     CheckStyleForOMP(dihedral);
     CheckHybridForOMP(dihedral,Dihedral);
 
     CheckStyleForOMP(improper);
     CheckHybridForOMP(improper,Improper);
   }
   
   if (kspace_split >= 0) {
     CheckStyleForOMP(kspace);
   }
 
 #undef CheckStyleForOMP
 #undef CheckHybridForOMP
   set_neighbor_omp();
 
   // diagnostic output
   if (comm->me == 0) {
     if (last_omp_style) {
       if (last_pair_hybrid) {
         if (screen)
           fprintf(screen,"Hybrid pair style last /omp style %s\n", last_hybrid_name);
         if (logfile)
           fprintf(logfile,"Hybrid pair style last /omp style %s\n", last_hybrid_name);
       }
       if (screen)
         fprintf(screen,"Last active /omp style is %s_style %s\n",
                 last_force_name, last_omp_name);
       if (logfile)
         fprintf(logfile,"Last active /omp style is %s_style %s\n",
                 last_force_name, last_omp_name);
     } else {
-      _clearforce = false;
       if (screen)
         fprintf(screen,"No /omp style for force computation currently active\n");
       if (logfile)
         fprintf(logfile,"No /omp style for force computation currently active\n");
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixOMP::set_neighbor_omp()
 {
   // select or deselect multi-threaded neighbor
   // list build depending on setting in package omp.
   // NOTE: since we are at the top of the list of
   // fixes, we cannot adjust neighbor lists from
   // other fixes. those have to be re-implemented
   // as /omp fix styles. :-(
 
   const int neigh_omp = _neighbor ? 1 : 0;
   const int nrequest = neighbor->nrequest;
 
   for (int i = 0; i < nrequest; ++i)
     neighbor->requests[i]->omp = neigh_omp;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // adjust size and clear out per thread accumulator arrays
 void FixOMP::pre_force(int)
 {
   const int nall = atom->nlocal + atom->nghost;
 
   double **f = atom->f;
   double **torque = atom->torque;
   double *erforce = atom->erforce;
   double *de = atom->de;
   double *drho = atom->drho;
 
-  if (_clearforce) {
 #if defined(_OPENMP)
 #pragma omp parallel default(none) shared(f,torque,erforce,de,drho)
 #endif
-    {
-      const int tid = get_tid();
-      thr[tid]->check_tid(tid);
-      thr[tid]->init_force(nall,f,torque,erforce,de,drho);
-    }
-  } else {
-    thr[0]->init_force(nall,f,torque,erforce,de,drho);
-  }
+  {
+    const int tid = get_tid();
+    thr[tid]->check_tid(tid);
+    thr[tid]->init_force(nall,f,torque,erforce,de,drho);
+  } // end of omp parallel region
+
+  _reduced = false;
 }
 
 /* ---------------------------------------------------------------------- */
 
 double FixOMP::memory_usage()
 {
   double bytes = comm->nthreads * (sizeof(ThrData *) + sizeof(ThrData));
   bytes += comm->nthreads * thr[0]->memory_usage();
 
   return bytes;
 }
diff --git a/src/USER-OMP/fix_omp.h b/src/USER-OMP/fix_omp.h
index e65184998..656ab752a 100644
--- a/src/USER-OMP/fix_omp.h
+++ b/src/USER-OMP/fix_omp.h
@@ -1,74 +1,78 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #ifdef FIX_CLASS
 
 FixStyle(OMP,FixOMP)
 
 #else
 
 #ifndef LMP_FIX_OMP_H
 #define LMP_FIX_OMP_H
 
 #include "fix.h"
 
 
 namespace LAMMPS_NS {
 
 class ThrData;
 
 class FixOMP : public Fix {
   friend class ThrOMP;
+  friend class RespaOMP;
 
  public:
   FixOMP(class LAMMPS *, int, char **);
   virtual ~FixOMP();
   virtual int setmask();
   virtual void init();
   virtual void pre_force(int);
 
   virtual void setup_pre_force(int vflag)           { pre_force(vflag); }
   virtual void min_setup_pre_force(int vflag)       { pre_force(vflag); }
   virtual void min_pre_force(int vflag)             { pre_force(vflag); }
   virtual void setup_pre_force_respa(int vflag,int) { pre_force(vflag); }
   virtual void pre_force_respa(int vflag,int,int)   { pre_force(vflag); }
 
   virtual double memory_usage();
 
  protected:
   ThrData **thr;
   void *last_omp_style;    // pointer to the style that needs
                            // to do the general force reduction
   void *last_pair_hybrid;  // pointer to the pair style that needs
                            // to call virial_fdot_compute()
+  // signal that an /omp style did the force reduction. needed by respa/omp
+  void did_reduce() { _reduced = true;  }
 
  public:
   ThrData *get_thr(int tid) { return thr[tid];  }
   int get_nthr()      const { return _nthr;     }
 
   bool get_neighbor() const { return _neighbor; }
   bool get_mixed()    const { return _mixed;    }
+  bool get_reduced()  const { return _reduced;  }
 
  private:
-  int  _nthr;     // number of currently active ThrData object
-  bool _neighbor; // en/disable threads for neighbor list construction
-  bool _clearforce; // whether to clear per thread data objects
-  bool _mixed;     // whether to use a mixed precision compute kernel
+  int  _nthr;       // number of currently active ThrData objects
+  bool _neighbor;   // en/disable threads for neighbor list construction
+  bool _mixed;      // whether to prefer mixed precision compute kernels
+  bool _reduced;    // whether forces have been reduced for this step
 
   void set_neighbor_omp();
 };
 
 }
 
 #endif
 #endif
diff --git a/src/USER-OMP/thr_omp.cpp b/src/USER-OMP/thr_omp.cpp
index 5bfc49638..4aea63055 100644
--- a/src/USER-OMP/thr_omp.cpp
+++ b/src/USER-OMP/thr_omp.cpp
@@ -1,1170 +1,1177 @@
 /* -------------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    OpenMP based threading support for LAMMPS
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #include "atom.h"
 #include "comm.h"
 #include "error.h"
 #include "force.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 
 #include "thr_omp.h"
 
 #include "pair.h"
 #include "bond.h"
 #include "angle.h"
 #include "dihedral.h"
 #include "improper.h"
 #include "kspace.h"
 
 #include "math_const.h"
 
 #include <string.h>
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 /* ---------------------------------------------------------------------- */
 
 ThrOMP::ThrOMP(LAMMPS *ptr, int style)
   : lmp(ptr), fix(NULL), thr_style(style), thr_error(0)
 {
   // register fix omp with this class
   int ifix = lmp->modify->find_fix("package_omp");
   if (ifix < 0)
     lmp->error->all(FLERR,"The 'package omp' command is required for /omp styles");
   fix = static_cast<FixOMP *>(lmp->modify->fix[ifix]);
 }
 
 /* ---------------------------------------------------------------------- */
 
 ThrOMP::~ThrOMP()
 {
   // nothing to do?
 }
 
 /* ----------------------------------------------------------------------
    Hook up per thread per atom arrays into the tally infrastructure
    ---------------------------------------------------------------------- */
 
 void ThrOMP::ev_setup_thr(int eflag, int vflag, int nall, double *eatom,
                           double **vatom, ThrData *thr)
 {
   const int tid = thr->get_tid();
   if (tid == 0) thr_error = 0;
 
   if (thr_style & THR_PAIR) {
     if (eflag & 2) {
       thr->eatom_pair = eatom + tid*nall;
       if (nall > 0)
         memset(&(thr->eatom_pair[0]),0,nall*sizeof(double));
     }
     if (vflag & 4) {
       thr->vatom_pair = vatom + tid*nall;
       if (nall > 0)
         memset(&(thr->vatom_pair[0][0]),0,nall*6*sizeof(double));
     }
   }
 
   if (thr_style & THR_BOND) {
     if (eflag & 2) {
       thr->eatom_bond = eatom + tid*nall;
       if (nall > 0)
         memset(&(thr->eatom_bond[0]),0,nall*sizeof(double));
     }
     if (vflag & 4) {
       thr->vatom_bond = vatom + tid*nall;
       if (nall > 0)
         memset(&(thr->vatom_bond[0][0]),0,nall*6*sizeof(double));
     }
   }
 
   if (thr_style & THR_ANGLE) {
     if (eflag & 2) {
       thr->eatom_angle = eatom + tid*nall;
       if (nall > 0)
         memset(&(thr->eatom_angle[0]),0,nall*sizeof(double));
     }
     if (vflag & 4) {
       thr->vatom_angle = vatom + tid*nall;
       if (nall > 0)
         memset(&(thr->vatom_angle[0][0]),0,nall*6*sizeof(double));
     }
   }
 
   if (thr_style & THR_DIHEDRAL) {
     if (eflag & 2) {
       thr->eatom_dihed = eatom + tid*nall;
       if (nall > 0)
         memset(&(thr->eatom_dihed[0]),0,nall*sizeof(double));
     }
     if (vflag & 4) {
       thr->vatom_dihed = vatom + tid*nall;
       if (nall > 0)
         memset(&(thr->vatom_dihed[0][0]),0,nall*6*sizeof(double));
     }
   }
 
   if (thr_style & THR_IMPROPER) {
     if (eflag & 2) {
       thr->eatom_imprp = eatom + tid*nall;
       if (nall > 0)
         memset(&(thr->eatom_imprp[0]),0,nall*sizeof(double));
     }
     if (vflag & 4) {
       thr->vatom_imprp = vatom + tid*nall;
       if (nall > 0)
         memset(&(thr->vatom_imprp[0][0]),0,nall*6*sizeof(double));
     }
   }
 
   // nothing to do for THR_KSPACE
 }
 
 /* ----------------------------------------------------------------------
    Reduce per thread data into the regular structures
    Reduction of global properties is serialized with a "critical"
    directive, so that only one thread at a time will access the
    global variables. Since we are not synchronized, this should
    come with little overhead. The reduction of per-atom properties
    in contrast is parallelized over threads in the same way as forces.
    ---------------------------------------------------------------------- */
 
 void ThrOMP::reduce_thr(void *style, const int eflag, const int vflag,
                         ThrData *const thr)
 {
   const int nlocal = lmp->atom->nlocal;
   const int nghost = lmp->atom->nghost;
   const int nall = nlocal + nghost;
   const int nfirst = lmp->atom->nfirst;
   const int nthreads = lmp->comm->nthreads;
   const int evflag = eflag | vflag;
 
   const int tid = thr->get_tid();
   double **f = lmp->atom->f;
   double **x = lmp->atom->x;
 
   int need_force_reduce = 1;
 
   if (evflag)
     sync_threads();
 
   switch (thr_style) {
 
   case THR_PAIR: {
     Pair * const pair = lmp->force->pair;
 
     if (pair->vflag_fdotr) {
 
       // this is a non-hybrid pair style. compute per thread fdotr
       if (fix->last_pair_hybrid == NULL) {
         if (lmp->neighbor->includegroup == 0)
           thr->virial_fdotr_compute(x, nlocal, nghost, -1);
         else
           thr->virial_fdotr_compute(x, nlocal, nghost, nfirst);
       } else {
         if (style == fix->last_pair_hybrid) {
           // pair_style hybrid will compute fdotr for us
           // but we first need to reduce the forces
           data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+	  fix->did_reduce();
           need_force_reduce = 0;
         }
       }
     }
 
     if (evflag) {
 #if defined(_OPENMP)
 #pragma omp critical
 #endif
       {
         if (eflag & 1) {
           pair->eng_vdwl += thr->eng_vdwl;
           pair->eng_coul += thr->eng_coul;
           thr->eng_vdwl = 0.0;
           thr->eng_coul = 0.0;
         }
         if (vflag & 3)
           for (int i=0; i < 6; ++i) {
             pair->virial[i] += thr->virial_pair[i];
             thr->virial_pair[i] = 0.0;
           }
       }
 
       if (eflag & 2) {
         data_reduce_thr(&(pair->eatom[0]), nall, nthreads, 1, tid);
       }
       if (vflag & 4) {
         data_reduce_thr(&(pair->vatom[0][0]), nall, nthreads, 6, tid);
       }
     }
   }
     break;
 
   case THR_BOND:
 
     if (evflag) {
       Bond * const bond = lmp->force->bond;
 #if defined(_OPENMP)
 #pragma omp critical
 #endif
       {
         if (eflag & 1) {
           bond->energy += thr->eng_bond;
           thr->eng_bond = 0.0;
         }
 
         if (vflag & 3) {
           for (int i=0; i < 6; ++i) {
             bond->virial[i] += thr->virial_bond[i];
             thr->virial_bond[i] = 0.0;
           }
         }
       }
 
       if (eflag & 2) {
         data_reduce_thr(&(bond->eatom[0]), nall, nthreads, 1, tid);
       }
       if (vflag & 4) {
         data_reduce_thr(&(bond->vatom[0][0]), nall, nthreads, 6, tid);
       }
 
     }
     break;
 
   case THR_ANGLE:
 
     if (evflag) {
       Angle * const angle = lmp->force->angle;
 #if defined(_OPENMP)
 #pragma omp critical
 #endif
       {
         if (eflag & 1) {
           angle->energy += thr->eng_angle;
           thr->eng_angle = 0.0;
         }
 
         if (vflag & 3) {
           for (int i=0; i < 6; ++i) {
             angle->virial[i] += thr->virial_angle[i];
             thr->virial_angle[i] = 0.0;
           }
         }
       }
 
       if (eflag & 2) {
         data_reduce_thr(&(angle->eatom[0]), nall, nthreads, 1, tid);
       }
       if (vflag & 4) {
         data_reduce_thr(&(angle->vatom[0][0]), nall, nthreads, 6, tid);
       }
 
     }
     break;
 
   case THR_DIHEDRAL:
 
     if (evflag) {
       Dihedral * const dihedral = lmp->force->dihedral;
 #if defined(_OPENMP)
 #pragma omp critical
 #endif
       {
         if (eflag & 1) {
           dihedral->energy += thr->eng_dihed;
           thr->eng_dihed = 0.0;
         }
 
         if (vflag & 3) {
           for (int i=0; i < 6; ++i) {
             dihedral->virial[i] += thr->virial_dihed[i];
             thr->virial_dihed[i] = 0.0;
           }
         }
       }
 
       if (eflag & 2) {
         data_reduce_thr(&(dihedral->eatom[0]), nall, nthreads, 1, tid);
       }
       if (vflag & 4) {
         data_reduce_thr(&(dihedral->vatom[0][0]), nall, nthreads, 6, tid);
       }
 
     }
     break;
 
   case THR_DIHEDRAL|THR_CHARMM: // special case for CHARMM dihedrals
 
     if (evflag) {
       Dihedral * const dihedral = lmp->force->dihedral;
       Pair * const pair = lmp->force->pair;
 #if defined(_OPENMP)
 #pragma omp critical
 #endif
       {
         if (eflag & 1) {
           dihedral->energy += thr->eng_dihed;
           pair->eng_vdwl += thr->eng_vdwl;
           pair->eng_coul += thr->eng_coul;
           thr->eng_dihed = 0.0;
           thr->eng_vdwl = 0.0;
           thr->eng_coul = 0.0;
         }
 
         if (vflag & 3) {
           for (int i=0; i < 6; ++i) {
             dihedral->virial[i] += thr->virial_dihed[i];
             pair->virial[i] += thr->virial_pair[i];
             thr->virial_dihed[i] = 0.0;
             thr->virial_pair[i] = 0.0;
           }
         }
       }
 
       if (eflag & 2) {
         data_reduce_thr(&(dihedral->eatom[0]), nall, nthreads, 1, tid);
         data_reduce_thr(&(pair->eatom[0]), nall, nthreads, 1, tid);
       }
       if (vflag & 4) {
         data_reduce_thr(&(dihedral->vatom[0][0]), nall, nthreads, 6, tid);
         data_reduce_thr(&(pair->vatom[0][0]), nall, nthreads, 6, tid);
       }
     }
     break;
 
   case THR_IMPROPER:
 
     if (evflag) {
       Improper *improper = lmp->force->improper;
 #if defined(_OPENMP)
 #pragma omp critical
 #endif
       {
         if (eflag & 1) {
           improper->energy += thr->eng_imprp;
           thr->eng_imprp = 0.0;
         }
 
         if (vflag & 3) {
           for (int i=0; i < 6; ++i) {
             improper->virial[i] += thr->virial_imprp[i];
             thr->virial_imprp[i] = 0.0;
           }
         }
       }
 
       if (eflag & 2) {
         data_reduce_thr(&(improper->eatom[0]), nall, nthreads, 1, tid);
       }
       if (vflag & 4) {
         data_reduce_thr(&(improper->vatom[0][0]), nall, nthreads, 6, tid);
       }
 
     }
     break;
 
   case THR_KSPACE:
-    // nothing to do. XXX need to add support for per-atom info
+    // nothing to do. XXX may need to add support for per-atom info
+    break;
+
+  case THR_INTGR:
+    // nothing to do
     break;
 
   default:
     printf("tid:%d unhandled thr_style case %d\n", tid, thr_style);
     break;
   }
 
   if (style == fix->last_omp_style) {
-    if (need_force_reduce)
+    if (need_force_reduce) {
       data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+      fix->did_reduce();
+    }
 
     if (lmp->atom->torque)
       data_reduce_thr(&(lmp->atom->torque[0][0]), nall, nthreads, 3, tid);
   }
 }
 
 /* ----------------------------------------------------------------------
    tally eng_vdwl and eng_coul into per thread global and per-atom accumulators
 ------------------------------------------------------------------------- */
 
 void ThrOMP::e_tally_thr(Pair * const pair, const int i, const int j,
                          const int nlocal, const int newton_pair,
                          const double evdwl, const double ecoul, ThrData * const thr)
 {
   if (pair->eflag_global) {
     if (newton_pair) {
       thr->eng_vdwl += evdwl;
       thr->eng_coul += ecoul;
     } else {
       const double evdwlhalf = 0.5*evdwl;
       const double ecoulhalf = 0.5*ecoul;
       if (i < nlocal) {
         thr->eng_vdwl += evdwlhalf;
         thr->eng_coul += ecoulhalf;
       }
       if (j < nlocal) {
         thr->eng_vdwl += evdwlhalf;
         thr->eng_coul += ecoulhalf;
       }
     }
   }
   if (pair->eflag_atom) {
     const double epairhalf = 0.5 * (evdwl + ecoul);
     if (newton_pair || i < nlocal) thr->eatom_pair[i] += epairhalf;
     if (newton_pair || j < nlocal) thr->eatom_pair[j] += epairhalf;
   }
 }
 
 /* helper functions */
 static void v_tally(double * const vout, const double * const vin)
 {
   vout[0] += vin[0];
   vout[1] += vin[1];
   vout[2] += vin[2];
   vout[3] += vin[3];
   vout[4] += vin[4];
   vout[5] += vin[5];
 }
 
 static void v_tally(double * const vout, const double scale, const double * const vin)
 {
   vout[0] += scale*vin[0];
   vout[1] += scale*vin[1];
   vout[2] += scale*vin[2];
   vout[3] += scale*vin[3];
   vout[4] += scale*vin[4];
   vout[5] += scale*vin[5];
 }
 
 /* ----------------------------------------------------------------------
    tally virial into per thread global and per-atom accumulators
 ------------------------------------------------------------------------- */
 void ThrOMP::v_tally_thr(Pair * const pair, const int i, const int j,
                          const int nlocal, const int newton_pair,
                          const double * const v, ThrData * const thr)
 {
   if (pair->vflag_global) {
     double * const va = thr->virial_pair;
     if (newton_pair) {
       v_tally(va,v);
     } else {
       if (i < nlocal) v_tally(va,0.5,v);
       if (j < nlocal) v_tally(va,0.5,v);
     }
   }
 
   if (pair->vflag_atom) {
     if (newton_pair || i < nlocal) {
       double * const va = thr->vatom_pair[i];
       v_tally(va,0.5,v);
     }
     if (newton_pair || j < nlocal) {
       double * const va = thr->vatom_pair[j];
       v_tally(va,0.5,v);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally eng_vdwl and virial into per thread global and per-atom accumulators
    need i < nlocal test since called by bond_quartic and dihedral_charmm
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_thr(Pair * const pair, const int i, const int j, const int nlocal,
                           const int newton_pair, const double evdwl, const double ecoul,
                           const double fpair, const double delx, const double dely,
                           const double delz, ThrData * const thr)
 {
 
   if (pair->eflag_either)
     e_tally_thr(pair, i, j, nlocal, newton_pair, evdwl, ecoul, thr);
 
   if (pair->vflag_either) {
     double v[6];
     v[0] = delx*delx*fpair;
     v[1] = dely*dely*fpair;
     v[2] = delz*delz*fpair;
     v[3] = delx*dely*fpair;
     v[4] = delx*delz*fpair;
     v[5] = dely*delz*fpair;
 
     v_tally_thr(pair, i, j, nlocal, newton_pair, v, thr);
   }
 }
 
 /* ----------------------------------------------------------------------
    tally eng_vdwl and virial into global and per-atom accumulators
    for virial, have delx,dely,delz and fx,fy,fz
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_xyz_thr(Pair * const pair, const int i, const int j,
                               const int nlocal, const int newton_pair,
                               const double evdwl, const double ecoul,
                               const double fx, const double fy, const double fz,
                               const double delx, const double dely, const double delz,
                               ThrData * const thr)
 {
 
   if (pair->eflag_either)
     e_tally_thr(pair, i, j, nlocal, newton_pair, evdwl, ecoul, thr);
 
   if (pair->vflag_either) {
     double v[6];
     v[0] = delx*fx;
     v[1] = dely*fy;
     v[2] = delz*fz;
     v[3] = delx*fy;
     v[4] = delx*fz;
     v[5] = dely*fz;
 
     v_tally_thr(pair, i, j, nlocal, newton_pair, v, thr);
   }
 }
 
 /* ----------------------------------------------------------------------
    tally eng_vdwl and virial into global and per-atom accumulators
    called by SW and hbond potentials, newton_pair is always on
    virial = riFi + rjFj + rkFk = (rj-ri) Fj + (rk-ri) Fk = drji*fj + drki*fk
  ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally3_thr(Pair * const pair, const int i, const int j, const int k,
                            const double evdwl, const double ecoul,
                            const double * const fj, const double * const fk,
                            const double * const drji, const double * const drki,
                            ThrData * const thr)
 {
   if (pair->eflag_either) {
     if (pair->eflag_global) {
       thr->eng_vdwl += evdwl;
       thr->eng_coul += ecoul;
     }
     if (pair->eflag_atom) {
       const double epairthird = THIRD * (evdwl + ecoul);
       thr->eatom_pair[i] += epairthird;
       thr->eatom_pair[j] += epairthird;
       thr->eatom_pair[k] += epairthird;
     }
   }
 
   if (pair->vflag_either) {
     double v[6];
 
     v[0] = drji[0]*fj[0] + drki[0]*fk[0];
     v[1] = drji[1]*fj[1] + drki[1]*fk[1];
     v[2] = drji[2]*fj[2] + drki[2]*fk[2];
     v[3] = drji[0]*fj[1] + drki[0]*fk[1];
     v[4] = drji[0]*fj[2] + drki[0]*fk[2];
     v[5] = drji[1]*fj[2] + drki[1]*fk[2];
 
     if (pair->vflag_global) v_tally(thr->virial_pair,v);
 
     if (pair->vflag_atom) {
       v_tally(thr->vatom_pair[i],THIRD,v);
       v_tally(thr->vatom_pair[j],THIRD,v);
       v_tally(thr->vatom_pair[k],THIRD,v);
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally eng_vdwl and virial into global and per-atom accumulators
    called by AIREBO potential, newton_pair is always on
  ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally4_thr(Pair * const pair, const int i, const int j,
                            const int k, const int m, const double evdwl,
                            const double * const fi, const double * const fj,
                            const double * const fk, const double * const drim,
                            const double * const drjm, const double * const drkm,
                            ThrData * const thr)
 {
   double v[6];
 
   if (pair->eflag_either) {
     if (pair->eflag_global) thr->eng_vdwl += evdwl;
     if (pair->eflag_atom) {
       const double epairfourth = 0.25 * evdwl;
       thr->eatom_pair[i] += epairfourth;
       thr->eatom_pair[j] += epairfourth;
       thr->eatom_pair[k] += epairfourth;
       thr->eatom_pair[m] += epairfourth;
     }
   }
 
   if (pair->vflag_atom) {
     v[0] = 0.25 * (drim[0]*fi[0] + drjm[0]*fj[0] + drkm[0]*fk[0]);
     v[1] = 0.25 * (drim[1]*fi[1] + drjm[1]*fj[1] + drkm[1]*fk[1]);
     v[2] = 0.25 * (drim[2]*fi[2] + drjm[2]*fj[2] + drkm[2]*fk[2]);
     v[3] = 0.25 * (drim[0]*fi[1] + drjm[0]*fj[1] + drkm[0]*fk[1]);
     v[4] = 0.25 * (drim[0]*fi[2] + drjm[0]*fj[2] + drkm[0]*fk[2]);
     v[5] = 0.25 * (drim[1]*fi[2] + drjm[1]*fj[2] + drkm[1]*fk[2]);
 
     v_tally(thr->vatom_pair[i],v);
     v_tally(thr->vatom_pair[j],v);
     v_tally(thr->vatom_pair[k],v);
     v_tally(thr->vatom_pair[m],v);
   }
 }
 
 /* ----------------------------------------------------------------------
    tally ecoul and virial into each of n atoms in list
    called by TIP4P potential, newton_pair is always on
    changes v values by dividing by n
  ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_list_thr(Pair * const pair, const int key,
                                const int * const list, const double * const v,
                                const double ecoul, const double alpha,
                                ThrData * const thr)
 {
   int i;
   if (pair->eflag_either) {
     if (pair->eflag_global) thr->eng_coul += ecoul;
     if (pair->eflag_atom) {
       if (key == 0) {
         thr->eatom_pair[list[0]] += 0.5*ecoul;
         thr->eatom_pair[list[1]] += 0.5*ecoul;
       } else if (key == 1) {
         thr->eatom_pair[list[0]] += 0.5*ecoul*(1-alpha);
         thr->eatom_pair[list[1]] += 0.25*ecoul*alpha;
         thr->eatom_pair[list[2]] += 0.25*ecoul*alpha;
         thr->eatom_pair[list[3]] += 0.5*ecoul;
       } else if (key == 2) {
         thr->eatom_pair[list[0]] += 0.5*ecoul;
         thr->eatom_pair[list[1]] += 0.5*ecoul*(1-alpha);
         thr->eatom_pair[list[2]] += 0.25*ecoul*alpha;
         thr->eatom_pair[list[3]] += 0.25*ecoul*alpha;
       } else {
         thr->eatom_pair[list[0]] += 0.5*ecoul*(1-alpha);
         thr->eatom_pair[list[1]] += 0.25*ecoul*alpha;
         thr->eatom_pair[list[2]] += 0.25*ecoul*alpha;
         thr->eatom_pair[list[3]] += 0.5*ecoul*(1-alpha);
         thr->eatom_pair[list[4]] += 0.25*ecoul*alpha;
         thr->eatom_pair[list[5]] += 0.25*ecoul*alpha;
       }
     }
   }
 
   if (pair->vflag_either) {
     if (pair->vflag_global)
       v_tally(thr->virial_pair,v);
 
     if (pair->vflag_atom) {
       if (key == 0) {
         for (i = 0; i <= 5; i++) {
           thr->vatom_pair[list[0]][i] += 0.5*v[i];
           thr->vatom_pair[list[1]][i] += 0.5*v[i];
         }
       } else if (key == 1) {
         for (i = 0; i <= 5; i++) {
           thr->vatom_pair[list[0]][i] += 0.5*v[i]*(1-alpha);
           thr->vatom_pair[list[1]][i] += 0.25*v[i]*alpha;
           thr->vatom_pair[list[2]][i] += 0.25*v[i]*alpha;
           thr->vatom_pair[list[3]][i] += 0.5*v[i];
         }
       } else if (key == 2) {
         for (i = 0; i <= 5; i++) {
           thr->vatom_pair[list[0]][i] += 0.5*v[i];
           thr->vatom_pair[list[1]][i] += 0.5*v[i]*(1-alpha);
           thr->vatom_pair[list[2]][i] += 0.25*v[i]*alpha;
           thr->vatom_pair[list[3]][i] += 0.25*v[i]*alpha;
         }
       } else {
         for (i = 0; i <= 5; i++) {
           thr->vatom_pair[list[0]][i] += 0.5*v[i]*(1-alpha);
           thr->vatom_pair[list[1]][i] += 0.25*v[i]*alpha;
           thr->vatom_pair[list[2]][i] += 0.25*v[i]*alpha;
           thr->vatom_pair[list[3]][i] += 0.5*v[i]*(1-alpha);
           thr->vatom_pair[list[4]][i] += 0.25*v[i]*alpha;
           thr->vatom_pair[list[5]][i] += 0.25*v[i]*alpha;
         }
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally energy and virial into global and per-atom accumulators
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_thr(Bond * const bond, const int i, const int j, const int nlocal,
                           const int newton_bond, const double ebond, const double fbond,
                           const double delx, const double dely, const double delz,
                           ThrData * const thr)
 {
   if (bond->eflag_either) {
     const double ebondhalf = 0.5*ebond;
     if (newton_bond) {
       if (bond->eflag_global)
         thr->eng_bond += ebond;
       if (bond->eflag_atom) {
         thr->eatom_bond[i] += ebondhalf;
         thr->eatom_bond[j] += ebondhalf;
       }
     } else {
       if (bond->eflag_global) {
         if (i < nlocal) thr->eng_bond += ebondhalf;
         if (j < nlocal) thr->eng_bond += ebondhalf;
       }
       if (bond->eflag_atom) {
         if (i < nlocal) thr->eatom_bond[i] += ebondhalf;
         if (j < nlocal) thr->eatom_bond[j] += ebondhalf;
       }
     }
   }
 
   if (bond->vflag_either) {
     double v[6];
 
     v[0] = delx*delx*fbond;
     v[1] = dely*dely*fbond;
     v[2] = delz*delz*fbond;
     v[3] = delx*dely*fbond;
     v[4] = delx*delz*fbond;
     v[5] = dely*delz*fbond;
 
     if (bond->vflag_global) {
       if (newton_bond)
         v_tally(thr->virial_bond,v);
       else {
         if (i < nlocal)
           v_tally(thr->virial_bond,0.5,v);
         if (j < nlocal)
           v_tally(thr->virial_bond,0.5,v);
       }
     }
 
     if (bond->vflag_atom) {
       v[0] *= 0.5;
       v[1] *= 0.5;
       v[2] *= 0.5;
       v[3] *= 0.5;
       v[4] *= 0.5;
       v[5] *= 0.5;
 
       if (newton_bond) {
         v_tally(thr->vatom_bond[i],v);
         v_tally(thr->vatom_bond[j],v);
       } else {
         if (j < nlocal)
           v_tally(thr->vatom_bond[i],v);
         if (j < nlocal)
           v_tally(thr->vatom_bond[j],v);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally energy and virial into global and per-atom accumulators
    virial = r1F1 + r2F2 + r3F3 = (r1-r2) F1 + (r3-r2) F3 = del1*f1 + del2*f3
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_thr(Angle * const angle, const int i, const int j, const int k,
                           const int nlocal, const int newton_bond, const double eangle,
                           const double * const f1, const double * const f3,
                           const double delx1, const double dely1, const double delz1,
                           const double delx2, const double dely2, const double delz2,
                           ThrData * const thr)
 {
   if (angle->eflag_either) {
     const double eanglethird = THIRD*eangle;
     if (newton_bond) {
       if (angle->eflag_global)
         thr->eng_angle += eangle;
       if (angle->eflag_atom) {
         thr->eatom_angle[i] += eanglethird;
         thr->eatom_angle[j] += eanglethird;
         thr->eatom_angle[k] += eanglethird;
       }
     } else {
       if (angle->eflag_global) {
         if (i < nlocal) thr->eng_angle += eanglethird;
         if (j < nlocal) thr->eng_angle += eanglethird;
         if (k < nlocal) thr->eng_angle += eanglethird;
       }
       if (angle->eflag_atom) {
         if (i < nlocal) thr->eatom_angle[i] += eanglethird;
         if (j < nlocal) thr->eatom_angle[j] += eanglethird;
         if (k < nlocal) thr->eatom_angle[k] += eanglethird;
       }
     }
   }
 
   if (angle->vflag_either) {
     double v[6];
 
     v[0] = delx1*f1[0] + delx2*f3[0];
     v[1] = dely1*f1[1] + dely2*f3[1];
     v[2] = delz1*f1[2] + delz2*f3[2];
     v[3] = delx1*f1[1] + delx2*f3[1];
     v[4] = delx1*f1[2] + delx2*f3[2];
     v[5] = dely1*f1[2] + dely2*f3[2];
 
     if (angle->vflag_global) {
       if (newton_bond) {
         v_tally(thr->virial_angle,v);
       } else {
         int cnt = 0;
         if (i < nlocal) ++cnt;
         if (j < nlocal) ++cnt;
         if (k < nlocal) ++cnt;
         v_tally(thr->virial_angle,cnt*THIRD,v);
       }
     }
 
     if (angle->vflag_atom) {
       v[0] *= THIRD;
       v[1] *= THIRD;
       v[2] *= THIRD;
       v[3] *= THIRD;
       v[4] *= THIRD;
       v[5] *= THIRD;
 
       if (newton_bond) {
         v_tally(thr->vatom_angle[i],v);
         v_tally(thr->vatom_angle[j],v);
         v_tally(thr->vatom_angle[k],v);
       } else {
         if (j < nlocal) v_tally(thr->vatom_angle[i],v);
         if (j < nlocal) v_tally(thr->vatom_angle[j],v);
         if (k < nlocal) v_tally(thr->vatom_angle[k],v);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally energy and virial from 1-3 repulsion of SDK angle into accumulators
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally13_thr(Angle * const angle, const int i1, const int i3,
                             const int nlocal, const int newton_bond,
                             const double epair, const double fpair,
                             const double delx, const double dely,
                             const double delz, ThrData * const thr)
 {
 
   if (angle->eflag_either) {
     const double epairhalf = 0.5 * epair;
 
     if (angle->eflag_global) {
       if (newton_bond || i1 < nlocal)
         thr->eng_angle += epairhalf;
       if (newton_bond || i3 < nlocal)
         thr->eng_angle += epairhalf;
     }
 
     if (angle->eflag_atom) {
       if (newton_bond || i1 < nlocal) thr->eatom_angle[i1] += epairhalf;
       if (newton_bond || i3 < nlocal) thr->eatom_angle[i3] += epairhalf;
     }
   }
 
   if (angle->vflag_either) {
     double v[6];
     v[0] = delx*delx*fpair;
     v[1] = dely*dely*fpair;
     v[2] = delz*delz*fpair;
     v[3] = delx*dely*fpair;
     v[4] = delx*delz*fpair;
     v[5] = dely*delz*fpair;
 
     if (angle->vflag_global) {
       double * const va = thr->virial_angle;
       if (newton_bond || i1 < nlocal) v_tally(va,0.5,v);
       if (newton_bond || i3 < nlocal) v_tally(va,0.5,v);
     }
 
     if (angle->vflag_atom) {
       if (newton_bond || i1 < nlocal) {
         double * const va = thr->vatom_angle[i1];
         v_tally(va,0.5,v);
       }
       if (newton_bond || i3 < nlocal) {
         double * const va = thr->vatom_angle[i3];
         v_tally(va,0.5,v);
       }
     }
   }
 }
 
 
 /* ----------------------------------------------------------------------
    tally energy and virial into global and per-atom accumulators
    virial = r1F1 + r2F2 + r3F3 + r4F4 = (r1-r2) F1 + (r3-r2) F3 + (r4-r2) F4
           = (r1-r2) F1 + (r3-r2) F3 + (r4-r3 + r3-r2) F4
           = vb1*f1 + vb2*f3 + (vb3+vb2)*f4
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_thr(Dihedral * const dihed, const int i1, const int i2,
                           const int i3, const int i4, const int nlocal,
                           const int newton_bond, const double edihedral,
                           const double * const f1, const double * const f3,
                           const double * const f4, const double vb1x,
                           const double vb1y, const double vb1z, const double vb2x,
                           const double vb2y, const double vb2z, const double vb3x,
                           const double vb3y, const double vb3z, ThrData * const thr)
 {
 
   if (dihed->eflag_either) {
     if (dihed->eflag_global) {
       if (newton_bond) {
         thr->eng_dihed += edihedral;
       } else {
         const double edihedralquarter = 0.25*edihedral;
         int cnt = 0;
         if (i1 < nlocal) ++cnt;
         if (i2 < nlocal) ++cnt;
         if (i3 < nlocal) ++cnt;
         if (i4 < nlocal) ++cnt;
         thr->eng_dihed += static_cast<double>(cnt)*edihedralquarter;
       }
     }
     if (dihed->eflag_atom) {
       const double edihedralquarter = 0.25*edihedral;
       if (newton_bond) {
         thr->eatom_dihed[i1] += edihedralquarter;
         thr->eatom_dihed[i2] += edihedralquarter;
         thr->eatom_dihed[i3] += edihedralquarter;
         thr->eatom_dihed[i4] += edihedralquarter;
       } else {
         if (i1 < nlocal) thr->eatom_dihed[i1] +=  edihedralquarter;
         if (i2 < nlocal) thr->eatom_dihed[i2] +=  edihedralquarter;
         if (i3 < nlocal) thr->eatom_dihed[i3] +=  edihedralquarter;
         if (i4 < nlocal) thr->eatom_dihed[i4] +=  edihedralquarter;
       }
     }
   }
 
   if (dihed->vflag_either) {
     double v[6];
     v[0] = vb1x*f1[0] + vb2x*f3[0] + (vb3x+vb2x)*f4[0];
     v[1] = vb1y*f1[1] + vb2y*f3[1] + (vb3y+vb2y)*f4[1];
     v[2] = vb1z*f1[2] + vb2z*f3[2] + (vb3z+vb2z)*f4[2];
     v[3] = vb1x*f1[1] + vb2x*f3[1] + (vb3x+vb2x)*f4[1];
     v[4] = vb1x*f1[2] + vb2x*f3[2] + (vb3x+vb2x)*f4[2];
     v[5] = vb1y*f1[2] + vb2y*f3[2] + (vb3y+vb2y)*f4[2];
 
     if (dihed->vflag_global) {
       if (newton_bond) {
         v_tally(thr->virial_dihed,v);
       } else {
         int cnt = 0;
         if (i1 < nlocal) ++cnt;
         if (i2 < nlocal) ++cnt;
         if (i3 < nlocal) ++cnt;
         if (i4 < nlocal) ++cnt;
         v_tally(thr->virial_dihed,0.25*static_cast<double>(cnt),v);
       }
     }
 
     v[0] *= 0.25;
     v[1] *= 0.25;
     v[2] *= 0.25;
     v[3] *= 0.25;
     v[4] *= 0.25;
     v[5] *= 0.25;
 
     if (dihed->vflag_atom) {
       if (newton_bond) {
         v_tally(thr->vatom_dihed[i1],v);
         v_tally(thr->vatom_dihed[i2],v);
         v_tally(thr->vatom_dihed[i3],v);
         v_tally(thr->vatom_dihed[i4],v);
       } else {
         if (i1 < nlocal) v_tally(thr->vatom_dihed[i1],v);
         if (i2 < nlocal) v_tally(thr->vatom_dihed[i2],v);
         if (i3 < nlocal) v_tally(thr->vatom_dihed[i3],v);
         if (i4 < nlocal) v_tally(thr->vatom_dihed[i4],v);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally energy and virial into global and per-atom accumulators
    virial = r1F1 + r2F2 + r3F3 + r4F4 = (r1-r2) F1 + (r3-r2) F3 + (r4-r2) F4
           = (r1-r2) F1 + (r3-r2) F3 + (r4-r3 + r3-r2) F4
           = vb1*f1 + vb2*f3 + (vb3+vb2)*f4
 ------------------------------------------------------------------------- */
 
 void ThrOMP::ev_tally_thr(Improper * const imprp, const int i1, const int i2,
                           const int i3, const int i4, const int nlocal,
                           const int newton_bond, const double eimproper,
                           const double * const f1, const double * const f3,
                           const double * const f4, const double vb1x,
                           const double vb1y, const double vb1z, const double vb2x,
                           const double vb2y, const double vb2z, const double vb3x,
                           const double vb3y, const double vb3z, ThrData * const thr)
 {
 
   if (imprp->eflag_either) {
     if (imprp->eflag_global) {
       if (newton_bond) {
         thr->eng_imprp += eimproper;
       } else {
         const double eimproperquarter = 0.25*eimproper;
         int cnt = 0;
         if (i1 < nlocal) ++cnt;
         if (i2 < nlocal) ++cnt;
         if (i3 < nlocal) ++cnt;
         if (i4 < nlocal) ++cnt;
         thr->eng_imprp += static_cast<double>(cnt)*eimproperquarter;
       }
     }
     if (imprp->eflag_atom) {
       const double eimproperquarter = 0.25*eimproper;
       if (newton_bond) {
         thr->eatom_imprp[i1] += eimproperquarter;
         thr->eatom_imprp[i2] += eimproperquarter;
         thr->eatom_imprp[i3] += eimproperquarter;
         thr->eatom_imprp[i4] += eimproperquarter;
       } else {
         if (i1 < nlocal) thr->eatom_imprp[i1] +=  eimproperquarter;
         if (i2 < nlocal) thr->eatom_imprp[i2] +=  eimproperquarter;
         if (i3 < nlocal) thr->eatom_imprp[i3] +=  eimproperquarter;
         if (i4 < nlocal) thr->eatom_imprp[i4] +=  eimproperquarter;
       }
     }
   }
 
   if (imprp->vflag_either) {
     double v[6];
     v[0] = vb1x*f1[0] + vb2x*f3[0] + (vb3x+vb2x)*f4[0];
     v[1] = vb1y*f1[1] + vb2y*f3[1] + (vb3y+vb2y)*f4[1];
     v[2] = vb1z*f1[2] + vb2z*f3[2] + (vb3z+vb2z)*f4[2];
     v[3] = vb1x*f1[1] + vb2x*f3[1] + (vb3x+vb2x)*f4[1];
     v[4] = vb1x*f1[2] + vb2x*f3[2] + (vb3x+vb2x)*f4[2];
     v[5] = vb1y*f1[2] + vb2y*f3[2] + (vb3y+vb2y)*f4[2];
 
     if (imprp->vflag_global) {
       if (newton_bond) {
         v_tally(thr->virial_imprp,v);
       } else {
         int cnt = 0;
         if (i1 < nlocal) ++cnt;
         if (i2 < nlocal) ++cnt;
         if (i3 < nlocal) ++cnt;
         if (i4 < nlocal) ++cnt;
         v_tally(thr->virial_imprp,0.25*static_cast<double>(cnt),v);
       }
     }
 
     v[0] *= 0.25;
     v[1] *= 0.25;
     v[2] *= 0.25;
     v[3] *= 0.25;
     v[4] *= 0.25;
     v[5] *= 0.25;
 
     if (imprp->vflag_atom) {
       if (newton_bond) {
         v_tally(thr->vatom_imprp[i1],v);
         v_tally(thr->vatom_imprp[i2],v);
         v_tally(thr->vatom_imprp[i3],v);
         v_tally(thr->vatom_imprp[i4],v);
       } else {
         if (i1 < nlocal) v_tally(thr->vatom_imprp[i1],v);
         if (i2 < nlocal) v_tally(thr->vatom_imprp[i2],v);
         if (i3 < nlocal) v_tally(thr->vatom_imprp[i3],v);
         if (i4 < nlocal) v_tally(thr->vatom_imprp[i4],v);
       }
     }
   }
 }
 
 /* ----------------------------------------------------------------------
    tally virial into per-atom accumulators
    called by AIREBO potential, newton_pair is always on
    fpair is magnitude of force on atom I
 ------------------------------------------------------------------------- */
 
 void ThrOMP::v_tally2_thr(const int i, const int j, const double fpair,
                           const double * const drij, ThrData * const thr)
 {
   double v[6];
 
   v[0] = 0.5 * drij[0]*drij[0]*fpair;
   v[1] = 0.5 * drij[1]*drij[1]*fpair;
   v[2] = 0.5 * drij[2]*drij[2]*fpair;
   v[3] = 0.5 * drij[0]*drij[1]*fpair;
   v[4] = 0.5 * drij[0]*drij[2]*fpair;
   v[5] = 0.5 * drij[1]*drij[2]*fpair;
 
   v_tally(thr->vatom_pair[i],v);
   v_tally(thr->vatom_pair[j],v);
 }
 
 /* ----------------------------------------------------------------------
    tally virial into per-atom accumulators
    called by AIREBO and Tersoff potential, newton_pair is always on
 ------------------------------------------------------------------------- */
 
 void ThrOMP::v_tally3_thr(const int i, const int j, const int k,
                           const double * const fi, const double * const fj,
                           const double * const drik, const double * const drjk,
                           ThrData * const thr)
 {
   double v[6];
 
   v[0] = THIRD * (drik[0]*fi[0] + drjk[0]*fj[0]);
   v[1] = THIRD * (drik[1]*fi[1] + drjk[1]*fj[1]);
   v[2] = THIRD * (drik[2]*fi[2] + drjk[2]*fj[2]);
   v[3] = THIRD * (drik[0]*fi[1] + drjk[0]*fj[1]);
   v[4] = THIRD * (drik[0]*fi[2] + drjk[0]*fj[2]);
   v[5] = THIRD * (drik[1]*fi[2] + drjk[1]*fj[2]);
 
   v_tally(thr->vatom_pair[i],v);
   v_tally(thr->vatom_pair[j],v);
   v_tally(thr->vatom_pair[k],v);
 }
 
 /* ----------------------------------------------------------------------
    tally virial into per-atom accumulators
    called by AIREBO potential, newton_pair is always on
 ------------------------------------------------------------------------- */
 
 void ThrOMP::v_tally4_thr(const int i, const int j, const int k, const int m,
                           const double * const fi, const double * const fj,
                           const double * const fk, const double * const drim,
                           const double * const drjm, const double * const drkm,
                           ThrData * const thr)
 {
   double v[6];
 
   v[0] = 0.25 * (drim[0]*fi[0] + drjm[0]*fj[0] + drkm[0]*fk[0]);
   v[1] = 0.25 * (drim[1]*fi[1] + drjm[1]*fj[1] + drkm[1]*fk[1]);
   v[2] = 0.25 * (drim[2]*fi[2] + drjm[2]*fj[2] + drkm[2]*fk[2]);
   v[3] = 0.25 * (drim[0]*fi[1] + drjm[0]*fj[1] + drkm[0]*fk[1]);
   v[4] = 0.25 * (drim[0]*fi[2] + drjm[0]*fj[2] + drkm[0]*fk[2]);
   v[5] = 0.25 * (drim[1]*fi[2] + drjm[1]*fj[2] + drkm[1]*fk[2]);
 
   v_tally(thr->vatom_pair[i],v);
   v_tally(thr->vatom_pair[j],v);
   v_tally(thr->vatom_pair[k],v);
   v_tally(thr->vatom_pair[m],v);
 }
 
 /* ---------------------------------------------------------------------- */
 
 double ThrOMP::memory_usage_thr()
 {
   double bytes=0.0;
 
   return bytes;
 }
diff --git a/src/USER-OMP/thr_omp.h b/src/USER-OMP/thr_omp.h
index 587c24f38..9348058d8 100644
--- a/src/USER-OMP/thr_omp.h
+++ b/src/USER-OMP/thr_omp.h
@@ -1,208 +1,208 @@
 /* -*- c++ -*- -------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Axel Kohlmeyer (Temple U)
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_THR_OMP_H
 #define LMP_THR_OMP_H
 
 #include "pointers.h"
 #include "error.h"
 #include "fix_omp.h"
 #include "thr_data.h"
 
 namespace LAMMPS_NS {
 
 // forward declarations
 class Pair;
 class Bond;
 class Angle;
 class Dihedral;
 class Improper;
 class KSpace;
 class Fix;
 
 class ThrOMP {
 
  protected:
   LAMMPS *lmp; // reference to base lammps object.
   FixOMP *fix; // pointer to fix_omp;
 
   const int thr_style;
   int thr_error;
 
  public:
   ThrOMP(LAMMPS *, int);
   virtual ~ThrOMP();
 
   double memory_usage_thr();
 
   inline void sync_threads() {
 #if defined(_OPENMP)
 #pragma omp barrier
 #endif
       { ; }
     };
 
   enum {THR_NONE=0,THR_PAIR=1,THR_BOND=1<<1,THR_ANGLE=1<<2,
         THR_DIHEDRAL=1<<3,THR_IMPROPER=1<<4,THR_KSPACE=1<<5,
-        THR_CHARMM=1<<6, /*THR_PROXY=1<<7,*/ THR_HYBRID=1<<8,
-        THR_FIX=1<<9};
+        THR_CHARMM=1<<6, /*THR_PROXY=1<<7,THR_HYBRID=1<<8, */
+        THR_FIX=1<<9,THR_INTGR=1<<10};
 
  protected:
   // extra ev_tally setup work for threaded styles
   void ev_setup_thr(int, int, int, double *, double **, ThrData *);
 
   // compute global per thread virial contribution from per-thread force
   void virial_fdotr_compute_thr(double * const, const double * const * const,
                                 const double * const * const,
                                 const int, const int, const int);
 
   // reduce per thread data as needed
   void reduce_thr(void * const style, const int eflag, const int vflag,
                   ThrData * const thr);
 
   // thread safe variant error abort support.
   // signals an error condition in any thread by making
   // thr_error > 0, if condition "cond" is true.
   // will abort from thread 0 if thr_error is > 0
   // otherwise return true.
   // returns false if no error found on any thread.
   // use return value to jump/return to end of threaded region.
 
   bool check_error_thr(const bool cond, const int tid, const char *fname,
                        const int line, const char *errmsg) {
     if (cond) {
 #if defined(_OPENMP)
 #pragma omp atomic
       ++thr_error;
 #endif
       if (tid > 0) return true;
       else lmp->error->one(fname,line,errmsg);
     } else {
       if (thr_error > 0) {
         if (tid == 0) lmp->error->one(fname,line,errmsg);
         else return true;
       } else return false;
     }
     return false;
   };
 
  protected:
 
   // threading adapted versions of the ev_tally infrastructure
   // style specific versions (need access to style class flags)
 
   // Pair
   void e_tally_thr(Pair * const, const int, const int, const int,
                    const int, const double, const double, ThrData * const);
   void v_tally_thr(Pair * const, const int, const int, const int,
                    const int, const double * const, ThrData * const);
 
   void ev_tally_thr(Pair * const, const int, const int, const int, const int,
                     const double, const double, const double, const double,
                     const double, const double, ThrData * const);
   void ev_tally_xyz_thr(Pair * const, const int, const int, const int,
                         const int, const double, const double, const double,
                         const double, const double, const double,
                         const double, const double, ThrData * const);
   void ev_tally3_thr(Pair * const, const int, const int, const int, const double,
                      const double, const double * const, const double * const,
                      const double * const, const double * const, ThrData * const);
   void ev_tally4_thr(Pair * const, const int, const int, const int, const int,
                      const double, const double * const, const double * const,
                      const double * const, const double * const, const double * const,
                      const double * const, ThrData * const);
 
   // Bond
   void ev_tally_thr(Bond * const, const int, const int, const int, const int,
                     const double, const double, const double, const double,
                     const double, ThrData * const);
 
   // Angle
   void ev_tally_thr(Angle * const, const int, const int, const int, const int, const int,
                     const double, const double * const, const double * const,
                     const double, const double, const double, const double, const double,
                     const double, ThrData * const thr);
   void ev_tally13_thr(Angle * const, const int, const int, const int, const int,
                       const double, const double, const double, const double,
                       const double, ThrData * const thr);
 
   // Dihedral
   void ev_tally_thr(Dihedral * const, const int, const int, const int, const int, const int,
                     const int, const double, const double * const, const double * const,
                     const double * const, const double, const double, const double,
                     const double, const double, const double, const double, const double,
                     const double, ThrData * const);
 
   // Improper
   void ev_tally_thr(Improper * const, const int, const int, const int, const int, const int,
                     const int, const double, const double * const, const double * const,
                     const double * const, const double, const double, const double,
                     const double, const double, const double, const double, const double,
                     const double, ThrData * const);
 
   // style independent versions
   void v_tally2_thr(const int, const int, const double, const double * const, ThrData * const);
   void v_tally3_thr(const int, const int, const int, const double * const, const double * const,
                     const double * const, const double * const, ThrData * const);
   void v_tally4_thr(const int, const int, const int, const int, const double * const,
                     const double * const, const double * const, const double * const,
                     const double * const, const double * const, ThrData * const);
   void ev_tally_list_thr(Pair * const, const int, const int * const,
                          const double * const, const double, const double,
                          ThrData * const);
 
 };
 
 // set loop range thread id, and force array offset for threaded runs.
 static inline void loop_setup_thr(int &ifrom, int &ito, int &tid,
                                   int inum, int nthreads)
 {
 #if defined(_OPENMP)
   tid = omp_get_thread_num();
 
   // each thread works on a fixed chunk of atoms.
   const int idelta = 1 + inum/nthreads;
   ifrom = tid*idelta;
   ito   = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
 #else
   tid = 0;
   ifrom = 0;
   ito = inum;
 #endif
 }
 
 // helpful definitions to help compilers optimizing code better
 
 typedef struct { double x,y,z;   } dbl3_t;
 typedef struct { double x,y,z,w; } dbl4_t;
 typedef struct { int a,b,t;      } int3_t;
 typedef struct { int a,b,c,t;    } int4_t;
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 }
 
 #ifdef _noalias
 #undef _noalias
 #endif
 
 #if defined(__GNUC__)
 #define _noalias __restrict
 #else
 #define _noalias
 #endif
 
 #endif