diff --git a/doc/src/Section_howto.txt b/doc/src/Section_howto.txt
index 8bd9ef805..d88ab0c2d 100644
--- a/doc/src/Section_howto.txt
+++ b/doc/src/Section_howto.txt
@@ -1,2821 +1,2821 @@
 "Previous Section"_Section_accelerate.html - "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc - "Next Section"_Section_example.html :c
 
 :link(lws,http://lammps.sandia.gov)
 :link(ld,Manual.html)
 :link(lc,Section_commands.html#comm)
 
 :line 
 
 6. How-to discussions :h3
 
 This section describes how to perform common tasks using LAMMPS.
 
 6.1 "Restarting a simulation"_#howto_1
 6.2 "2d simulations"_#howto_2
 6.3 "CHARMM, AMBER, and DREIDING force fields"_#howto_3
 6.4 "Running multiple simulations from one input script"_#howto_4
 6.5 "Multi-replica simulations"_#howto_5
 6.6 "Granular models"_#howto_6
 6.7 "TIP3P water model"_#howto_7
 6.8 "TIP4P water model"_#howto_8
 6.9 "SPC water model"_#howto_9
 6.10 "Coupling LAMMPS to other codes"_#howto_10
 6.11 "Visualizing LAMMPS snapshots"_#howto_11
 6.12 "Triclinic (non-orthogonal) simulation boxes"_#howto_12
 6.13 "NEMD simulations"_#howto_13
 6.14 "Finite-size spherical and aspherical particles"_#howto_14
 6.15 "Output from LAMMPS (thermo, dumps, computes, fixes, variables)"_#howto_15
 6.16 "Thermostatting, barostatting and computing temperature"_#howto_16
 6.17 "Walls"_#howto_17
 6.18 "Elastic constants"_#howto_18
 6.19 "Library interface to LAMMPS"_#howto_19
 6.20 "Calculating thermal conductivity"_#howto_20
 6.21 "Calculating viscosity"_#howto_21
 6.22 "Calculating a diffusion coefficient"_#howto_22
 6.23 "Using chunks to calculate system properties"_#howto_23
 6.24 "Setting parameters for the kspace_style pppm/disp command"_#howto_24
 6.25 "Polarizable models"_#howto_25
 6.26 "Adiabatic core/shell model"_#howto_26
 6.27 "Drude induced dipoles"_#howto_27 :all(b)
 
 The example input scripts included in the LAMMPS distribution and
 highlighted in "Section_example"_Section_example.html also show how to
 setup and run various kinds of simulations.
 
 :line
 :line
 
 6.1 Restarting a simulation :link(howto_1),h4
 
 There are 3 ways to continue a long LAMMPS simulation.  Multiple
 "run"_run.html commands can be used in the same input script.  Each
 run will continue from where the previous run left off.  Or binary
 restart files can be saved to disk using the "restart"_restart.html
 command.  At a later time, these binary files can be read via a
 "read_restart"_read_restart.html command in a new script.  Or they can
 be converted to text data files using the "-r command-line
 switch"_Section_start.html#start_7 and read by a
 "read_data"_read_data.html command in a new script.
 
 Here we give examples of 2 scripts that read either a binary restart
 file or a converted data file and then issue a new run command to
 continue where the previous run left off.  They illustrate what
 settings must be made in the new script.  Details are discussed in the
 documentation for the "read_restart"_read_restart.html and
 "read_data"_read_data.html commands.
 
 Look at the {in.chain} input script provided in the {bench} directory
 of the LAMMPS distribution to see the original script that these 2
 scripts are based on.  If that script had the line
 
 restart	        50 tmp.restart :pre
 
 added to it, it would produce 2 binary restart files (tmp.restart.50
 and tmp.restart.100) as it ran.
 
 This script could be used to read the 1st restart file and re-run the
 last 50 timesteps:
 
 read_restart	tmp.restart.50 :pre
 
 neighbor	0.4 bin
 neigh_modify	every 1 delay 1 :pre
 
 fix		1 all nve
 fix		2 all langevin 1.0 1.0 10.0 904297 :pre
 
 timestep	0.012 :pre
 
 run		50 :pre
 
 Note that the following commands do not need to be repeated because
 their settings are included in the restart file: {units, atom_style,
 special_bonds, pair_style, bond_style}.  However these commands do
 need to be used, since their settings are not in the restart file:
 {neighbor, fix, timestep}.
 
 If you actually use this script to perform a restarted run, you will
 notice that the thermodynamic data match at step 50 (if you also put a
 "thermo 50" command in the original script), but do not match at step
 100.  This is because the "fix langevin"_fix_langevin.html command
 uses random numbers in a way that does not allow for perfect restarts.
 
 As an alternate approach, the restart file could be converted to a data
 file as follows:
 
 lmp_g++ -r tmp.restart.50 tmp.restart.data :pre
 
 Then, this script could be used to re-run the last 50 steps:
 
 units		lj
 atom_style	bond
 pair_style	lj/cut 1.12
 pair_modify	shift yes
 bond_style	fene
 special_bonds   0.0 1.0 1.0 :pre
 
 read_data	tmp.restart.data :pre
 
 neighbor	0.4 bin
 neigh_modify	every 1 delay 1 :pre
 
 fix		1 all nve
 fix		2 all langevin 1.0 1.0 10.0 904297 :pre
 
 timestep	0.012 :pre
 
 reset_timestep	50
 run		50 :pre
 
 Note that nearly all the settings specified in the original {in.chain}
 script must be repeated, except the {pair_coeff} and {bond_coeff}
 commands since the new data file lists the force field coefficients.
 Also, the "reset_timestep"_reset_timestep.html command is used to tell
 LAMMPS the current timestep.  This value is stored in restart files,
 but not in data files.
 
 :line
 
 6.2 2d simulations :link(howto_2),h4
 
 Use the "dimension"_dimension.html command to specify a 2d simulation.
 
 Make the simulation box periodic in z via the "boundary"_boundary.html
 command.  This is the default.
 
 If using the "create_box"_create_box.html command to define a
 simulation box, set the z dimensions narrow, but finite, so that the
 create_atoms command will tile the 3d simulation box with a single z
 plane of atoms - e.g.
 
 "create_box"_create_box.html 1 -10 10 -10 10 -0.25 0.25 :pre
 
 If using the "read_data"_read_data.html command to read in a file of
 atom coordinates, set the "zlo zhi" values to be finite but narrow,
 similar to the create_box command settings just described.  For each
 atom in the file, assign a z coordinate so it falls inside the
 z-boundaries of the box - e.g. 0.0.
 
 Use the "fix enforce2d"_fix_enforce2d.html command as the last
 defined fix to ensure that the z-components of velocities and forces
 are zeroed out every timestep.  The reason to make it the last fix is
 so that any forces induced by other fixes will be zeroed out.
 
 Many of the example input scripts included in the LAMMPS distribution
 are for 2d models.
 
 NOTE: Some models in LAMMPS treat particles as finite-size spheres, as
 opposed to point particles.  In 2d, the particles will still be
 spheres, not disks, meaning their moment of inertia will be the same
 as in 3d.
 
 :line
 
 6.3 CHARMM, AMBER, and DREIDING force fields :link(howto_3),h4
 
 A force field has 2 parts: the formulas that define it and the
 coefficients used for a particular system.  Here we only discuss
 formulas implemented in LAMMPS that correspond to formulas commonly
 used in the CHARMM, AMBER, and DREIDING force fields.  Setting
 coefficients is done in the input data file via the
 "read_data"_read_data.html command or in the input script with
 commands like "pair_coeff"_pair_coeff.html or
 "bond_coeff"_bond_coeff.html.  See "Section_tools"_Section_tools.html
 for additional tools that can use CHARMM or AMBER to assign force
 field coefficients and convert their output into LAMMPS input.
 
 See "(MacKerell)"_#howto-MacKerell for a description of the CHARMM force
 field.  See "(Cornell)"_#howto-Cornell for a description of the AMBER force
 field.
 
 :link(charmm,http://www.scripps.edu/brooks)
 :link(amber,http://amber.scripps.edu)
 
 These style choices compute force field formulas that are consistent
 with common options in CHARMM or AMBER.  See each command's
 documentation for the formula it computes.
 
 "bond_style"_bond_harmonic.html harmonic
 "angle_style"_angle_charmm.html charmm
 "dihedral_style"_dihedral_charmm.html charmm
 "pair_style"_pair_charmm.html lj/charmm/coul/charmm
 "pair_style"_pair_charmm.html lj/charmm/coul/charmm/implicit
 "pair_style"_pair_charmm.html lj/charmm/coul/long :ul
 
 "special_bonds"_special_bonds.html charmm
 "special_bonds"_special_bonds.html amber :ul
 
 DREIDING is a generic force field developed by the "Goddard
 group"_http://www.wag.caltech.edu at Caltech and is useful for
 predicting structures and dynamics of organic, biological and
 main-group inorganic molecules. The philosophy in DREIDING is to use
 general force constants and geometry parameters based on simple
 hybridization considerations, rather than individual force constants
 and geometric parameters that depend on the particular combinations of
 atoms involved in the bond, angle, or torsion terms. DREIDING has an
 "explicit hydrogen bond term"_pair_hbond_dreiding.html to describe
 interactions involving a hydrogen atom on very electronegative atoms
 (N, O, F).
 
 See "(Mayo)"_#howto-Mayo for a description of the DREIDING force field.
 
 These style choices compute force field formulas that are consistent
 with the DREIDING force field.  See each command's
 documentation for the formula it computes.
 
 "bond_style"_bond_harmonic.html harmonic
 "bond_style"_bond_morse.html morse :ul
 
 "angle_style"_angle_harmonic.html harmonic
 "angle_style"_angle_cosine.html cosine
 "angle_style"_angle_cosine_periodic.html cosine/periodic :ul
 
 "dihedral_style"_dihedral_charmm.html charmm
 "improper_style"_improper_umbrella.html umbrella :ul
 
 "pair_style"_pair_buck.html buck
 "pair_style"_pair_buck.html buck/coul/cut
 "pair_style"_pair_buck.html buck/coul/long
 "pair_style"_pair_lj.html lj/cut
 "pair_style"_pair_lj.html lj/cut/coul/cut
 "pair_style"_pair_lj.html lj/cut/coul/long :ul
 
 "pair_style"_pair_hbond_dreiding.html hbond/dreiding/lj
 "pair_style"_pair_hbond_dreiding.html hbond/dreiding/morse :ul
 
 "special_bonds"_special_bonds.html dreiding :ul
 
 :line
 
 6.4 Running multiple simulations from one input script :link(howto_4),h4
 
 This can be done in several ways.  See the documentation for
 individual commands for more details on how these examples work.
 
 If "multiple simulations" means continue a previous simulation for
 more timesteps, then you simply use the "run"_run.html command
 multiple times.  For example, this script
 
 units lj
 atom_style atomic
 read_data data.lj
 run 10000
 run 10000
 run 10000
 run 10000
 run 10000 :pre
 
 would run 5 successive simulations of the same system for a total of
 50,000 timesteps.
 
 If you wish to run totally different simulations, one after the other,
 the "clear"_clear.html command can be used in between them to
 re-initialize LAMMPS.  For example, this script
 
 units lj
 atom_style atomic
 read_data data.lj
 run 10000
 clear
 units lj
 atom_style atomic
 read_data data.lj.new
 run 10000 :pre
 
 would run 2 independent simulations, one after the other.
 
 For large numbers of independent simulations, you can use
 "variables"_variable.html and the "next"_next.html and
 "jump"_jump.html commands to loop over the same input script
 multiple times with different settings.  For example, this
 script, named in.polymer
 
 variable d index run1 run2 run3 run4 run5 run6 run7 run8
 shell cd $d
 read_data data.polymer
 run 10000
 shell cd ..
 clear
 next d
 jump in.polymer :pre
 
 would run 8 simulations in different directories, using a data.polymer
 file in each directory.  The same concept could be used to run the
 same system at 8 different temperatures, using a temperature variable
 and storing the output in different log and dump files, for example
 
 variable a loop 8
 variable t index 0.8 0.85 0.9 0.95 1.0 1.05 1.1 1.15
 log log.$a
 read_data data.polymer
 velocity all create $t 352839
 fix 1 all nvt $t $t 100.0
 dump 1 all atom 1000 dump.$a
 run 100000
 clear
 next t
 next a
 jump in.polymer :pre
 
 All of the above examples work whether you are running on 1 or
 multiple processors, but assume you are running LAMMPS on a single
 partition of processors.  LAMMPS can be run on multiple partitions via
 the "-partition" command-line switch as described in "this
 section"_Section_start.html#start_7 of the manual.
 
 In the last 2 examples, if LAMMPS were run on 3 partitions, the same
 scripts could be used if the "index" and "loop" variables were
 replaced with {universe}-style variables, as described in the
 "variable"_variable.html command.  Also, the "next t" and "next a"
 commands would need to be replaced with a single "next a t" command.
 With these modifications, the 8 simulations of each script would run
 on the 3 partitions one after the other until all were finished.
 Initially, 3 simulations would be started simultaneously, one on each
 partition.  When one finished, that partition would then start
 the 4th simulation, and so forth, until all 8 were completed.
 
 :line
 
 6.5 Multi-replica simulations :link(howto_5),h4
 
 Several commands in LAMMPS run multi-replica simulations, meaning
 that multiple instances (replicas) of your simulation are run
 simultaneously, with small amounts of data exchanged between replicas
 periodically.
 
 These are the relevant commands:
 
 "neb"_neb.html for nudged elastic band calculations
 "prd"_prd.html for parallel replica dynamics
 "tad"_tad.html for temperature accelerated dynamics
 "temper"_temper.html for parallel tempering
 "fix pimd"_fix_pimd.html for path-integral molecular dynamics (PIMD) :ul
 
 NEB is a method for finding transition states and barrier energies.
 PRD and TAD are methods for performing accelerated dynamics to find
 and perform infrequent events.  Parallel tempering or replica exchange
 runs different replicas at a series of temperatures to facilitate
 rare-event sampling.
 
 These commands can only be used if LAMMPS was built with the REPLICA
 package.  See the "Making LAMMPS"_Section_start.html#start_3 section
 for more info on packages.
 
 PIMD runs different replicas whose individual particles are coupled
 together by springs to model a system of ring-polymers.
 
 This command can only be used if LAMMPS was built with the USER-MISC
 package.  See the "Making LAMMPS"_Section_start.html#start_3 section
 for more info on packages.
 
 In all these cases, you must run with one or more processors per
 replica.  The processors assigned to each replica are determined at
 run-time by using the "-partition command-line
 switch"_Section_start.html#start_7 to launch LAMMPS on multiple
 partitions, which in this context are the same as replicas.  E.g.
 these commands:
 
 mpirun -np 16 lmp_linux -partition 8x2 -in in.temper
 mpirun -np 8 lmp_linux -partition 8x1 -in in.neb :pre
 
 would each run 8 replicas, on either 16 or 8 processors.  Note the use
 of the "-in command-line switch"_Section_start.html#start_7 to specify
 the input script which is required when running in multi-replica mode.
 
 Also note that with MPI installed on a machine (e.g. your desktop),
 you can run on more (virtual) processors than you have physical
 processors.  Thus the above commands could be run on a
 single-processor (or few-processor) desktop so that you can run
 a multi-replica simulation on more replicas than you have
 physical processors.
 
 :line
 
 6.6 Granular models :link(howto_6),h4
 
 Granular systems are composed of spherical particles with a diameter,
 as opposed to point particles.  This means they have an angular
 velocity and torque can be imparted to them to cause them to rotate.
 
 To run a simulation of a granular model, you will want to use
 the following commands:
 
 "atom_style sphere"_atom_style.html
 "fix nve/sphere"_fix_nve_sphere.html
 "fix gravity"_fix_gravity.html :ul
 
 This compute
 
 "compute erotate/sphere"_compute_erotate_sphere.html :ul
 
 calculates rotational kinetic energy which can be "output with
 thermodynamic info"_Section_howto.html#howto_15.
 
 Use one of these 3 pair potentials, which compute forces and torques
 between interacting pairs of particles:
 
 "pair_style"_pair_style.html gran/history
 "pair_style"_pair_style.html gran/no_history
 "pair_style"_pair_style.html gran/hertzian :ul
 
 These commands implement fix options specific to granular systems:
 
 "fix freeze"_fix_freeze.html
 "fix pour"_fix_pour.html
 "fix viscous"_fix_viscous.html
 "fix wall/gran"_fix_wall_gran.html :ul
 
 The fix style {freeze} zeroes both the force and torque of frozen
 atoms, and should be used for granular systems instead of the fix style
 {setforce}.
 
 For computational efficiency, you can eliminate needless pairwise
 computations between frozen atoms by using this command:
 
 "neigh_modify"_neigh_modify.html exclude :ul
 
 :line
 
 6.7 TIP3P water model :link(howto_7),h4
 
 The TIP3P water model as implemented in CHARMM
 "(MacKerell)"_#howto-MacKerell specifies a 3-site rigid water molecule with
 charges and Lennard-Jones parameters assigned to each of the 3 atoms.
 In LAMMPS the "fix shake"_fix_shake.html command can be used to hold
 the two O-H bonds and the H-O-H angle rigid.  A bond style of
 {harmonic} and an angle style of {harmonic} or {charmm} should also be
 used.
 
 These are the additional parameters (in real units) to set for O and H
 atoms and the water molecule to run a rigid TIP3P-CHARMM model with a
 cutoff.  The K values can be used if a flexible TIP3P model (without
 fix shake) is desired.  If the LJ epsilon and sigma for HH and OH are
 set to 0.0, it corresponds to the original 1983 TIP3P model
 "(Jorgensen)"_#Jorgensen.
 
 O mass = 15.9994
 H mass = 1.008
 O charge = -0.834
 H charge = 0.417
 LJ epsilon of OO = 0.1521
 LJ sigma of OO = 3.1507
 LJ epsilon of HH = 0.0460
 LJ sigma of HH = 0.4000
 LJ epsilon of OH = 0.0836
 LJ sigma of OH = 1.7753
 K of OH bond = 450
 r0 of OH bond = 0.9572
 K of HOH angle = 55
 theta of HOH angle = 104.52 :all(b),p
 
 These are the parameters to use for TIP3P with a long-range Coulombic
 solver (e.g. Ewald or PPPM in LAMMPS), see "(Price)"_#Price for
 details:
 
 O mass = 15.9994
 H mass = 1.008
 O charge = -0.830
 H charge = 0.415
 LJ epsilon of OO = 0.102
 LJ sigma of OO = 3.188
 LJ epsilon, sigma of OH, HH = 0.0
 K of OH bond = 450
 r0 of OH bond = 0.9572
 K of HOH angle = 55
 theta of HOH angle = 104.52 :all(b),p
 
 Wikipedia also has a nice article on "water
 models"_http://en.wikipedia.org/wiki/Water_model.
 
 :line
 
 6.8 TIP4P water model :link(howto_8),h4
 
 The four-point TIP4P rigid water model extends the traditional
 three-point TIP3P model by adding an additional site, usually
 massless, where the charge associated with the oxygen atom is placed.
 This site M is located at a fixed distance away from the oxygen along
 the bisector of the HOH bond angle.  A bond style of {harmonic} and an
 angle style of {harmonic} or {charmm} should also be used.
 
 A TIP4P model is run with LAMMPS using either this command
 for a cutoff model:
 
 "pair_style lj/cut/tip4p/cut"_pair_lj.html
 
 or these two commands for a long-range model:
 
 "pair_style lj/cut/tip4p/long"_pair_lj.html
 "kspace_style pppm/tip4p"_kspace_style.html :ul
 
 For both models, the bond lengths and bond angles should be held fixed
 using the "fix shake"_fix_shake.html command.
 
 These are the additional parameters (in real units) to set for O and H
 atoms and the water molecule to run a rigid TIP4P model with a cutoff
 "(Jorgensen)"_#Jorgensen.  Note that the OM distance is specified in
 the "pair_style"_pair_style.html command, not as part of the pair
 coefficients.
 
 O mass = 15.9994
 H mass = 1.008
 O charge = -1.040
 H charge = 0.520
 r0 of OH bond = 0.9572
 theta of HOH angle = 104.52 
 OM distance = 0.15
 LJ epsilon of O-O = 0.1550
 LJ sigma of O-O = 3.1536
 LJ epsilon, sigma of OH, HH = 0.0
 Coulombic cutoff = 8.5 :all(b),p
 
 For the TIP4P/Ice model (J Chem Phys, 122, 234511 (2005);
 http://dx.doi.org/10.1063/1.1931662) these values can be used:
 
 O mass = 15.9994
 H mass =  1.008
 O charge = -1.1794
 H charge =  0.5897
 r0 of OH bond = 0.9572
 theta of HOH angle = 104.52
 OM distance = 0.1577
 LJ epsilon of O-O = 0.21084
 LJ sigma of O-O = 3.1668
 LJ epsilon, sigma of OH, HH = 0.0
 Coulombic cutoff = 8.5 :all(b),p
 
 For the TIP4P/2005 model (J Chem Phys, 123, 234505 (2005);
 http://dx.doi.org/10.1063/1.2121687), these values can be used:
 
 O mass = 15.9994
 H mass =  1.008
 O charge = -1.1128
 H charge = 0.5564
 r0 of OH bond = 0.9572
 theta of HOH angle = 104.52
 OM distance = 0.1546
 LJ epsilon of O-O = 0.1852
 LJ sigma of O-O = 3.1589
 LJ epsilon, sigma of OH, HH = 0.0
 Coulombic cutoff = 8.5 :all(b),p
 
 These are the parameters to use for TIP4P with a long-range Coulombic
 solver (e.g. Ewald or PPPM in LAMMPS):
 
 O mass = 15.9994
 H mass = 1.008
 O charge = -1.0484
 H charge = 0.5242
 r0 of OH bond = 0.9572
 theta of HOH angle = 104.52
 OM distance = 0.1250
 LJ epsilon of O-O = 0.16275
 LJ sigma of O-O = 3.16435
 LJ epsilon, sigma of OH, HH = 0.0 :all(b),p
 
 Note that when using the TIP4P pair style, the neighbor list
 cutoff for Coulomb interactions is effectively extended by a distance
 2 * (OM distance), to account for the offset distance of the
 fictitious charges on O atoms in water molecules.  Thus it is
 typically best in an efficiency sense to use a LJ cutoff >= Coulomb
 cutoff + 2*(OM distance), to shrink the size of the neighbor list.
 This leads to slightly larger cost for the long-range calculation, so
 you can test the trade-off for your model.  The OM distance and the LJ
 and Coulombic cutoffs are set in the "pair_style
 lj/cut/tip4p/long"_pair_lj.html command.
 
 Wikipedia also has a nice article on "water
 models"_http://en.wikipedia.org/wiki/Water_model.
 
 :line
 
 6.9 SPC water model :link(howto_9),h4
 
 The SPC water model specifies a 3-site rigid water molecule with
 charges and Lennard-Jones parameters assigned to each of the 3 atoms.
 In LAMMPS the "fix shake"_fix_shake.html command can be used to hold
 the two O-H bonds and the H-O-H angle rigid.  A bond style of
 {harmonic} and an angle style of {harmonic} or {charmm} should also be
 used.
 
 These are the additional parameters (in real units) to set for O and H
 atoms and the water molecule to run a rigid SPC model.
 
 O mass = 15.9994
 H mass = 1.008
 O charge = -0.820
 H charge = 0.410
 LJ epsilon of OO = 0.1553
 LJ sigma of OO = 3.166
 LJ epsilon, sigma of OH, HH = 0.0
 r0 of OH bond = 1.0
 theta of HOH angle = 109.47 :all(b),p
 
 Note that as originally proposed, the SPC model was run with a 9
 Angstrom cutoff for both LJ and Coulombic terms.  It can also be used
 with long-range Coulombics (Ewald or PPPM in LAMMPS), without changing
 any of the parameters above, though it becomes a different model in
 that mode of usage.
 
 The SPC/E (extended) water model is the same, except
 the partial charge assignments change:
 
 O charge = -0.8476
 H charge = 0.4238 :all(b),p
 
 See the "(Berendsen)"_#howto-Berendsen reference for more details on both
 the SPC and SPC/E models.
 
 Wikipedia also has a nice article on "water
 models"_http://en.wikipedia.org/wiki/Water_model.
 
 :line 
 
 6.10 Coupling LAMMPS to other codes :link(howto_10),h4
 
 LAMMPS is designed to allow it to be coupled to other codes.  For
 example, a quantum mechanics code might compute forces on a subset of
 atoms and pass those forces to LAMMPS.  Or a continuum finite element
 (FE) simulation might use atom positions as boundary conditions on FE
 nodal points, compute a FE solution, and return interpolated forces on
 MD atoms.
 
 LAMMPS can be coupled to other codes in at least 3 ways.  Each has
 advantages and disadvantages, which you'll have to think about in the
 context of your application.
 
 (1) Define a new "fix"_fix.html command that calls the other code.  In
 this scenario, LAMMPS is the driver code.  During its timestepping,
 the fix is invoked, and can make library calls to the other code,
 which has been linked to LAMMPS as a library.  This is the way the
 "POEMS"_poems package that performs constrained rigid-body motion on
 groups of atoms is hooked to LAMMPS.  See the
 "fix poems"_fix_poems.html command for more details.  See "this
 section"_Section_modify.html of the documentation for info on how to add
 a new fix to LAMMPS.
 
 :link(poems,http://www.rpi.edu/~anderk5/lab)
 
 (2) Define a new LAMMPS command that calls the other code.  This is
 conceptually similar to method (1), but in this case LAMMPS and the
 other code are on a more equal footing.  Note that now the other code
 is not called during the timestepping of a LAMMPS run, but between
 runs.  The LAMMPS input script can be used to alternate LAMMPS runs
 with calls to the other code, invoked via the new command.  The
 "run"_run.html command facilitates this with its {every} option, which
 makes it easy to run a few steps, invoke the command, run a few steps,
 invoke the command, etc.
 
 In this scenario, the other code can be called as a library, as in
 (1), or it could be a stand-alone code, invoked by a system() call
 made by the command (assuming your parallel machine allows one or more
 processors to start up another program).  In the latter case the
 stand-alone code could communicate with LAMMPS thru files that the
 command writes and reads.
 
 See "Section_modify"_Section_modify.html of the documentation for how
 to add a new command to LAMMPS.
 
 (3) Use LAMMPS as a library called by another code.  In this case the
 other code is the driver and calls LAMMPS as needed.  Or a wrapper
 code could link and call both LAMMPS and another code as libraries.
 Again, the "run"_run.html command has options that allow it to be
 invoked with minimal overhead (no setup or clean-up) if you wish to do
 multiple short runs, driven by another program.
 
 Examples of driver codes that call LAMMPS as a library are included in
 the examples/COUPLE directory of the LAMMPS distribution; see
 examples/COUPLE/README for more details:
 
 simple: simple driver programs in C++ and C which invoke LAMMPS as a
 library :ulb,l
 
 lammps_quest: coupling of LAMMPS and "Quest"_quest, to run classical
 MD with quantum forces calculated by a density functional code :l
 
 lammps_spparks: coupling of LAMMPS and "SPPARKS"_spparks, to couple
 a kinetic Monte Carlo model for grain growth using MD to calculate
 strain induced across grain boundaries :l,ule
 
 :link(quest,http://dft.sandia.gov/Quest)
 :link(spparks,http://www.sandia.gov/~sjplimp/spparks.html)
 
 "This section"_Section_start.html#start_5 of the documentation
 describes how to build LAMMPS as a library.  Once this is done, you
 can interface with LAMMPS either via C++, C, Fortran, or Python (or
 any other language that supports a vanilla C-like interface).  For
 example, from C++ you could create one (or more) "instances" of
 LAMMPS, pass it an input script to process, or execute individual
 commands, all by invoking the correct class methods in LAMMPS.  From C
 or Fortran you can make function calls to do the same things.  See
 "Section_python"_Section_python.html of the manual for a description
 of the Python wrapper provided with LAMMPS that operates through the
 LAMMPS library interface.
 
 The files src/library.cpp and library.h contain the C-style interface
 to LAMMPS.  See "Section_howto 19"_Section_howto.html#howto_19 of the
 manual for a description of the interface and how to extend it for
 your needs.
 
 Note that the lammps_open() function that creates an instance of
 LAMMPS takes an MPI communicator as an argument.  This means that
 instance of LAMMPS will run on the set of processors in the
 communicator.  Thus the calling code can run LAMMPS on all or a subset
 of processors.  For example, a wrapper script might decide to
 alternate between LAMMPS and another code, allowing them both to run
 on all the processors.  Or it might allocate half the processors to
 LAMMPS and half to the other code and run both codes simultaneously
 before syncing them up periodically.  Or it might instantiate multiple
 instances of LAMMPS to perform different calculations.
 
 :line 
 
 6.11 Visualizing LAMMPS snapshots :link(howto_11),h4
 
 LAMMPS itself does not do visualization, but snapshots from LAMMPS
 simulations can be visualized (and analyzed) in a variety of ways.
 
 LAMMPS snapshots are created by the "dump"_dump.html command which can
 create files in several formats.  The native LAMMPS dump format is a
 text file (see "dump atom" or "dump custom") which can be visualized
 by the "xmovie"_Section_tools.html#xmovie program, included with the
 LAMMPS package.  This produces simple, fast 2d projections of 3d
 systems, and can be useful for rapid debugging of simulation geometry
 and atom trajectories.
 
 Several programs included with LAMMPS as auxiliary tools can convert
 native LAMMPS dump files to other formats.  See the
 "Section_tools"_Section_tools.html doc page for details.  The first is
 the "ch2lmp tool"_Section_tools.html#charmm, which contains a
 lammps2pdb Perl script which converts LAMMPS dump files into PDB
 files.  The second is the "lmp2arc tool"_Section_tools.html#arc which
 converts LAMMPS dump files into Accelrys' Insight MD program files.
 The third is the "lmp2cfg tool"_Section_tools.html#cfg which converts
 LAMMPS dump files into CFG files which can be read into the
 "AtomEye"_atomeye visualizer.
 
 A Python-based toolkit distributed by our group can read native LAMMPS
 dump files, including custom dump files with additional columns of
 user-specified atom information, and convert them to various formats
 or pipe them into visualization software directly.  See the "Pizza.py
 WWW site"_pizza for details.  Specifically, Pizza.py can convert
 LAMMPS dump files into PDB, XYZ, "Ensight"_ensight, and VTK formats.
 Pizza.py can pipe LAMMPS dump files directly into the Raster3d and
 RasMol visualization programs.  Pizza.py has tools that do interactive
 3d OpenGL visualization and one that creates SVG images of dump file
 snapshots.
 
 LAMMPS can create XYZ files directly (via "dump xyz") which is a
 simple text-based file format used by many visualization programs
 including "VMD"_vmd.
 
 LAMMPS can create DCD files directly (via "dump dcd") which can be
 read by "VMD"_vmd in conjunction with a CHARMM PSF file.  Using this
 form of output avoids the need to convert LAMMPS snapshots to PDB
 files.  See the "dump"_dump.html command for more information on DCD
 files.
 
 LAMMPS can create XTC files directly (via "dump xtc") which is a GROMACS
 file format which can also be read by "VMD"_vmd for visualization.
 See the "dump"_dump.html command for more information on XTC files.
 
 :link(pizza,http://www.sandia.gov/~sjplimp/pizza.html)
 :link(vmd,http://www.ks.uiuc.edu/Research/vmd)
 :link(ensight,http://www.ensight.com)
 :link(atomeye,http://mt.seas.upenn.edu/Archive/Graphics/A)
 
 :line
 
 6.12 Triclinic (non-orthogonal) simulation boxes :link(howto_12),h4
 
 By default, LAMMPS uses an orthogonal simulation box to encompass the
 particles.  The "boundary"_boundary.html command sets the boundary
 conditions of the box (periodic, non-periodic, etc).  The orthogonal
 box has its "origin" at (xlo,ylo,zlo) and is defined by 3 edge vectors
 starting from the origin given by [a] = (xhi-xlo,0,0); [b] =
 (0,yhi-ylo,0); [c] = (0,0,zhi-zlo).  The 6 parameters
 (xlo,xhi,ylo,yhi,zlo,zhi) are defined at the time the simulation box
 is created, e.g. by the "create_box"_create_box.html or
 "read_data"_read_data.html or "read_restart"_read_restart.html
 commands.  Additionally, LAMMPS defines box size parameters lx,ly,lz
 where lx = xhi-xlo, and similarly in the y and z dimensions.  The 6
 parameters, as well as lx,ly,lz, can be output via the "thermo_style
 custom"_thermo_style.html command.
 
 LAMMPS also allows simulations to be performed in triclinic
 (non-orthogonal) simulation boxes shaped as a parallelepiped with
 triclinic symmetry.  The parallelepiped has its "origin" at
 (xlo,ylo,zlo) and is defined by 3 edge vectors starting from the
 origin given by [a] = (xhi-xlo,0,0); [b] = (xy,yhi-ylo,0); [c] =
 (xz,yz,zhi-zlo).  {xy,xz,yz} can be 0.0 or positive or negative values
 and are called "tilt factors" because they are the amount of
 displacement applied to faces of an originally orthogonal box to
 transform it into the parallelepiped.  In LAMMPS the triclinic
 simulation box edge vectors [a], [b], and [c] cannot be arbitrary
 vectors.  As indicated, [a] must lie on the positive x axis.  [b] must
 lie in the xy plane, with strictly positive y component. [c] may have
 any orientation with strictly positive z component.  The requirement
 that [a], [b], and [c] have strictly positive x, y, and z components,
 respectively, ensures that [a], [b], and [c] form a complete
 right-handed basis.  These restrictions impose no loss of generality,
 since it is possible to rotate/invert any set of 3 crystal basis
 vectors so that they conform to the restrictions.
 
 For example, assume that the 3 vectors [A],[B],[C] are the edge
 vectors of a general parallelepiped, where there is no restriction on
 [A],[B],[C] other than they form a complete right-handed basis i.e.
 [A] x [B] . [C] > 0.  The equivalent LAMMPS [a],[b],[c] are a linear
 rotation of [A], [B], and [C] and can be computed as follows:
 
 :c,image(Eqs/transform.jpg)
 
 where A = |[A]| indicates the scalar length of [A]. The ^ hat symbol
 indicates the corresponding unit vector. {beta} and {gamma} are angles
 between the vectors described below. Note that by construction, 
 [a], [b], and [c] have strictly positive x, y, and z components, respectively.
 If it should happen that
 [A], [B], and [C] form a left-handed basis, then the above equations
 are not valid for [c]. In this case, it is necessary
 to first apply an inversion. This can be achieved
 by interchanging two basis vectors or by changing the sign of one of them.
 
 For consistency, the same rotation/inversion applied to the basis vectors
 must also be applied to atom positions, velocities, 
 and any other vector quantities.
 This can be conveniently achieved by first converting to 
 fractional coordinates in the
 old basis and then converting to distance coordinates in the new basis.
 The transformation is given by the following equation:
 
 :c,image(Eqs/rotate.jpg)
 
 where {V} is the volume of the box, [X] is the original vector quantity and 
 [x] is the vector in the LAMMPS basis. 
 
 There is no requirement that a triclinic box be periodic in any
 dimension, though it typically should be in at least the 2nd dimension
 of the tilt (y in xy) if you want to enforce a shift in periodic
 boundary conditions across that boundary.  Some commands that work
 with triclinic boxes, e.g. the "fix deform"_fix_deform.html and "fix
 npt"_fix_nh.html commands, require periodicity or non-shrink-wrap
 boundary conditions in specific dimensions.  See the command doc pages
 for details.
 
 The 9 parameters (xlo,xhi,ylo,yhi,zlo,zhi,xy,xz,yz) are defined at the
 time the simulation box is created.  This happens in one of 3 ways.
 If the "create_box"_create_box.html command is used with a region of
 style {prism}, then a triclinic box is setup.  See the
 "region"_region.html command for details.  If the
 "read_data"_read_data.html command is used to define the simulation
 box, and the header of the data file contains a line with the "xy xz
 yz" keyword, then a triclinic box is setup.  See the
 "read_data"_read_data.html command for details.  Finally, if the
 "read_restart"_read_restart.html command reads a restart file which
 was written from a simulation using a triclinic box, then a triclinic
 box will be setup for the restarted simulation.
 
 Note that you can define a triclinic box with all 3 tilt factors =
 0.0, so that it is initially orthogonal.  This is necessary if the box
 will become non-orthogonal, e.g. due to the "fix npt"_fix_nh.html or
 "fix deform"_fix_deform.html commands.  Alternatively, you can use the
 "change_box"_change_box.html command to convert a simulation box from
 orthogonal to triclinic and vice versa.
 
 As with orthogonal boxes, LAMMPS defines triclinic box size parameters
 lx,ly,lz where lx = xhi-xlo, and similarly in the y and z dimensions.
 The 9 parameters, as well as lx,ly,lz, can be output via the
 "thermo_style custom"_thermo_style.html command.
 
 To avoid extremely tilted boxes (which would be computationally
 inefficient), LAMMPS normally requires that no tilt factor can skew
 the box more than half the distance of the parallel box length, which
 is the 1st dimension in the tilt factor (x for xz).  This is required
 both when the simulation box is created, e.g. via the
 "create_box"_create_box.html or "read_data"_read_data.html commands,
 as well as when the box shape changes dynamically during a simulation,
 e.g. via the "fix deform"_fix_deform.html or "fix npt"_fix_nh.html
 commands.
 
 For example, if xlo = 2 and xhi = 12, then the x box length is 10 and
 the xy tilt factor must be between -5 and 5.  Similarly, xz must be
 between -(xhi-xlo)/2 and +(xhi-xlo)/2, and yz must be between
 -(yhi-ylo)/2 and +(yhi-ylo)/2.  Note that this is
 not a limitation, since if the maximum tilt factor is 5 (as in this
 example), then configurations with tilt = ..., -15, -5, 5, 15, 25,
 ... are geometrically all equivalent.  If the box tilt exceeds this
 limit during a dynamics run (e.g. via the "fix deform"_fix_deform.html
 command), then the box is "flipped" to an equivalent shape with a tilt
 factor within the bounds, so the run can continue.  See the "fix
 deform"_fix_deform.html doc page for further details.
 
 One exception to this rule is if the 1st dimension in the tilt
 factor (x for xy) is non-periodic.  In that case, the limits on the
 tilt factor are not enforced, since flipping the box in that dimension
 does not change the atom positions due to non-periodicity.  In this
 mode, if you tilt the system to extreme angles, the simulation will
 simply become inefficient, due to the highly skewed simulation box.
 
 The limitation on not creating a simulation box with a tilt factor
 skewing the box more than half the distance of the parallel box length
 can be overridden via the "box"_box.html command.  Setting the {tilt}
 keyword to {large} allows any tilt factors to be specified.
 
 Box flips that may occur using the "fix deform"_fix_deform.html or
 "fix npt"_fix_nh.html commands can be turned off using the {flip no}
 option with either of the commands.
 
 Note that if a simulation box has a large tilt factor, LAMMPS will run
 less efficiently, due to the large volume of communication needed to
 acquire ghost atoms around a processor's irregular-shaped sub-domain.
 For extreme values of tilt, LAMMPS may also lose atoms and generate an
 error.
 
 Triclinic crystal structures are often defined using three lattice
 constants {a}, {b}, and {c}, and three angles {alpha}, {beta} and
 {gamma}. Note that in this nomenclature, the a, b, and c lattice
 constants are the scalar lengths of the edge vectors [a], [b], and [c]
 defined above.  The relationship between these 6 quantities
 (a,b,c,alpha,beta,gamma) and the LAMMPS box sizes (lx,ly,lz) =
 (xhi-xlo,yhi-ylo,zhi-zlo) and tilt factors (xy,xz,yz) is as follows:
 
 :c,image(Eqs/box.jpg) 
 
 The inverse relationship can be written as follows:
 
 :c,image(Eqs/box_inverse.jpg) 
 
 The values of {a}, {b}, {c} , {alpha}, {beta} , and {gamma} can be printed 
 out or accessed by computes using the 
 "thermo_style custom"_thermo_style.html keywords 
 {cella}, {cellb}, {cellc}, {cellalpha}, {cellbeta}, {cellgamma},
 respectively. 
 
 As discussed on the "dump"_dump.html command doc page, when the BOX
 BOUNDS for a snapshot is written to a dump file for a triclinic box,
 an orthogonal bounding box which encloses the triclinic simulation box
 is output, along with the 3 tilt factors (xy, xz, yz) of the triclinic
 box, formatted as follows:
 
 ITEM: BOX BOUNDS xy xz yz
 xlo_bound xhi_bound xy
 ylo_bound yhi_bound xz
 zlo_bound zhi_bound yz :pre
 
 This bounding box is convenient for many visualization programs and is
 calculated from the 9 triclinic box parameters
 (xlo,xhi,ylo,yhi,zlo,zhi,xy,xz,yz) as follows:
 
 xlo_bound = xlo + MIN(0.0,xy,xz,xy+xz)
 xhi_bound = xhi + MAX(0.0,xy,xz,xy+xz)
 ylo_bound = ylo + MIN(0.0,yz)
 yhi_bound = yhi + MAX(0.0,yz)
 zlo_bound = zlo
 zhi_bound = zhi :pre
 
 These formulas can be inverted if you need to convert the bounding box
 back into the triclinic box parameters, e.g. xlo = xlo_bound -
 MIN(0.0,xy,xz,xy+xz).
 
 One use of triclinic simulation boxes is to model solid-state crystals
 with triclinic symmetry.  The "lattice"_lattice.html command can be
 used with non-orthogonal basis vectors to define a lattice that will
 tile a triclinic simulation box via the
 "create_atoms"_create_atoms.html command.
 
 A second use is to run Parrinello-Rahman dynamics via the "fix
 npt"_fix_nh.html command, which will adjust the xy, xz, yz tilt
 factors to compensate for off-diagonal components of the pressure
 tensor.  The analog for an "energy minimization"_minimize.html is
 the "fix box/relax"_fix_box_relax.html command.
 
 A third use is to shear a bulk solid to study the response of the
 material.  The "fix deform"_fix_deform.html command can be used for
 this purpose.  It allows dynamic control of the xy, xz, yz tilt
 factors as a simulation runs.  This is discussed in the next section
 on non-equilibrium MD (NEMD) simulations.
 
 :line
 
 6.13 NEMD simulations :link(howto_13),h4
 
 Non-equilibrium molecular dynamics or NEMD simulations are typically
 used to measure a fluid's rheological properties such as viscosity.
 In LAMMPS, such simulations can be performed by first setting up a
 non-orthogonal simulation box (see the preceding Howto section).
 
 A shear strain can be applied to the simulation box at a desired
 strain rate by using the "fix deform"_fix_deform.html command.  The
 "fix nvt/sllod"_fix_nvt_sllod.html command can be used to thermostat
 the sheared fluid and integrate the SLLOD equations of motion for the
 system.  Fix nvt/sllod uses "compute
 temp/deform"_compute_temp_deform.html to compute a thermal temperature
 by subtracting out the streaming velocity of the shearing atoms.  The
 velocity profile or other properties of the fluid can be monitored via
 the "fix ave/spatial"_fix_ave_spatial.html command.
 
 As discussed in the previous section on non-orthogonal simulation
 boxes, the amount of tilt or skew that can be applied is limited by
 LAMMPS for computational efficiency to be 1/2 of the parallel box
 length.  However, "fix deform"_fix_deform.html can continuously strain
 a box by an arbitrary amount.  As discussed in the "fix
 deform"_fix_deform.html command, when the tilt value reaches a limit,
 the box is flipped to the opposite limit which is an equivalent tiling
 of periodic space.  The strain rate can then continue to change as
 before.  In a long NEMD simulation these box re-shaping events may
 occur many times.
 
 In a NEMD simulation, the "remap" option of "fix
 deform"_fix_deform.html should be set to "remap v", since that is what
 "fix nvt/sllod"_fix_nvt_sllod.html assumes to generate a velocity
 profile consistent with the applied shear strain rate.
 
 An alternative method for calculating viscosities is provided via the
 "fix viscosity"_fix_viscosity.html command.
 
 :line
 
 6.14 Finite-size spherical and aspherical particles :link(howto_14),h4
 
 Typical MD models treat atoms or particles as point masses.  Sometimes
 it is desirable to have a model with finite-size particles such as
 spheroids or ellipsoids or generalized aspherical bodies.  The
 difference is that such particles have a moment of inertia, rotational
 energy, and angular momentum.  Rotation is induced by torque coming
 from interactions with other particles.
 
 LAMMPS has several options for running simulations with these kinds of
 particles.  The following aspects are discussed in turn:
 
 atom styles
 pair potentials
 time integration
 computes, thermodynamics, and dump output
 rigid bodies composed of finite-size particles :ul
 
 Example input scripts for these kinds of models are in the body,
 colloid, dipole, ellipse, line, peri, pour, and tri directories of the
 "examples directory"_Section_example.html in the LAMMPS distribution.
 
 Atom styles :h5
 
 There are several "atom styles"_atom_style.html that allow for
 definition of finite-size particles: sphere, dipole, ellipsoid, line,
 tri, peri, and body.
 
 The sphere style defines particles that are spheroids and each
 particle can have a unique diameter and mass (or density).  These
 particles store an angular velocity (omega) and can be acted upon by
 torque.  The "set" command can be used to modify the diameter and mass
 of individual particles, after they are created.
 
 The dipole style does not actually define finite-size particles, but
 is often used in conjunction with spherical particles, via a command
 like
 
 atom_style hybrid sphere dipole :pre
 
 This is because when dipoles interact with each other, they induce
 torques, and a particle must be finite-size (i.e. have a moment of
 inertia) in order to respond and rotate.  See the "atom_style
 dipole"_atom_style.html command for details.  The "set" command can be
 used to modify the orientation and length of the dipole moment of
 individual particles, after they are created.
 
 The ellipsoid style defines particles that are ellipsoids and thus can
 be aspherical.  Each particle has a shape, specified by 3 diameters,
 and mass (or density).  These particles store an angular momentum and
 their orientation (quaternion), and can be acted upon by torque.  They
 do not store an angular velocity (omega), which can be in a different
 direction than angular momentum, rather they compute it as needed.
 The "set" command can be used to modify the diameter, orientation, and
 mass of individual particles, after they are created.  Its doc page
 also has a brief explanation of what quaternions are.
 
 The line style defines line segment particles with two end points and
 a mass (or density).  They can be used in 2d simulations, and they can
 be joined together to form rigid bodies which represent arbitrary
 polygons.
 
 The tri style defines triangular particles with three corner points
 and a mass (or density).  They can be used in 3d simulations, and they
 can be joined together to form rigid bodies which represent arbitrary
 particles with a triangulated surface.
 
 The peri style is used with "Peridynamic models"_pair_peri.html and
 defines particles as having a volume, that is used internally in the
 "pair_style peri"_pair_peri.html potentials.
 
 The body style allows for definition of particles which can represent
 complex entities, such as surface meshes of discrete points,
 collections of sub-particles, deformable objects, etc.  The body style
 is discussed in more detail on the "body"_body.html doc page.
 
 Note that if one of these atom styles is used (or multiple styles via
 the "atom_style hybrid"_atom_style.html command), not all particles in
 the system are required to be finite-size or aspherical.
 
 For example, in the ellipsoid style, if the 3 shape parameters are set
 to the same value, the particle will be a sphere rather than an
 ellipsoid.  If the 3 shape parameters are all set to 0.0 or if the
 diameter is set to 0.0, it will be a point particle.  In the line or
 tri style, if the lineflag or triflag is specified as 0, then it
 will be a point particle.
 
 Some of the pair styles used to compute pairwise interactions between
 finite-size particles also compute the correct interaction with point
 particles as well, e.g. the interaction between a point particle and a
 finite-size particle or between two point particles.  If necessary,
 "pair_style hybrid"_pair_hybrid.html can be used to ensure the correct
 interactions are computed for the appropriate style of interactions.
 Likewise, using groups to partition particles (ellipsoids versus
 spheres versus point particles) will allow you to use the appropriate
 time integrators and temperature computations for each class of
 particles.  See the doc pages for various commands for details.
 
 Also note that for "2d simulations"_dimension.html, atom styles sphere
 and ellipsoid still use 3d particles, rather than circular disks or
 ellipses.  This means they have the same moment of inertia as the 3d
 object.  When temperature is computed, the correct degrees of freedom
 are used for rotation in a 2d versus 3d system.
 
 Pair potentials :h5
 
 When a system with finite-size particles is defined, the particles
 will only rotate and experience torque if the force field computes
 such interactions.  These are the various "pair
 styles"_pair_style.html that generate torque:
 
 "pair_style gran/history"_pair_gran.html
 "pair_style gran/hertzian"_pair_gran.html
 "pair_style gran/no_history"_pair_gran.html
 "pair_style dipole/cut"_pair_dipole.html
 "pair_style gayberne"_pair_gayberne.html
 "pair_style resquared"_pair_resquared.html
 "pair_style brownian"_pair_brownian.html
 "pair_style lubricate"_pair_lubricate.html
 "pair_style line/lj"_pair_line_lj.html
 "pair_style tri/lj"_pair_tri_lj.html
 "pair_style body"_pair_body.html :ul
 
 The granular pair styles are used with spherical particles.  The
 dipole pair style is used with the dipole atom style, which could be
 applied to spherical or ellipsoidal particles.  The GayBerne and
 REsquared potentials require ellipsoidal particles, though they will
 also work if the 3 shape parameters are the same (a sphere).  The
 Brownian and lubrication potentials are used with spherical particles.
 The line, tri, and body potentials are used with line segment,
 triangular, and body particles respectively.
 
 Time integration :h5
 
 There are several fixes that perform time integration on finite-size
 spherical particles, meaning the integrators update the rotational
 orientation and angular velocity or angular momentum of the particles:
 
 "fix nve/sphere"_fix_nve_sphere.html
 "fix nvt/sphere"_fix_nvt_sphere.html
 "fix npt/sphere"_fix_npt_sphere.html :ul
 
 Likewise, there are 3 fixes that perform time integration on
 ellipsoidal particles:
 
 "fix nve/asphere"_fix_nve_asphere.html
 "fix nvt/asphere"_fix_nvt_asphere.html
 "fix npt/asphere"_fix_npt_asphere.html :ul
 
 The advantage of these fixes is that those which thermostat the
 particles include the rotational degrees of freedom in the temperature
 calculation and thermostatting.  The "fix langevin"_fix_langevin.html
 command can also be used with its {omega} or {angmom} options to
 thermostat the rotational degrees of freedom for spherical or
 ellipsoidal particles.  Other thermostatting fixes only operate on the
 translational kinetic energy of finite-size particles.
 
 These fixes perform constant NVE time integration on line segment,
 triangular, and body particles:
 
 "fix nve/line"_fix_nve_line.html
 "fix nve/tri"_fix_nve_tri.html
 "fix nve/body"_fix_nve_body.html :ul
 
 Note that for mixtures of point and finite-size particles, these
 integration fixes can only be used with "groups"_group.html which
 contain finite-size particles.
 
 Computes, thermodynamics, and dump output :h5
 
 There are several computes that calculate the temperature or
 rotational energy of spherical or ellipsoidal particles:
 
 "compute temp/sphere"_compute_temp_sphere.html
 "compute temp/asphere"_compute_temp_asphere.html
 "compute erotate/sphere"_compute_erotate_sphere.html
 "compute erotate/asphere"_compute_erotate_asphere.html :ul
 
 These include rotational degrees of freedom in their computation.  If
 you wish the thermodynamic output of temperature or pressure to use
 one of these computes (e.g. for a system entirely composed of
 finite-size particles), then the compute can be defined and the
 "thermo_modify"_thermo_modify.html command used.  Note that by default
 thermodynamic quantities will be calculated with a temperature that
 only includes translational degrees of freedom.  See the
 "thermo_style"_thermo_style.html command for details.
 
 These commands can be used to output various attributes of finite-size
 particles:
 
 "dump custom"_dump.html
 "compute property/atom"_compute_property_atom.html
 "dump local"_dump.html
 "compute body/local"_compute_body_local.html :ul
 
 Attributes include the dipole moment, the angular velocity, the
 angular momentum, the quaternion, the torque, the end-point and
 corner-point coordinates (for line and tri particles), and
 sub-particle attributes of body particles.
 
 Rigid bodies composed of finite-size particles :h5
 
 The "fix rigid"_fix_rigid.html command treats a collection of
 particles as a rigid body, computes its inertia tensor, sums the total
 force and torque on the rigid body each timestep due to forces on its
 constituent particles, and integrates the motion of the rigid body.
 
 If any of the constituent particles of a rigid body are finite-size
 particles (spheres or ellipsoids or line segments or triangles), then
 their contribution to the inertia tensor of the body is different than
 if they were point particles.  This means the rotational dynamics of
 the rigid body will be different.  Thus a model of a dimer is
 different if the dimer consists of two point masses versus two
 spheroids, even if the two particles have the same mass.  Finite-size
 particles that experience torque due to their interaction with other
 particles will also impart that torque to a rigid body they are part
 of.
 
 See the "fix rigid"_fix_rigid.html command for examples of complex
 rigid-body models it is possible to define in LAMMPS.
 
 Note that the "fix shake"_fix_shake.html command can also be used to
 treat 2, 3, or 4 particles as a rigid body, but it always assumes the
 particles are point masses.
 
 Also note that body particles cannot be modeled with the "fix
 rigid"_fix_rigid.html command.  Body particles are treated by LAMMPS
 as single particles, though they can store internal state, such as a
 list of sub-particles.  Individual body particles are typically treated
 as rigid bodies, and their motion integrated with a command like "fix
 nve/body"_fix_nve_body.html.  Interactions between pairs of body
 particles are computed via a command like "pair_style
 body"_pair_body.html.
 
 :line
 
 6.15 Output from LAMMPS (thermo, dumps, computes, fixes, variables) :link(howto_15),h4
 
 There are four basic kinds of LAMMPS output:
 
 "Thermodynamic output"_thermo_style.html, which is a list
 of quantities printed every few timesteps to the screen and logfile. :ulb,l
 
 "Dump files"_dump.html, which contain snapshots of atoms and various
 per-atom values and are written at a specified frequency. :l
 
 Certain fixes can output user-specified quantities to files: "fix
 ave/time"_fix_ave_time.html for time averaging, "fix
 ave/chunk"_fix_ave_chunk.html for spatial or other averaging, and "fix
 print"_fix_print.html for single-line output of
 "variables"_variable.html.  Fix print can also output to the
 screen. :l
 
 "Restart files"_restart.html. :l,ule
 
 A simulation prints one set of thermodynamic output and (optionally)
 restart files.  It can generate any number of dump files and fix
 output files, depending on what "dump"_dump.html and "fix"_fix.html
 commands you specify.
 
 As discussed below, LAMMPS gives you a variety of ways to determine
 what quantities are computed and printed when the thermodynamics,
 dump, or fix commands listed above perform output.  Throughout this
 discussion, note that users can also "add their own computes and fixes
 to LAMMPS"_Section_modify.html which can then generate values that can
 then be output with these commands.
 
 The following sub-sections discuss different LAMMPS commands related
 to output and the kind of data they operate on and produce:
 
 "Global/per-atom/local data"_#global
 "Scalar/vector/array data"_#scalar
 "Thermodynamic output"_#thermo
 "Dump file output"_#dump
 "Fixes that write output files"_#fixoutput
 "Computes that process output quantities"_#computeoutput
 "Fixes that process output quantities"_#fixprocoutput
 "Computes that generate values to output"_#compute
 "Fixes that generate values to output"_#fix
 "Variables that generate values to output"_#variable
 "Summary table of output options and data flow between commands"_#table :ul
 
 Global/per-atom/local data :h5,link(global)
 
 Various output-related commands work with three different styles of
 data: global, per-atom, or local.  A global datum is one or more
 system-wide values, e.g. the temperature of the system.  A per-atom
 datum is one or more values per atom, e.g. the kinetic energy of each
 atom.  Local datums are calculated by each processor based on the
 atoms it owns, but there may be zero or more per atom, e.g. a list of
 bond distances.
 
 Scalar/vector/array data :h5,link(scalar)
 
 Global, per-atom, and local datums can each come in three kinds: a
 single scalar value, a vector of values, or a 2d array of values.  The
 doc page for a "compute" or "fix" or "variable" that generates data
 will specify both the style and kind of data it produces, e.g. a
 per-atom vector.
 
 When a quantity is accessed, as in many of the output commands
 discussed below, it can be referenced via the following bracket
 notation, where ID in this case is the ID of a compute.  The leading
 "c_" would be replaced by "f_" for a fix, or "v_" for a variable:
 
 c_ID | entire scalar, vector, or array
 c_ID\[I\] | one element of vector, one column of array
 c_ID\[I\]\[J\] | one element of array :tb(s=|)
 
 In other words, using one bracket reduces the dimension of the data
 once (vector -> scalar, array -> vector).  Using two brackets reduces
 the dimension twice (array -> scalar).  Thus a command that uses
 scalar values as input can typically also process elements of a vector
 or array.
 
 Thermodynamic output :h5,link(thermo)
 
 The frequency and format of thermodynamic output is set by the
 "thermo"_thermo.html, "thermo_style"_thermo_style.html, and
 "thermo_modify"_thermo_modify.html commands.  The
 "thermo_style"_thermo_style.html command also specifies what values
 are calculated and written out.  Pre-defined keywords can be specified
 (e.g. press, etotal, etc).  Three additional kinds of keywords can
 also be specified (c_ID, f_ID, v_name), where a "compute"_compute.html
 or "fix"_fix.html or "variable"_variable.html provides the value to be
 output.  In each case, the compute, fix, or variable must generate
 global values for input to the "thermo_style custom"_thermo_style.html
 command.
 
 Note that thermodynamic output values can be "extensive" or
 "intensive".  The former scale with the number of atoms in the system
 (e.g. total energy), the latter do not (e.g. temperature).  The
 setting for "thermo_modify norm"_thermo_modify.html determines whether
 extensive quantities are normalized or not.  Computes and fixes
 produce either extensive or intensive values; see their individual doc
 pages for details.  "Equal-style variables"_variable.html produce only
 intensive values; you can include a division by "natoms" in the
 formula if desired, to make an extensive calculation produce an
 intensive result.
 
 Dump file output :h5,link(dump)
 
 Dump file output is specified by the "dump"_dump.html and
 "dump_modify"_dump_modify.html commands.  There are several
 pre-defined formats (dump atom, dump xtc, etc).
 
 There is also a "dump custom"_dump.html format where the user
 specifies what values are output with each atom.  Pre-defined atom
 attributes can be specified (id, x, fx, etc).  Three additional kinds
 of keywords can also be specified (c_ID, f_ID, v_name), where a
 "compute"_compute.html or "fix"_fix.html or "variable"_variable.html
 provides the values to be output.  In each case, the compute, fix, or
 variable must generate per-atom values for input to the "dump
 custom"_dump.html command.
 
 There is also a "dump local"_dump.html format where the user specifies
 what local values to output.  A pre-defined index keyword can be
 specified to enumerate the local values.  Two additional kinds of
 keywords can also be specified (c_ID, f_ID), where a
 "compute"_compute.html or "fix"_fix.html
 provides the values to be output.  In each case, the compute or fix
 must generate local values for input to the "dump local"_dump.html
 command.
 
 Fixes that write output files :h5,link(fixoutput)
 
 Several fixes take various quantities as input and can write output
 files: "fix ave/time"_fix_ave_time.html, "fix
 ave/chunk"_fix_ave_chunk.html, "fix ave/histo"_fix_ave_histo.html,
 "fix ave/correlate"_fix_ave_correlate.html, and "fix
 print"_fix_print.html.
 
 The "fix ave/time"_fix_ave_time.html command enables direct output to
 a file and/or time-averaging of global scalars or vectors.  The user
 specifies one or more quantities as input.  These can be global
 "compute"_compute.html values, global "fix"_fix.html values, or
 "variables"_variable.html of any style except the atom style which
 produces per-atom values.  Since a variable can refer to keywords used
 by the "thermo_style custom"_thermo_style.html command (like temp or
 press) and individual per-atom values, a wide variety of quantities
 can be time averaged and/or output in this way.  If the inputs are one
 or more scalar values, then the fix generates a global scalar or vector
 of output.  If the inputs are one or more vector values, then the fix
 generates a global vector or array of output.  The time-averaged
 output of this fix can also be used as input to other output commands.
 
 The "fix ave/chunk"_fix_ave_chunk.html command enables direct output
 to a file of chunk-averaged per-atom quantities like those output in
 dump files.  Chunks can represent spatial bins or other collections of
 atoms, e.g. individual molecules.  The per-atom quantities can be atom
 density (mass or number) or atom attributes such as position,
 velocity, force.  They can also be per-atom quantities calculated by a
 "compute"_compute.html, by a "fix"_fix.html, or by an atom-style
 "variable"_variable.html.  The chunk-averaged output of this fix can
 also be used as input to other output commands.
 
 The "fix ave/histo"_fix_ave_histo.html command enables direct output
 to a file of histogrammed quantities, which can be global or per-atom
 or local quantities.  The histogram output of this fix can also be
 used as input to other output commands.
 
 The "fix ave/correlate"_fix_ave_correlate.html command enables direct
 output to a file of time-correlated quantities, which can be global
 values.  The correlation matrix output of this fix can also be used as
 input to other output commands.
 
 The "fix print"_fix_print.html command can generate a line of output
 written to the screen and log file or to a separate file, periodically
 during a running simulation.  The line can contain one or more
 "variable"_variable.html values for any style variable except the
 vector or atom styles.  As explained above, variables themselves can
 contain references to global values generated by "thermodynamic
 keywords"_thermo_style.html, "computes"_compute.html,
 "fixes"_fix.html, or other "variables"_variable.html, or to per-atom
 values for a specific atom.  Thus the "fix print"_fix_print.html
 command is a means to output a wide variety of quantities separate
 from normal thermodynamic or dump file output.
 
 Computes that process output quantities :h5,link(computeoutput)
 
 The "compute reduce"_compute_reduce.html and "compute
 reduce/region"_compute_reduce.html commands take one or more per-atom
 or local vector quantities as inputs and "reduce" them (sum, min, max,
 ave) to scalar quantities.  These are produced as output values which
 can be used as input to other output commands.
 
 The "compute slice"_compute_slice.html command takes one or more global
 vector or array quantities as inputs and extracts a subset of their
 values to create a new vector or array.  These are produced as output
 values which can be used as input to other output commands.
 
 The "compute property/atom"_compute_property_atom.html command takes a
 list of one or more pre-defined atom attributes (id, x, fx, etc) and
 stores the values in a per-atom vector or array.  These are produced
 as output values which can be used as input to other output commands.
 The list of atom attributes is the same as for the "dump
 custom"_dump.html command.
 
 The "compute property/local"_compute_property_local.html command takes
 a list of one or more pre-defined local attributes (bond info, angle
 info, etc) and stores the values in a local vector or array.  These
 are produced as output values which can be used as input to other
 output commands.
 
 Fixes that process output quantities :h5,link(fixprocoutput)
 
 The "fix vector"_fix_vector.html command can create global vectors as
 output from global scalars as input, accumulating them one element at
 a time.
 
 The "fix ave/atom"_fix_ave_atom.html command performs time-averaging
 of per-atom vectors.  The per-atom quantities can be atom attributes
 such as position, velocity, force.  They can also be per-atom
 quantities calculated by a "compute"_compute.html, by a
 "fix"_fix.html, or by an atom-style "variable"_variable.html.  The
 time-averaged per-atom output of this fix can be used as input to
 other output commands.
 
 The "fix store/state"_fix_store_state.html command can archive one or
 more per-atom attributes at a particular time, so that the old values
 can be used in a future calculation or output.  The list of atom
 attributes is the same as for the "dump custom"_dump.html command,
 including per-atom quantities calculated by a "compute"_compute.html,
 by a "fix"_fix.html, or by an atom-style "variable"_variable.html.
 The output of this fix can be used as input to other output commands.
 
 Computes that generate values to output :h5,link(compute)
 
 Every "compute"_compute.html in LAMMPS produces either global or
 per-atom or local values.  The values can be scalars or vectors or
 arrays of data.  These values can be output using the other commands
 described in this section.  The doc page for each compute command
 describes what it produces.  Computes that produce per-atom or local
 values have the word "atom" or "local" in their style name.  Computes
 without the word "atom" or "local" produce global values.
 
 Fixes that generate values to output :h5,link(fix)
 
 Some "fixes"_fix.html in LAMMPS produce either global or per-atom or
 local values which can be accessed by other commands.  The values can
 be scalars or vectors or arrays of data.  These values can be output
 using the other commands described in this section.  The doc page for
 each fix command tells whether it produces any output quantities and
 describes them.
 
 Variables that generate values to output :h5,link(variable)
 
 "Variables"_variable.html defined in an input script can store one or
 more strings.  But equal-style, vector-style, and atom-style or
 atomfile-style variables generate a global scalar value, global vector
 of values, or a per-atom vector, respectively, when accessed.  The
 formulas used to define these variables can contain references to the
 thermodynamic keywords and to global and per-atom data generated by
 computes, fixes, and other variables.  The values generated by
 variables can be used as input to and thus output by the other
 commands described in this section.
 
 Summary table of output options and data flow between commands :h5,link(table)
 
 This table summarizes the various commands that can be used for
 generating output from LAMMPS.  Each command produces output data of
 some kind and/or writes data to a file.  Most of the commands can take
 data from other commands as input.  Thus you can link many of these
 commands together in pipeline form, where data produced by one command
 is used as input to another command and eventually written to the
 screen or to a file.  Note that to hook two commands together the
 output and input data types must match, e.g. global/per-atom/local
 data and scalar/vector/array data.
 
 Also note that, as described above, when a command takes a scalar as
 input, that could be an element of a vector or array.  Likewise a
 vector input could be a column of an array.
 
 Command: Input: Output:
 "thermo_style custom"_thermo_style.html: global scalars: screen, log file:
 "dump custom"_dump.html: per-atom vectors: dump file:
 "dump local"_dump.html: local vectors: dump file:
 "fix print"_fix_print.html: global scalar from variable: screen, file:
 "print"_print.html: global scalar from variable: screen:
 "computes"_compute.html: N/A: global/per-atom/local scalar/vector/array:
 "fixes"_fix.html: N/A: global/per-atom/local scalar/vector/array:
 "variables"_variable.html: global scalars and vectors, per-atom vectors: global scalar and vector, per-atom vector:
 "compute reduce"_compute_reduce.html: per-atom/local vectors: global scalar/vector:
 "compute slice"_compute_slice.html: global vectors/arrays: global vector/array:
 "compute property/atom"_compute_property_atom.html: per-atom vectors: per-atom vector/array:
 "compute property/local"_compute_property_local.html: local vectors: local vector/array:
 "fix vector"_fix_vector.html: global scalars: global vector:
 "fix ave/atom"_fix_ave_atom.html: per-atom vectors: per-atom vector/array:
 "fix ave/time"_fix_ave_time.html: global scalars/vectors: global scalar/vector/array, file:
 "fix ave/chunk"_fix_ave_chunk.html: per-atom vectors: global array, file:
 "fix ave/histo"_fix_ave_histo.html: global/per-atom/local scalars and vectors: global array, file:
 "fix ave/correlate"_fix_ave_correlate.html: global scalars: global array, file:
 "fix store/state"_fix_store_state.html: per-atom vectors: per-atom vector/array :tb(c=3,s=:)
 
 :line
 
 6.16 Thermostatting, barostatting, and computing temperature :link(howto_16),h4
 
 Thermostatting means controlling the temperature of particles in an MD
 simulation.  Barostatting means controlling the pressure.  Since the
 pressure includes a kinetic component due to particle velocities, both
 these operations require calculation of the temperature.  Typically a
 target temperature (T) and/or pressure (P) is specified by the user,
 and the thermostat or barostat attempts to equilibrate the system to
 the requested T and/or P.
 
 Temperature is computed as kinetic energy divided by some number of
 degrees of freedom (and the Boltzmann constant).  Since kinetic energy
 is a function of particle velocity, there is often a need to
 distinguish between a particle's advection velocity (due to some
 aggregate motion of particles) and its thermal velocity.  The sum of
 the two is the particle's total velocity, but the latter is often what
 is wanted to compute a temperature.
 
 LAMMPS has several options for computing temperatures, any of which
 can be used in thermostatting and barostatting.  These "compute
 commands"_compute.html calculate temperature, and the "compute
 pressure"_compute_pressure.html command calculates pressure.
 
 "compute temp"_compute_temp.html
 "compute temp/sphere"_compute_temp_sphere.html
 "compute temp/asphere"_compute_temp_asphere.html
 "compute temp/com"_compute_temp_com.html
 "compute temp/deform"_compute_temp_deform.html
 "compute temp/partial"_compute_temp_partial.html
 "compute temp/profile"_compute_temp_profile.html
 "compute temp/ramp"_compute_temp_ramp.html
 "compute temp/region"_compute_temp_region.html :ul
 
 All but the first 3 calculate velocity biases directly (e.g. advection
 velocities) that are removed when computing the thermal temperature.
 "Compute temp/sphere"_compute_temp_sphere.html and "compute
 temp/asphere"_compute_temp_asphere.html compute kinetic energy for
 finite-size particles that includes rotational degrees of freedom.
 They both allow for velocity biases indirectly, via an optional extra
 argument, another temperature compute that subtracts a velocity bias.
 This allows the translational velocity of spherical or aspherical
 particles to be adjusted in prescribed ways.
 
 Thermostatting in LAMMPS is performed by "fixes"_fix.html, or in one
 case by a pair style.  Several thermostatting fixes are available:
 Nose-Hoover (nvt), Berendsen, CSVR, Langevin, and direct rescaling
 (temp/rescale).  Dissipative particle dynamics (DPD) thermostatting
 can be invoked via the {dpd/tstat} pair style:
 
 "fix nvt"_fix_nh.html
 "fix nvt/sphere"_fix_nvt_sphere.html
 "fix nvt/asphere"_fix_nvt_asphere.html
 "fix nvt/sllod"_fix_nvt_sllod.html
 "fix temp/berendsen"_fix_temp_berendsen.html
 "fix temp/csvr"_fix_temp_csvr.html
 "fix langevin"_fix_langevin.html
 "fix temp/rescale"_fix_temp_rescale.html
 "pair_style dpd/tstat"_pair_dpd.html :ul
 
 "Fix nvt"_fix_nh.html only thermostats the translational velocity of
 particles.  "Fix nvt/sllod"_fix_nvt_sllod.html also does this, except
 that it subtracts out a velocity bias due to a deforming box and
 integrates the SLLOD equations of motion.  See the "NEMD
 simulations"_#howto_13 section of this page for further details.  "Fix
 nvt/sphere"_fix_nvt_sphere.html and "fix
 nvt/asphere"_fix_nvt_asphere.html thermostat not only translation
 velocities but also rotational velocities for spherical and aspherical
 particles.
 
 DPD thermostatting alters pairwise interactions in a manner analogous
 to the per-particle thermostatting of "fix
 langevin"_fix_langevin.html.
 
 Any of the thermostatting fixes can use temperature computes that
 remove bias which has two effects.  First, the current calculated
 temperature, which is compared to the requested target temperature, is
 calculated with the velocity bias removed.  Second, the thermostat
 adjusts only the thermal temperature component of the particle's
 velocities, which are the velocities with the bias removed.  The
 removed bias is then added back to the adjusted velocities.  See the
 doc pages for the individual fixes and for the
 "fix_modify"_fix_modify.html command for instructions on how to assign
 a temperature compute to a thermostatting fix.  For example, you can
 apply a thermostat to only the x and z components of velocity by using
 it in conjunction with "compute
 temp/partial"_compute_temp_partial.html.  Or you could thermostat only
 the thermal temperature of a streaming flow of particles without
 affecting the streaming velocity, by using "compute
 temp/profile"_compute_temp_profile.html.
 
 NOTE: Only the nvt fixes perform time integration, meaning they update
 the velocities and positions of particles due to forces and velocities
 respectively.  The other thermostat fixes only adjust velocities; they
 do NOT perform time integration updates.  Thus they should be used in
 conjunction with a constant NVE integration fix such as these:
 
 "fix nve"_fix_nve.html
 "fix nve/sphere"_fix_nve_sphere.html
 "fix nve/asphere"_fix_nve_asphere.html :ul
 
 Barostatting in LAMMPS is also performed by "fixes"_fix.html.  Two
 barostatting methods are currently available: Nose-Hoover (npt and
 nph) and Berendsen:
 
 "fix npt"_fix_nh.html
 "fix npt/sphere"_fix_npt_sphere.html
 "fix npt/asphere"_fix_npt_asphere.html
 "fix nph"_fix_nh.html
 "fix press/berendsen"_fix_press_berendsen.html :ul
 
 The "fix npt"_fix_nh.html commands include a Nose-Hoover thermostat
 and barostat.  "Fix nph"_fix_nh.html is just a Nose/Hoover barostat;
 it does no thermostatting.  Both "fix nph"_fix_nh.html and "fix
 press/berendsen"_fix_press_berendsen.html can be used in conjunction
 with any of the thermostatting fixes.
 
 As with the thermostats, "fix npt"_fix_nh.html and "fix
 nph"_fix_nh.html only use translational motion of the particles in
 computing T and P and performing thermo/barostatting.  "Fix
 npt/sphere"_fix_npt_sphere.html and "fix
 npt/asphere"_fix_npt_asphere.html thermo/barostat using not only
 translation velocities but also rotational velocities for spherical
 and aspherical particles.
 
 All of the barostatting fixes use the "compute
 pressure"_compute_pressure.html compute to calculate a current
 pressure.  By default, this compute is created with a simple "compute
 temp"_compute_temp.html (see the last argument of the "compute
 pressure"_compute_pressure.html command), which is used to calculate
 the kinetic component of the pressure.  The barostatting fixes can
 also use temperature computes that remove bias for the purpose of
 computing the kinetic component which contributes to the current
 pressure.  See the doc pages for the individual fixes and for the
 "fix_modify"_fix_modify.html command for instructions on how to assign
 a temperature or pressure compute to a barostatting fix.
 
 NOTE: As with the thermostats, the Nose/Hoover methods ("fix
 npt"_fix_nh.html and "fix nph"_fix_nh.html) perform time integration.
 "Fix press/berendsen"_fix_press_berendsen.html does NOT, so it should
 be used with one of the constant NVE fixes or with one of the NVT
 fixes.
 
 Finally, thermodynamic output, which can be setup via the
 "thermo_style"_thermo_style.html command, often includes temperature
 and pressure values.  As explained on the doc page for the
 "thermo_style"_thermo_style.html command, the default T and P are
 setup by the thermo command itself.  They are NOT the ones associated
 with any thermostatting or barostatting fix you have defined or with
 any compute that calculates a temperature or pressure.  Thus if you
 want to view these values of T and P, you need to specify them
 explicitly via a "thermo_style custom"_thermo_style.html command.  Or
 you can use the "thermo_modify"_thermo_modify.html command to
 re-define what temperature or pressure compute is used for default
 thermodynamic output.
 
 :line
 
 6.17 Walls :link(howto_17),h4
 
 Walls in an MD simulation are typically used to bound particle motion,
 i.e. to serve as a boundary condition.
 
 Walls in LAMMPS can be rough (made of particles) or idealized
 surfaces.  Ideal walls can be smooth, generating forces only in the
 normal direction, or frictional, generating forces also in the
 tangential direction.
 
 Rough walls, built of particles, can be created in various ways.  The
 particles themselves can be generated like any other particle, via the
 "lattice"_lattice.html and "create_atoms"_create_atoms.html commands,
 or read in via the "read_data"_read_data.html command.
 
 Their motion can be constrained by many different commands, so that
 they do not move at all, move together as a group at constant velocity
 or in response to a net force acting on them, move in a prescribed
 fashion (e.g. rotate around a point), etc.  Note that if a time
 integration fix like "fix nve"_fix_nve.html or "fix nvt"_fix_nh.html
 is not used with the group that contains wall particles, their
 positions and velocities will not be updated.
 
 "fix aveforce"_fix_aveforce.html - set force on particles to average value, so they move together
 "fix setforce"_fix_setforce.html - set force on particles to a value, e.g. 0.0
 "fix freeze"_fix_freeze.html - freeze particles for use as granular walls
 "fix nve/noforce"_fix_nve_noforce.html - advect particles by their velocity, but without force
 "fix move"_fix_move.html - prescribe motion of particles by a linear velocity, oscillation, rotation, variable :ul
 
 The "fix move"_fix_move.html command offers the most generality, since
 the motion of individual particles can be specified with a
 "variable"_variable.html formula which depends on time and/or the
 particle position.
 
 For rough walls, it may be useful to turn off pairwise interactions
 between wall particles via the "neigh_modify
 exclude"_neigh_modify.html command.
 
 Rough walls can also be created by specifying frozen particles that do
 not move and do not interact with mobile particles, and then tethering
 other particles to the fixed particles, via a "bond"_bond_style.html.
 The bonded particles do interact with other mobile particles.
 
 Idealized walls can be specified via several fix commands.  "Fix
 wall/gran"_fix_wall_gran.html creates frictional walls for use with
 granular particles; all the other commands create smooth walls.
 
 "fix wall/reflect"_fix_wall_reflect.html - reflective flat walls
 "fix wall/lj93"_fix_wall.html - flat walls, with Lennard-Jones 9/3 potential
 "fix wall/lj126"_fix_wall.html - flat walls, with Lennard-Jones 12/6 potential
 "fix wall/colloid"_fix_wall.html - flat walls, with "pair_style colloid"_pair_colloid.html potential
 "fix wall/harmonic"_fix_wall.html - flat walls, with repulsive harmonic spring potential
 "fix wall/region"_fix_wall_region.html - use region surface as wall
 "fix wall/gran"_fix_wall_gran.html - flat or curved walls with "pair_style granular"_pair_gran.html potential :ul
 
 The {lj93}, {lj126}, {colloid}, and {harmonic} styles all allow the
 flat walls to move with a constant velocity, or oscillate in time.
 The "fix wall/region"_fix_wall_region.html command offers the most
 generality, since the region surface is treated as a wall, and the
 geometry of the region can be a simple primitive volume (e.g. a
 sphere, or cube, or plane), or a complex volume made from the union
 and intersection of primitive volumes.  "Regions"_region.html can also
 specify a volume "interior" or "exterior" to the specified primitive
 shape or {union} or {intersection}.  "Regions"_region.html can also be
 "dynamic" meaning they move with constant velocity, oscillate, or
 rotate.
 
 The only frictional idealized walls currently in LAMMPS are flat or
 curved surfaces specified by the "fix wall/gran"_fix_wall_gran.html
 command.  At some point we plan to allow region surfaces to be used as
 frictional walls, as well as triangulated surfaces.
 
 :line
 
 6.18 Elastic constants :link(howto_18),h4
 
 Elastic constants characterize the stiffness of a material. The formal
 definition is provided by the linear relation that holds between the
 stress and strain tensors in the limit of infinitesimal deformation.
 In tensor notation, this is expressed as s_ij = C_ijkl * e_kl, where
 the repeated indices imply summation. s_ij are the elements of the
 symmetric stress tensor. e_kl are the elements of the symmetric strain
 tensor. C_ijkl are the elements of the fourth rank tensor of elastic
 constants. In three dimensions, this tensor has 3^4=81 elements. Using
 Voigt notation, the tensor can be written as a 6x6 matrix, where C_ij
 is now the derivative of s_i w.r.t. e_j. Because s_i is itself a
 derivative w.r.t. e_i, it follows that C_ij is also symmetric, with at
 most 7*6/2 = 21 distinct elements.
 
 At zero temperature, it is easy to estimate these derivatives by
 deforming the simulation box in one of the six directions using the
 "change_box"_change_box.html command and measuring the change in the
 stress tensor. A general-purpose script that does this is given in the
 examples/elastic directory described in "this
 section"_Section_example.html.
 
 Calculating elastic constants at finite temperature is more
 challenging, because it is necessary to run a simulation that performs
 time averages of differential properties. One way to do this is to
 measure the change in average stress tensor in an NVT simulation when
 the cell volume undergoes a finite deformation. In order to balance
 the systematic and statistical errors in this method, the magnitude of
 the deformation must be chosen judiciously, and care must be taken to
 fully equilibrate the deformed cell before sampling the stress
 tensor. Another approach is to sample the triclinic cell fluctuations
 that occur in an NPT simulation. This method can also be slow to
 converge and requires careful post-processing "(Shinoda)"_#Shinoda.
 
 :line
 
 6.19 Library interface to LAMMPS :link(howto_19),h4
 
 As described in "Section_start 5"_Section_start.html#start_5, LAMMPS
 can be built as a library, so that it can be called by another code,
 used in a "coupled manner"_Section_howto.html#howto_10 with other
 codes, or driven through a "Python interface"_Section_python.html.
 
 All of these methodologies use a C-style interface to LAMMPS that is
 provided in the files src/library.cpp and src/library.h.  The
 functions therein have a C-style argument list, but contain C++ code
 you could write yourself in a C++ application that was invoking LAMMPS
 directly.  The C++ code in the functions illustrates how to invoke
 internal LAMMPS operations.  Note that LAMMPS classes are defined
 within a LAMMPS namespace (LAMMPS_NS) if you use them from another C++
 application.
 
 Library.cpp contains these 5 basic functions:
 
 void lammps_open(int, char **, MPI_Comm, void **)
 void lammps_close(void *)
 int lammps_version(void *)
 void lammps_file(void *, char *)
 char *lammps_command(void *, char *) :pre
 
 The lammps_open() function is used to initialize LAMMPS, passing in a
 list of strings as if they were "command-line
 arguments"_Section_start.html#start_7 when LAMMPS is run in
 stand-alone mode from the command line, and a MPI communicator for
 LAMMPS to run under.  It returns a ptr to the LAMMPS object that is
 created, and which is used in subsequent library calls.  The
 lammps_open() function can be called multiple times, to create
 multiple instances of LAMMPS.
 
 LAMMPS will run on the set of processors in the communicator.  This
 means the calling code can run LAMMPS on all or a subset of
 processors.  For example, a wrapper script might decide to alternate
 between LAMMPS and another code, allowing them both to run on all the
 processors.  Or it might allocate half the processors to LAMMPS and
 half to the other code and run both codes simultaneously before
 syncing them up periodically.  Or it might instantiate multiple
 instances of LAMMPS to perform different calculations.
 
 The lammps_close() function is used to shut down an instance of LAMMPS
 and free all its memory.
 
 The lammps_version() function can be used to determine the specific
 version of the underlying LAMMPS code. This is particularly useful
 when loading LAMMPS as a shared library via dlopen(). The code using
 the library interface can then use this information to adapt to
 changes to the LAMMPS command syntax between versions. The returned
 LAMMPS version code is an integer (e.g. 2 Sep 2015 results in
 20150902) that grows with every new LAMMPS version.
 
 The lammps_file() and lammps_command() functions are used to pass a
 file or string to LAMMPS as if it were an input script or single
 command in an input script.  Thus the calling code can read or
 generate a series of LAMMPS commands one line at a time and pass it
 thru the library interface to setup a problem and then run it,
 interleaving the lammps_command() calls with other calls to extract
 information from LAMMPS, perform its own operations, or call another
 code's library.
 
 Other useful functions are also included in library.cpp.  For example:
 
 void *lammps_extract_global(void *, char *)
 void *lammps_extract_atom(void *, char *)
 void *lammps_extract_compute(void *, char *, int, int)
 void *lammps_extract_fix(void *, char *, int, int, int, int)
 void *lammps_extract_variable(void *, char *, char *)
 int lammps_set_variable(void *, char *, char *)
 int lammps_get_natoms(void *)
 void lammps_get_coords(void *, double *)
 void lammps_put_coords(void *, double *) :pre
 
 These can extract various global or per-atom quantities from LAMMPS as
 well as values calculated by a compute, fix, or variable.  The
 "set_variable" function can set an existing string-style variable to a
 new value, so that subsequent LAMMPS commands can access the variable.
 The "get" and "put" operations can retrieve and reset atom
 coordinates.  See the library.cpp file and its associated header file
 library.h for details.
 
 The key idea of the library interface is that you can write any
 functions you wish to define how your code talks to LAMMPS and add
 them to src/library.cpp and src/library.h, as well as to the "Python
 interface"_Section_python.html.  The routines you add can access or
 change any LAMMPS data you wish.  The examples/COUPLE and python
 directories have example C++ and C and Python codes which show how a
 driver code can link to LAMMPS as a library, run LAMMPS on a subset of
 processors, grab data from LAMMPS, change it, and put it back into
 LAMMPS.
 
 :line
 
 6.20 Calculating thermal conductivity :link(howto_20),h4
 
 The thermal conductivity kappa of a material can be measured in at
 least 4 ways using various options in LAMMPS.  See the examples/KAPPA
 directory for scripts that implement the 4 methods discussed here for
 a simple Lennard-Jones fluid model.  Also, see "this
 section"_Section_howto.html#howto_21 of the manual for an analogous
 discussion for viscosity.
 
 The thermal conductivity tensor kappa is a measure of the propensity
 of a material to transmit heat energy in a diffusive manner as given
 by Fourier's law
 
 J = -kappa grad(T)
 
 where J is the heat flux in units of energy per area per time and
 grad(T) is the spatial gradient of temperature.  The thermal
 conductivity thus has units of energy per distance per time per degree
 K and is often approximated as an isotropic quantity, i.e. as a
 scalar.
 
 The first method is to setup two thermostatted regions at opposite
 ends of a simulation box, or one in the middle and one at the end of a
 periodic box.  By holding the two regions at different temperatures
 with a "thermostatting fix"_Section_howto.html#howto_16, the energy
 added to the hot region should equal the energy subtracted from the
 cold region and be proportional to the heat flux moving between the
 regions.  See the papers by "Ikeshoji and Hafskjold"_#howto-Ikeshoji
 and "Wirnsberger et al"_#howto-Wirnsberger for details of this idea.
 Note that thermostatting fixes such as "fix nvt"_fix_nh.html, "fix
 langevin"_fix_langevin.html, and "fix
 temp/rescale"_fix_temp_rescale.html store the cumulative energy they
 add/subtract.
 
 Alternatively, as a second method, the "fix heat"_fix_heat.html or
 "fix ehex"_fix_ehex.html commands can be used in place of thermostats
 on each of two regions to add/subtract specified amounts of energy to
 both regions.  In both cases, the resulting temperatures of the two
 regions can be monitored with the "compute temp/region"_compute_temp_region.html command and
 the temperature profile of the intermediate region can be monitored
 with the "fix ave/spatial"_fix_ave_spatial.html and "compute
 ke/atom"_compute_ke_atom.html commands.
 
 The third method is to perform a reverse non-equilibrium MD simulation
 using the "fix thermal/conductivity"_fix_thermal_conductivity.html
 command which implements the rNEMD algorithm of Muller-Plathe.
 Kinetic energy is swapped between atoms in two different layers of the
 simulation box.  This induces a temperature gradient between the two
 layers which can be monitored with the "fix
 ave/spatial"_fix_ave_spatial.html and "compute
 ke/atom"_compute_ke_atom.html commands.  The fix tallies the
 cumulative energy transfer that it performs.  See the "fix
 thermal/conductivity"_fix_thermal_conductivity.html command for
 details.
 
 The fourth method is based on the Green-Kubo (GK) formula which
 relates the ensemble average of the auto-correlation of the heat flux
 to kappa.  The heat flux can be calculated from the fluctuations of
 per-atom potential and kinetic energies and per-atom stress tensor in
 a steady-state equilibrated simulation.  This is in contrast to the
 two preceding non-equilibrium methods, where energy flows continuously
 between hot and cold regions of the simulation box.
 
 The "compute heat/flux"_compute_heat_flux.html command can calculate
 the needed heat flux and describes how to implement the Green-Kubo
 formalism using additional LAMMPS commands, such as the "fix
 ave/correlate"_fix_ave_correlate.html command to calculate the needed
 auto-correlation.  See the doc page for the "compute
 heat/flux"_compute_heat_flux.html command for an example input script
 that calculates the thermal conductivity of solid Ar via the GK
 formalism.
 
 :line
 
 6.21 Calculating viscosity :link(howto_21),h4
 
 The shear viscosity eta of a fluid can be measured in at least 5 ways
 using various options in LAMMPS.  See the examples/VISCOSITY directory
 for scripts that implement the 5 methods discussed here for a simple
 Lennard-Jones fluid model.  Also, see "this
 section"_Section_howto.html#howto_20 of the manual for an analogous
 discussion for thermal conductivity.
 
 Eta is a measure of the propensity of a fluid to transmit momentum in
 a direction perpendicular to the direction of velocity or momentum
 flow.  Alternatively it is the resistance the fluid has to being
 sheared.  It is given by
 
 J = -eta grad(Vstream)
 
 where J is the momentum flux in units of momentum per area per time
 and grad(Vstream) is the spatial gradient of the velocity of the fluid
 moving in another direction, normal to the area through which the
 momentum flows.  Viscosity thus has units of pressure-time.
 
 The first method is to perform a non-equilibrium MD (NEMD) simulation
 by shearing the simulation box via the "fix deform"_fix_deform.html
 command, and using the "fix nvt/sllod"_fix_nvt_sllod.html command to
 thermostat the fluid via the SLLOD equations of motion.
 Alternatively, as a second method, one or more moving walls can be
 used to shear the fluid in between them, again with some kind of
 thermostat that modifies only the thermal (non-shearing) components of
 velocity to prevent the fluid from heating up.
 
 In both cases, the velocity profile setup in the fluid by this
 procedure can be monitored by the "fix
 ave/spatial"_fix_ave_spatial.html command, which determines
 grad(Vstream) in the equation above.  E.g. the derivative in the
 y-direction of the Vx component of fluid motion or grad(Vstream) =
 dVx/dy.  The Pxy off-diagonal component of the pressure or stress
 tensor, as calculated by the "compute pressure"_compute_pressure.html
 command, can also be monitored, which is the J term in the equation
 above.  See "this section"_Section_howto.html#howto_13 of the manual
 for details on NEMD simulations.
 
 The third method is to perform a reverse non-equilibrium MD simulation
 using the "fix viscosity"_fix_viscosity.html command which implements
 the rNEMD algorithm of Muller-Plathe.  Momentum in one dimension is
 swapped between atoms in two different layers of the simulation box in
 a different dimension.  This induces a velocity gradient which can be
 monitored with the "fix ave/spatial"_fix_ave_spatial.html command.
 The fix tallies the cumulative momentum transfer that it performs.
 See the "fix viscosity"_fix_viscosity.html command for details.
 
 The fourth method is based on the Green-Kubo (GK) formula which
 relates the ensemble average of the auto-correlation of the
 stress/pressure tensor to eta.  This can be done in a fully
 equilibrated simulation which is in contrast to the two preceding
 non-equilibrium methods, where momentum flows continuously through the
 simulation box.
 
 Here is an example input script that calculates the viscosity of
 liquid Ar via the GK formalism:
 
 # Sample LAMMPS input script for viscosity of liquid Ar :pre
 
 units       real
 variable    T equal 86.4956
 variable    V equal vol
 variable    dt equal 4.0
 variable    p equal 400     # correlation length
 variable    s equal 5       # sample interval
 variable    d equal $p*$s   # dump interval :pre
 
 # convert from LAMMPS real units to SI :pre
 
 variable    kB equal 1.3806504e-23    # \[J/K/] Boltzmann
 variable    atm2Pa equal 101325.0
 variable    A2m equal 1.0e-10
 variable    fs2s equal 1.0e-15
 variable    convert equal $\{atm2Pa\}*$\{atm2Pa\}*$\{fs2s\}*$\{A2m\}*$\{A2m\}*$\{A2m\} :pre
 
 # setup problem :pre
 
 dimension    3
 boundary     p p p
 lattice      fcc 5.376 orient x 1 0 0 orient y 0 1 0 orient z 0 0 1
 region       box block 0 4 0 4 0 4
 create_box   1 box
 create_atoms 1 box
 mass	     1 39.948
 pair_style   lj/cut 13.0
 pair_coeff   * * 0.2381 3.405
 timestep     $\{dt\}
 thermo	     $d :pre
 
 # equilibration and thermalization :pre
 
 velocity     all create $T 102486 mom yes rot yes dist gaussian
 fix          NVT all nvt temp $T $T 10 drag 0.2
 run          8000 :pre
 
 # viscosity calculation, switch to NVE if desired :pre
 
 #unfix       NVT
 #fix         NVE all nve :pre
 
 reset_timestep 0
 variable     pxy equal pxy
 variable     pxz equal pxz
 variable     pyz equal pyz
 fix          SS all ave/correlate $s $p $d &
              v_pxy v_pxz v_pyz type auto file S0St.dat ave running
 variable     scale equal $\{convert\}/($\{kB\}*$T)*$V*$s*$\{dt\}
 variable     v11 equal trap(f_SS\[3\])*$\{scale\}
 variable     v22 equal trap(f_SS\[4\])*$\{scale\}
 variable     v33 equal trap(f_SS\[5\])*$\{scale\}
 thermo_style custom step temp press v_pxy v_pxz v_pyz v_v11 v_v22 v_v33
 run          100000
 variable     v equal (v_v11+v_v22+v_v33)/3.0
 variable     ndens equal count(all)/vol
 print        "average viscosity: $v \[Pa.s/] @ $T K, $\{ndens\} /A^3" :pre
 
 The fifth method is related to the above Green-Kubo method,
 but uses the Einstein formulation, analogous to the Einstein
 mean-square-displacement formulation for self-diffusivity. The
 time-integrated momentum fluxes play the role of Cartesian
 coordinates, whose mean-square displacement increases linearly
 with time at sufficiently long times. 
 
 :line
 
 6.22 Calculating a diffusion coefficient :link(howto_22),h4
 
 The diffusion coefficient D of a material can be measured in at least
 2 ways using various options in LAMMPS.  See the examples/DIFFUSE
 directory for scripts that implement the 2 methods discussed here for
 a simple Lennard-Jones fluid model.
 
 The first method is to measure the mean-squared displacement (MSD) of
 the system, via the "compute msd"_compute_msd.html command.  The slope
 of the MSD versus time is proportional to the diffusion coefficient.
 The instantaneous MSD values can be accumulated in a vector via the
 "fix vector"_fix_vector.html command, and a line fit to the vector to
 compute its slope via the "variable slope"_variable.html function, and
 thus extract D.
 
 The second method is to measure the velocity auto-correlation function
 (VACF) of the system, via the "compute vacf"_compute_vacf.html
 command.  The time-integral of the VACF is proportional to the
 diffusion coefficient.  The instantaneous VACF values can be
 accumulated in a vector via the "fix vector"_fix_vector.html command,
 and time integrated via the "variable trap"_variable.html function,
 and thus extract D.
 
 :line
 
 6.23 Using chunks to calculate system properties :link(howto_23),h4
 
 In LAMMPS, "chunks" are collections of atoms, as defined by the
 "compute chunk/atom"_compute_chunk_atom.html command, which assigns
 each atom to a chunk ID (or to no chunk at all).  The number of chunks
 and the assignment of chunk IDs to atoms can be static or change over
 time.  Examples of "chunks" are molecules or spatial bins or atoms
 with similar values (e.g. coordination number or potential energy).
 
 The per-atom chunk IDs can be used as input to two other kinds of
 commands, to calculate various properties of a system:
 
 "fix ave/chunk"_fix_ave_chunk.html
 any of the "compute */chunk"_compute.html commands :ul
 
 Here, each of the 3 kinds of chunk-related commands is briefly
 overviewed.  Then some examples are given of how to compute different
 properties with chunk commands.
 
 Compute chunk/atom command: :h5
 
 This compute can assign atoms to chunks of various styles.  Only atoms
 in the specified group and optional specified region are assigned to a
 chunk.  Here are some possible chunk definitions:
 
 atoms in same molecule | chunk ID = molecule ID |
 atoms of same atom type | chunk ID = atom type |
 all atoms with same atom property (charge, radius, etc) | chunk ID = output of compute property/atom |
 atoms in same cluster | chunk ID = output of "compute cluster/atom"_compute_cluster_atom.html command |
 atoms in same spatial bin | chunk ID = bin ID |
 atoms in same rigid body | chunk ID = molecule ID used to define rigid bodies |
 atoms with similar potential energy | chunk ID = output of "compute pe/atom"_compute_pe_atom.html |
 atoms with same local defect structure | chunk ID = output of "compute centro/atom"_compute_centro_atom.html or "compute coord/atom"_compute_coord_atom.html command :tb(s=|,c=2)
 
 Note that chunk IDs are integer values, so for atom properties or
 computes that produce a floating point value, they will be truncated
 to an integer.  You could also use the compute in a variable that
 scales the floating point value to spread it across multiple integers.
 
 Spatial bins can be of various kinds, e.g. 1d bins = slabs, 2d bins =
 pencils, 3d bins = boxes, spherical bins, cylindrical bins.
 
 This compute also calculates the number of chunks {Nchunk}, which is
 used by other commands to tally per-chunk data.  {Nchunk} can be a
 static value or change over time (e.g. the number of clusters).  The
 chunk ID for an individual atom can also be static (e.g. a molecule
 ID), or dynamic (e.g. what spatial bin an atom is in as it moves).
 
 Note that this compute allows the per-atom output of other
 "computes"_compute.html, "fixes"_fix.html, and
 "variables"_variable.html to be used to define chunk IDs for each
 atom.  This means you can write your own compute or fix to output a
 per-atom quantity to use as chunk ID.  See
 "Section_modify"_Section_modify.html of the documentation for how to
 do this.  You can also define a "per-atom variable"_variable.html in
 the input script that uses a formula to generate a chunk ID for each
 atom.
 
 Fix ave/chunk command: :h5
 
 This fix takes the ID of a "compute
 chunk/atom"_compute_chunk_atom.html command as input.  For each chunk,
 it then sums one or more specified per-atom values over the atoms in
 each chunk.  The per-atom values can be any atom property, such as
 velocity, force, charge, potential energy, kinetic energy, stress,
 etc.  Additional keywords are defined for per-chunk properties like
 density and temperature.  More generally any per-atom value generated
 by other "computes"_compute.html, "fixes"_fix.html, and "per-atom
 variables"_variable.html, can be summed over atoms in each chunk.
 
 Similar to other averaging fixes, this fix allows the summed per-chunk
 values to be time-averaged in various ways, and output to a file.  The
 fix produces a global array as output with one row of values per
 chunk.
 
 Compute */chunk commands: :h5
 
 Currently the following computes operate on chunks of atoms to produce
 per-chunk values.
 
 "compute com/chunk"_compute_com_chunk.html
 "compute gyration/chunk"_compute_gyration_chunk.html
 "compute inertia/chunk"_compute_inertia_chunk.html
 "compute msd/chunk"_compute_msd_chunk.html
 "compute property/chunk"_compute_property_chunk.html
 "compute temp/chunk"_compute_temp_chunk.html
 "compute torque/chunk"_compute_torque_chunk.html
 "compute vcm/chunk"_compute_vcm_chunk.html :ul
 
 They each take the ID of a "compute
 chunk/atom"_compute_chunk_atom.html command as input.  As their names
 indicate, they calculate the center-of-mass, radius of gyration,
 moments of inertia, mean-squared displacement, temperature, torque,
 and velocity of center-of-mass for each chunk of atoms.  The "compute
 property/chunk"_compute_property_chunk.html command can tally the
 count of atoms in each chunk and extract other per-chunk properties.
 
 The reason these various calculations are not part of the "fix
 ave/chunk command"_fix_ave_chunk.html, is that each requires a more
 complicated operation than simply summing and averaging over per-atom
 values in each chunk.  For example, many of them require calculation
 of a center of mass, which requires summing mass*position over the
 atoms and then dividing by summed mass.
 
 All of these computes produce a global vector or global array as
 output, with one or more values per chunk.  They can be used
 in various ways:
 
 As input to the "fix ave/time"_fix_ave_time.html command, which can
 write the values to a file and optionally time average them. :ulb,l
 
 As input to the "fix ave/histo"_fix_ave_histo.html command to
 histogram values across chunks.  E.g. a histogram of cluster sizes or
 molecule diffusion rates. :l
 
 As input to special functions of "equal-style
 variables"_variable.html, like sum() and max().  E.g. to find the
 largest cluster or fastest diffusing molecule. :l,ule
 
 Example calculations with chunks :h5
 
 Here are examples using chunk commands to calculate various
 properties:
 
 (1) Average velocity in each of 1000 2d spatial bins:
 
 compute cc1 all chunk/atom bin/2d x 0.0 0.1 y lower 0.01 units reduced
 fix 1 all ave/chunk 100 10 1000 cc1 vx vy file tmp.out :pre
 
 (2) Temperature in each spatial bin, after subtracting a flow
 velocity:
 
 compute cc1 all chunk/atom bin/2d x 0.0 0.1 y lower 0.1 units reduced
 compute vbias all temp/profile 1 0 0 y 10
 fix 1 all ave/chunk 100 10 1000 cc1 temp bias vbias file tmp.out :pre
 
 (3) Center of mass of each molecule:
 
 compute cc1 all chunk/atom molecule
 compute myChunk all com/chunk cc1
 fix 1 all ave/time 100 1 100 c_myChunk file tmp.out mode vector :pre
 
 (4) Total force on each molecule and ave/max across all molecules:
 
 compute cc1 all chunk/atom molecule
 fix 1 all ave/chunk 1000 1 1000 cc1 fx fy fz file tmp.out
-variable xave equal ave(f_1[2])
-variable xmax equal max(f_1[2])
+variable xave equal ave(f_1\[2\])
+variable xmax equal max(f_1\[2\])
 thermo 1000
 thermo_style custom step temp v_xave v_xmax :pre
 
 (5) Histogram of cluster sizes:
 
 compute cluster all cluster/atom 1.0
 compute cc1 all chunk/atom c_cluster compress yes
 compute size all property/chunk cc1 count
 fix 1 all ave/histo 100 1 100 0 20 20 c_size mode vector ave running beyond ignore file tmp.histo :pre
 
 :line
 
 6.24 Setting parameters for the "kspace_style pppm/disp"_kspace_style.html command :link(howto_24),h4
 
 The PPPM method computes interactions by splitting the pair potential
 into two parts, one of which is computed in a normal pairwise fashion,
 the so-called real-space part, and one of which is computed using the
 Fourier transform, the so called reciprocal-space or kspace part.  For
 both parts, the potential is not computed exactly but is approximated.
 Thus, there is an error in both parts of the computation, the
 real-space and the kspace error. The just mentioned facts are true
 both for the PPPM for Coulomb as well as dispersion interactions. The
 deciding difference - and also the reason why the parameters for
 pppm/disp have to be selected with more care - is the impact of the
 errors on the results: The kspace error of the PPPM for Coulomb and
 dispersion interaction and the real-space error of the PPPM for
 Coulomb interaction have the character of noise. In contrast, the
 real-space error of the PPPM for dispersion has a clear physical
 interpretation: the underprediction of cohesion. As a consequence, the
 real-space error has a much stronger effect than the kspace error on
 simulation results for pppm/disp.  Parameters must thus be chosen in a
 way that this error is much smaller than the kspace error.
 
 When using pppm/disp and not making any specifications on the PPPM
 parameters via the kspace modify command, parameters will be tuned
 such that the real-space error and the kspace error are equal.  This
 will result in simulations that are either inaccurate or slow, neither of
 which is desirable. For selecting parameters for the pppm/disp
 that provide fast and accurate simulations, there are two approaches,
 which both have their up- and downsides.
 
 The first approach is to set desired real-space and kspace accuracies
 via the {kspace_modify force/disp/real} and {kspace_modify
 force/disp/kspace} commands. Note that the accuracies have to be
 specified in force units and are thus dependent on the chosen unit
 settings. For real units, 0.0001 and 0.002 seem to provide reasonably
 accurate and efficient computations for the real-space and kspace
 accuracies.  0.002 and 0.05 work well for most systems using lj
 units. PPPM parameters will be generated based on the desired
 accuracies. The upside of this approach is that it usually provides a
 good set of parameters and will work for both the {kspace_modify diff
 ad} and {kspace_modify diff ik} options.  The downside of the method
 is that setting the PPPM parameters will take some time during the
 initialization of the simulation.
 
 The second approach is to set the parameters for the pppm/disp
 explicitly using the {kspace_modify mesh/disp}, {kspace_modify
 order/disp}, and {kspace_modify gewald/disp} commands. This approach
 requires a more experienced user who understands well the impact of
 the choice of parameters on the simulation accuracy and
 performance. This approach provides a fast initialization of the
 simulation. However, it is sensitive to errors: A combination of
 parameters that will perform well for one system might result in
 far-from-optimal conditions for other simulations. For example,
 parameters that provide accurate and fast computations for
 all-atomistic force fields can provide insufficient accuracy for
 united-atomistic force fields (which is related to the fact that the
 latter typically have larger dispersion coefficients).
 
 To avoid inaccurate or inefficient simulations, the pppm/disp stops
 simulations with an error message if no action is taken to control the
 PPPM parameters. If the automatic parameter generation is desired and
 real-space and kspace accuracies are desired to be equal, this error
 message can be suppressed using the {kspace_modify disp/auto yes}
 command.
 
 A reasonable approach that combines the upsides of both methods is to
 make the first run using the {kspace_modify force/disp/real} and
 {kspace_modify force/disp/kspace} commands, write down the PPPM
 parameters from the output, and specify these parameters using the
 second approach in subsequent runs (which have the same composition,
 force field, and approximately the same volume).
 
 Concerning the performance of the pppm/disp there are two more things
 to consider. The first is that when using the pppm/disp, the cutoff
 parameter no longer affects the accuracy of the simulation
 (provided that gewald/disp is adjusted when changing the cutoff).
 The performance can thus be increased by examining different values
 for the cutoff parameter. A lower bound for the cutoff is only set by
 the truncation error of the repulsive term of pair potentials.
 
 The second is that the mixing rule of the pair style has an impact on
 the computation time when using the pppm/disp. Fastest computations
 are achieved when using the geometric mixing rule. Using the
 arithmetic mixing rule substantially increases the computational cost.
 The computational overhead can be reduced using the {kspace_modify
 mix/disp geom} and {kspace_modify splittol} commands. The first
 command simply enforces geometric mixing of the dispersion
 coefficients in kspace computations.  This introduces some error in
 the computations but will also significantly speed-up the
 simulations. The second keyword sets the accuracy with which the
 dispersion coefficients are approximated using a matrix factorization
 approach.  This may result in better accuracy than using the first
 command, but will usually also not provide an equally good increase of
 efficiency.
 
 Finally, pppm/disp can also be used when no mixing rules apply.
 This can be achieved using the {kspace_modify mix/disp none} command.
 Note that the code does not check automatically whether any mixing
 rule is fulfilled. If mixing rules do not apply, the user will have
 to specify this command explicitly.
 
 :line
 
 6.25 Polarizable models :link(howto_25),h4
 
 In polarizable force fields the charge distributions in molecules and
 materials respond to their electrostatic environments. Polarizable
 systems can be simulated in LAMMPS using three methods:
 
 the fluctuating charge method, implemented in the "QEQ"_fix_qeq.html
 package, :ulb,l
 the adiabatic core-shell method, implemented in the
 "CORESHELL"_#howto_26 package, :l
 the thermalized Drude dipole method, implemented in the
 "USER-DRUDE"_#howto_27 package. :l,ule
 
 The fluctuating charge method calculates instantaneous charges on
 interacting atoms based on the electronegativity equalization
 principle. It is implemented in the "fix qeq"_fix_qeq.html which is
 available in several variants. It is a relatively efficient technique
 since no additional particles are introduced. This method allows for
 charge transfer between molecules or atom groups. However, because the
 charges are located at the interaction sites, off-plane components of
 polarization cannot be represented in planar molecules or atom groups.
 
 The two other methods share the same basic idea: polarizable atoms are
 split into one core atom and one satellite particle (called shell or
 Drude particle) attached to it by a harmonic spring.  Both atoms bear
 a charge and they represent collectively an induced electric dipole.
 These techniques are computationally more expensive than the QEq
 method because of additional particles and bonds. These two
 charge-on-spring methods differ in certain features, with the
 core-shell model being normally used for ionic/crystalline materials,
 whereas the so-called Drude model is normally used for molecular
 systems and fluid states.
 
 The core-shell model is applicable to crystalline materials where the
 high symmetry around each site leads to stable trajectories of the
 core-shell pairs. However, bonded atoms in molecules can be so close
 that a core would interact too strongly or even capture the Drude
 particle of a neighbor. The Drude dipole model is relatively more
 complex in order to remediate this and other issues. Specifically, the
 Drude model includes specific thermostating of the core-Drude pairs
 and short-range damping of the induced dipoles.
 
 The three polarization methods can be implemented through a
 self-consistent calculation of charges or induced dipoles at each
 timestep. In the fluctuating charge scheme this is done by the matrix
 inversion method in "fix qeq/point"_fix_qeq.html, but for core-shell
 or Drude-dipoles the relaxed-dipoles technique would require a slow
 iterative procedure. These self-consistent solutions yield accurate
 trajectories since the additional degrees of freedom representing
 polarization are massless.  An alternative is to attribute a mass to
 the additional degrees of freedom and perform time integration using
 an extended Lagrangian technique. For the fluctuating charge scheme
 this is done by "fix qeq/dynamic"_fix_qeq.html, and for the
 charge-on-spring models by the methods outlined in the next two
 sections. The assignment of masses to the additional degrees of
 freedom can lead to unphysical trajectories if care is not exerted in
 choosing the parameters of the polarizable models and the simulation
 conditions.
 
 In the core-shell model the vibration of the shells is kept faster
 than the ionic vibrations to mimic the fast response of the
 polarizable electrons.  But in molecular systems thermalizing the
 core-Drude pairs at temperatures comparable to the rest of the
 simulation leads to several problems (kinetic energy transfer, too
 short a timestep, etc.).  In order to avoid these problems the relative
 motion of the Drude particles with respect to their cores is kept
 "cold" so the vibration of the core-Drude pairs is very slow,
 approaching the self-consistent regime.  In both models the
 temperature is regulated using the velocities of the center of mass of
 core+shell (or Drude) pairs, but in the Drude model the actual
 relative core-Drude particle motion is thermostated separately as
 well.
 
 :line
 
 6.26 Adiabatic core/shell model :link(howto_26),h4
 
 The adiabatic core-shell model by "Mitchell and
 Finchham"_#MitchellFinchham is a simple method for adding
 polarizability to a system.  In order to mimic the electron shell of
 an ion, a satellite particle is attached to it. This way the ions are
 split into a core and a shell where the latter is meant to react to
 the electrostatic environment inducing polarizability.
 
 Technically, shells are attached to the cores by a spring force f =
 k*r where k is a parametrized spring constant and r is the distance
 between the core and the shell. The charges of the core and the shell
 add up to the ion charge, thus q(ion) = q(core) + q(shell). This 
 setup introduces the ion polarizability (alpha) given by 
 alpha = q(shell)^2 / k. In a
 similar fashion the mass of the ion is distributed on the core and the
 shell with the core having the larger mass.
 
 To run this model in LAMMPS, "atom_style"_atom_style.html {full} can
 be used since atom charge and bonds are needed.  Each kind of
 core/shell pair requires two atom types and a bond type.  The core and
 shell of a core/shell pair should be bonded to each other with a
 harmonic bond that provides the spring force. For example, a data file
 for NaCl, as found in examples/coreshell, has this format:
 
 432   atoms  # core and shell atoms
 216   bonds  # number of core/shell springs :pre
 
 4     atom types  # 2 cores and 2 shells for Na and Cl 
 2     bond types :pre
 
 0.0 24.09597 xlo xhi
 0.0 24.09597 ylo yhi
 0.0 24.09597 zlo zhi :pre
 
 Masses       # core/shell mass ratio = 0.1 :pre
 
 1 20.690784  # Na core
 2 31.90500   # Cl core
 3 2.298976   # Na shell
 4 3.54500    # Cl shell :pre
 
 Atoms :pre
 
 1    1    2   1.5005    0.00000000   0.00000000   0.00000000 # core of core/shell pair 1
 2    1    4  -2.5005    0.00000000   0.00000000   0.00000000 # shell of core/shell pair 1
 3    2    1   1.5056    4.01599500   4.01599500   4.01599500 # core of core/shell pair 2
 4    2    3  -0.5056    4.01599500   4.01599500   4.01599500 # shell of core/shell pair 2 
 (...) :pre
 
 Bonds   # Bond topology for spring forces :pre
 
 1     2     1     2   # spring for core/shell pair 1
 2     2     3     4   # spring for core/shell pair 2 
 (...) :pre
 
 Non-Coulombic (e.g. Lennard-Jones) pairwise interactions are only
 defined between the shells.  Coulombic interactions are defined
 between all cores and shells.  If desired, additional bonds can be
 specified between cores.  
 
 The "special_bonds"_special_bonds.html command should be used to
 turn-off the Coulombic interaction within core/shell pairs, since that
 interaction is set by the bond spring.  This is done using the
 "special_bonds"_special_bonds.html command with a 1-2 weight = 0.0,
 which is the default value.  It needs to be considered whether one has
 to adjust the "special_bonds"_special_bonds.html weighting according
 to the molecular topology since the interactions of the shells are
 bypassed over an extra bond.
 
 Note that this core/shell implementation does not require all ions to
 be polarized.  One can mix core/shell pairs and ions without a
 satellite particle if desired.
 
 Since the core/shell model permits distances of r = 0.0 between the
 core and shell, a pair style with a "cs" suffix needs to be used to
 implement a valid long-range Coulombic correction.  Several such pair
 styles are provided in the CORESHELL package.  See "this doc
 page"_pair_cs.html for details.  All of the core/shell enabled pair
 styles require the use of a long-range Coulombic solver, as specified
 by the "kspace_style"_kspace_style.html command.  Either the PPPM or
 Ewald solvers can be used.
 
 For the NaCl example problem, these pair style and bond style settings
 are used:
 
 pair_style      born/coul/long/cs 20.0 20.0
 pair_coeff      * *      0.0 1.000   0.00  0.00   0.00
 pair_coeff      3 3    487.0 0.23768 0.00  1.05   0.50 #Na-Na
 pair_coeff      3 4 145134.0 0.23768 0.00  6.99   8.70 #Na-Cl
 pair_coeff      4 4 405774.0 0.23768 0.00 72.40 145.40 #Cl-Cl :pre
 
 bond_style      harmonic
 bond_coeff      1 63.014 0.0
 bond_coeff      2 25.724 0.0 :pre
 
 When running dynamics with the adiabatic core/shell model, the
 following issues should be considered.  Since the relative motion of
 the core and shell particles corresponds to the polarization, typical
 thermostats can alter the polarization behaviour, meaning the shell
 will not react freely to its electrostatic environment.  This is
 critical during the equilibration of the system. Therefore
 it's typically desirable to decouple the relative motion of the
 core/shell pair, which is an imaginary degree of freedom, from the
 real physical system.  To do that, the "compute
 temp/cs"_compute_temp_cs.html command can be used, in conjunction with
 any of the thermostat fixes, such as "fix nvt"_fix_nh.html or "fix
 langevin"_fix_langevin.html.  This compute uses the center-of-mass velocity
 of the core/shell pairs to calculate a temperature, and ensures that
 velocity is what is rescaled for thermostatting purposes.  This
 compute also works for a system with both core/shell pairs and
 non-polarized ions (ions without an attached satellite particle).  The
 "compute temp/cs"_compute_temp_cs.html command requires input of two
 groups, one for the core atoms, another for the shell atoms.
 Non-polarized ions which might also be included in the treated system
 should not be included into either of these groups, they are taken
 into account by the {group-ID} (2nd argument) of the compute.  The
 groups can be defined using the "group {type}"_group.html command.
 Note that to perform thermostatting using this definition of
 temperature, the "fix modify temp"_fix_modify.html command should be
 used to assign the compute to the thermostat fix.  Likewise the
 "thermo_modify temp"_thermo_modify.html command can be used to make
 this temperature be output for the overall system. 
 
 For the NaCl example, this can be done as follows:
 
 group cores type 1 2
 group shells type 3 4
 compute CSequ all temp/cs cores shells
 fix thermoberendsen all temp/berendsen 1427 1427 0.4    # thermostat for the true physical system
 fix thermostatequ all nve                               # integrator as needed for the berendsen thermostat
 fix_modify thermoberendsen temp CSequ
 thermo_modify temp CSequ                                # output of center-of-mass derived temperature :pre
 
 If "compute temp/cs"_compute_temp_cs.html is used, the decoupled 
 relative motion of the core and the shell should in theory be 
 stable.  However numerical fluctuation can introduce a small
 momentum to the system, which is noticeable over long trajectories.
 Therefore it is recommended to use the "fix 
 momentum"_fix_momentum.html command in combination with "compute 
 temp/cs"_compute_temp_cs.html when equilibrating the system to 
 prevent any drift.
 
 When initializing the velocities of a system with core/shell pairs, it
 is also desirable to not introduce energy into the relative motion of
 the core/shell particles, but only assign a center-of-mass velocity to
 the pairs.  This can be done by using the {bias} keyword of the
 "velocity create"_velocity.html command and assigning the "compute
 temp/cs"_compute_temp_cs.html command to the {temp} keyword of the
 "velocity"_velocity.html command, e.g.
 
 velocity all create 1427 134 bias yes temp CSequ
 velocity all scale 1427 temp CSequ :pre
 
 It is important to note that the polarizability of the core/shell
 pairs is based on their relative motion. Therefore the choice of
 spring force and mass ratio need to ensure much faster relative motion
 of the 2 atoms within the core/shell pair than their center-of-mass
 velocity. This allows the shells to effectively react instantaneously
 to the electrostatic environment.  This fast movement also limits the
 timestep size that can be used.
 
 The primary literature of the adiabatic core/shell model suggests that
 the fast relative motion of the core/shell pairs only allows negligible 
 energy transfer to the environment. Therefore it is not intended to
 decouple the core/shell degree of freedom from the physical system
 during production runs. In other words, the "compute
 temp/cs"_compute_temp_cs.html command should not be used during
 production runs and is only required during equilibration. This way one 
 is consistent with literature (based on the code packages DL_POLY or 
 GULP for instance).
 
 The mentioned energy transfer will typically lead to a small drift 
 in total energy over time.  This internal energy can be monitored 
 using the "compute chunk/atom"_compute_chunk_atom.html and "compute
 temp/chunk"_compute_temp_chunk.html commands.  The internal kinetic
 energies of each core/shell pair can then be summed using the sum()
 special function of the "variable"_variable.html command.  Or they can
 be time-averaged and output using the "fix ave/time"_fix_ave_time.html
 command.  To use these commands, each core/shell pair must be defined
 as a "chunk".  If each core/shell pair is defined as its own molecule,
 the molecule ID can be used to define the chunks.  If cores are bonded
 to each other to form larger molecules, the chunks can be identified
 by the "fix property/atom"_fix_property_atom.html via assigning a
 core/shell ID to each atom using a special field in the data file read
 by the "read_data"_read_data.html command.  This field can then be
 accessed by the "compute property/atom"_compute_property_atom.html
 command, to use as input to the "compute
 chunk/atom"_compute_chunk_atom.html command to define the core/shell
 pairs as chunks.
 
 For example,
 
 fix csinfo all property/atom i_CSID                       # property/atom command
 read_data NaCl_CS_x0.1_prop.data fix csinfo NULL CS-Info  # atom property added in the data-file
 compute prop all property/atom i_CSID
 compute cs_chunk all chunk/atom c_prop
 compute cstherm all temp/chunk cs_chunk temp internal com yes cdof 3.0     # note the chosen degrees of freedom for the core/shell pairs
 fix ave_chunk all ave/time 10 1 10 c_cstherm file chunk.dump mode vector :pre
 
 The additional section in the data file would be formatted like this:
 
 CS-Info         # header of additional section :pre
 
 1   1           # column 1 = atom ID, column 2 = core/shell ID 
 2   1   
 3   2   
 4   2   
 5   3   
 6   3   
 7   4   
 8   4   
 (...) :pre
 
 :line
 
 6.27 Drude induced dipoles :link(howto_27),h4
 
 The thermalized Drude model, similarly to the "core-shell"_#howto_26
 model, represents induced dipoles by a pair of charges (the core atom
 and the Drude particle) connected by a harmonic spring. The Drude
 model has a number of features aimed at its use in molecular systems
 ("Lamoureux and Roux"_#howto-Lamoureux):
 
 Thermostating of the additional degrees of freedom associated with the
 induced dipoles at very low temperature, in terms of the reduced
 coordinates of the Drude particles with respect to their cores. This
 makes the trajectory close to that of relaxed induced dipoles. :ulb,l
 
 Consistent definition of 1-2 to 1-4 neighbors. A core-Drude particle
 pair represents a single (polarizable) atom, so the special screening
 factors in a covalent structure should be the same for the core and
 the Drude particle.  Drude particles have to inherit the 1-2, 1-3, 1-4
 special neighbor relations from their respective cores. :l
 
 Stabilization of the interactions between induced dipoles. Drude
 dipoles on covalently bonded atoms interact too strongly due to the
 short distances, so an atom may capture the Drude particle of a
 neighbor, or the induced dipoles within the same molecule may align
 too much. To avoid this, damping at short range can be done by Thole
 functions (for which there are physical grounds). This Thole damping
 is applied to the point charges composing the induced dipole (the
 charge of the Drude particle and the opposite charge on the core, not
 to the total charge of the core atom). :l,ule
 
 A detailed tutorial covering the usage of Drude induced dipoles in
 LAMMPS is "available here"_tutorial_drude.html.
 
 As with the core-shell model, the cores and Drude particles should
 appear in the data file as standard atoms. The same holds for the
 springs between them, which are described by standard harmonic bonds.
 The nature of the atoms (core, Drude particle or non-polarizable) is
 specified via the "fix drude"_fix_drude.html command.  The special
 list of neighbors is automatically refactored to account for the
 equivalence of core and Drude particles as regards special 1-2 to 1-4
 screening. It may be necessary to use the {extra} keyword of the
 "special_bonds"_special_bonds.html command. If using "fix
 shake"_fix_shake.html, make sure no Drude particle is in this fix
 group.
 
 There are two ways to thermostat the Drude particles at a low
 temperature: use either "fix langevin/drude"_fix_langevin_drude.html
 for a Langevin thermostat, or "fix
 drude/transform/*"_fix_drude_transform.html for a Nose-Hoover
 thermostat. The former requires use of the command "comm_modify vel
 yes"_comm_modify.html. The latter requires two separate integration
 fixes like {nvt} or {npt}. The correct temperatures of the reduced
 degrees of freedom can be calculated using the "compute
 temp/drude"_compute_temp_drude.html. This requires also to use the
 command {comm_modify vel yes}.
 
 Short-range damping of the induced dipole interactions can be achieved
 using Thole functions through the "pair style
 thole"_pair_thole.html in "pair_style hybrid/overlay"_pair_hybrid.html
 with a Coulomb pair style. It may be useful to use {coul/long/cs} or
 similar from the CORESHELL package if the core and Drude particle come
 too close, which can cause numerical issues.
 
 :line
 :line
 
 :link(howto-Berendsen)
 [(Berendsen)] Berendsen, Grigera, Straatsma, J Phys Chem, 91,
 6269-6271 (1987).
 
 :link(howto-Cornell)
 [(Cornell)] Cornell, Cieplak, Bayly, Gould, Merz, Ferguson,
 Spellmeyer, Fox, Caldwell, Kollman, JACS 117, 5179-5197 (1995).
 
 :link(Horn)
 [(Horn)] Horn, Swope, Pitera, Madura, Dick, Hura, and Head-Gordon,
 J Chem Phys, 120, 9665 (2004).
 
 :link(howto-Ikeshoji)
 [(Ikeshoji)] Ikeshoji and Hafskjold, Molecular Physics, 81, 251-261
 (1994).
 
 :link(howto-Wirnsberger)
 [(Wirnsberger)] Wirnsberger, Frenkel, and Dellago, J Chem Phys, 143, 124104
 (2015).
 
 :link(howto-MacKerell)
 [(MacKerell)] MacKerell, Bashford, Bellott, Dunbrack, Evanseck, Field,
 Fischer, Gao, Guo, Ha, et al, J Phys Chem, 102, 3586 (1998).
 
 :link(howto-Mayo)
 [(Mayo)] Mayo, Olafson, Goddard III, J Phys Chem, 94, 8897-8909
 (1990).
 
 :link(Jorgensen)
 [(Jorgensen)] Jorgensen, Chandrasekhar, Madura, Impey, Klein, J Chem
 Phys, 79, 926 (1983).
 
 :link(Price)
 [(Price)] Price and Brooks, J Chem Phys, 121, 10096 (2004).
 
 :link(Shinoda)
 [(Shinoda)] Shinoda, Shiga, and Mikami, Phys Rev B, 69, 134103 (2004).
 
 :link(MitchellFinchham)
 [(Mitchell and Finchham)] Mitchell, Finchham, J Phys Condensed Matter,
 5, 1031-1038 (1993).
 
 :link(howto-Lamoureux)
 [(Lamoureux and Roux)] G. Lamoureux, B. Roux, J. Chem. Phys 119, 3025 (2003)
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index af2a61775..c01ceaf64 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -1,471 +1,471 @@
 # Default settings common options
 
 #LAMMPS specific settings:
 KOKKOS_PATH=../../lib/kokkos
 CXXFLAGS=$(CCFLAGS)
 
 #Options: OpenMP,Serial,Pthreads,Cuda
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthreads"
 #Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
 KOKKOS_ARCH ?= ""
 #Options: yes,no
 KOKKOS_DEBUG ?= "no"
 #Options: hwloc,librt,experimental_memkind
 KOKKOS_USE_TPLS ?= ""
 #Options: c++11
 KOKKOS_CXX_STANDARD ?= "c++11"
 #Options: aggressive_vectorization,disable_profiling
-KOKKOS_OPTIONS ?= "aggressive_vectorization"
+KOKKOS_OPTIONS ?= ""
 
 #Default settings specific options
 #Options: force_uvm,use_ldg,rdc,enable_lambda
 KOKKOS_CUDA_OPTIONS ?= ""
 
 # Check for general settings
 
 KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
 KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
 
 # Check for external libraries
 KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
 KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
 KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
 
 # Check for advanced settings
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
 KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
 
 # Check for Kokkos Host Execution Spaces one of which must be on
 
 KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
 KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
 KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
 KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
 
 # Neither OpenMP nor Pthreads was requested in KOKKOS_DEVICES: switch the
 # Serial backend on so that one of these host execution spaces is enabled.
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
 	KOKKOS_INTERNAL_USE_SERIAL := 1
 endif
 endif
 
 KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version        2>&1 | grep "Intel Corporation" | wc -l)
 KOKKOS_INTERNAL_COMPILER_PGI   := $(shell $(CXX) --version        2>&1 | grep PGI   | wc -l)
 KOKKOS_INTERNAL_COMPILER_XL    := $(shell $(CXX) -qversion        2>&1 | grep XL    | wc -l)
 KOKKOS_INTERNAL_COMPILER_CRAY  := $(shell $(CXX) -craype-verbose  2>&1 | grep "CC-" | wc -l)
 KOKKOS_INTERNAL_OS_CYGWIN      := $(shell uname | grep CYGWIN | wc -l)
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
   KOKKOS_INTERNAL_OPENMP_FLAG := -mp 
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
     KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
   else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
       # OpenMP is turned on by default in Cray compiler environment
       KOKKOS_INTERNAL_OPENMP_FLAG :=
     else
       KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
     endif
   endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
   KOKKOS_INTERNAL_CXX11_FLAG := --c++11
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
      KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
   else
     ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
       KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
     else
       KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
     endif
   endif
 endif
 
 # Check for other Execution Spaces
 KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
 
 # Check for Kokkos Architecture settings
 
 #Intel based
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
 
 #NVIDIA based
 NVCC_WRAPPER :=  $(KOKKOS_PATH)/config/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler37 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
 endif
 
 #ARM based
 KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l))
 
 #IBM based
 KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc))
 
 #AMD based
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
 
 #Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_AVX       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX2      := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 
 # Decide what ISA level we are able to support
 KOKKOS_INTERNAL_USE_ISA_X86_64     := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 KOKKOS_INTERNAL_USE_ISA_KNC        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
 
 #Incompatible flags?
 # Count every host architecture family exactly once and refuse to continue
 # when more than one was selected in KOKKOS_ARCH.
 #  - AMDAVX is already part of KOKKOS_INTERNAL_USE_ARCH_AVX (SNB+AMDAVX), so
 #    it must not be added a second time; otherwise KOKKOS_ARCH=AMDAVX alone
 #    sums to 2 and is rejected as "multiple host architectures".
 #  - KNL (KOKKOS_INTERNAL_USE_ARCH_AVX512MIC) is a host architecture as well
 #    and has to take part in the check.
 KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
 # At most one NVIDIA GPU architecture may be selected.
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
   $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
   $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
 endif
 
 #Generating the list of Flags
 
 KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
 
 # No warnings:
 KOKKOS_CXXFLAGS =
 # INTEL and CLANG warnings:
 #KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
 # GCC warnings:
 #KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
 
 KOKKOS_LIBS = -lkokkos -ldl
 KOKKOS_LDFLAGS = -L$(shell pwd)
 KOKKOS_SRC = 
 KOKKOS_HEADERS =
 
 #Generating the KokkosCore_config.h file
 
 tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
 tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
 tmp := $(shell date >> KokkosCore_config.tmp)
 tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
 
 
 tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 	tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) 
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 	tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
 	tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
   	tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
   	tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
   	tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
 	KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
 	KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib 
 	tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
 endif
 
 tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
 	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
 	tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	KOKKOS_CXXFLAGS += -G
 endif
 	KOKKOS_CXXFLAGS += -g 
 	KOKKOS_LDFLAGS += -g -ldl
 	tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
 	tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
 	KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
 	KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib 
         KOKKOS_LIBS += -lhwloc
 	tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
 	tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
 	tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
   tmp := $(shell echo "\#define KOKKOSP_ENABLE_RTLIB 1" >> KokkosCore_config.tmp )
 	KOKKOS_LIBS += -lrt
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
   KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
   KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib 
         KOKKOS_LIBS += -lmemkind
   tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 1)
   tmp := $(shell echo "\#define KOKKOS_ENABLE_PROFILING 0" >> KokkosCore_config.tmp )
 endif
 
 tmp := $(shell echo "/* Optimization Settings */" >> KokkosCore_config.tmp)
 
 ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
   tmp := $(shell echo "\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION 1" >> KokkosCore_config.tmp )
 endif
 
 tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
 	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
 	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
   tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
 	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += --relocatable-device-code=true
 	KOKKOS_LDFLAGS += --relocatable-device-code=true
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
   tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
   KOKKOS_CXXFLAGS += -expt-extended-lambda
 endif
 
 #Add Architecture flags
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
     ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 	KOKKOS_CXXFLAGS +=
 	KOKKOS_LDFLAGS +=
     else	
 	KOKKOS_CXXFLAGS += -mavx
 	KOKKOS_LDFLAGS += -mavx
     endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -mcpu=power8
 	KOKKOS_LDFLAGS  += -mcpu=power8
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
 	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
 		KOKKOS_CXXFLAGS += -xCORE-AVX2
 		KOKKOS_LDFLAGS  += -xCORE-AVX2
 	else
 		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 
 		else
 			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
 
 			else
 				# Assume that this really is a GNU compiler
 				KOKKOS_CXXFLAGS += -march=core-avx2
 				KOKKOS_LDFLAGS  += -march=core-avx2
 			endif
 		endif
 	endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
 	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
 		KOKKOS_CXXFLAGS += -xMIC-AVX512
 		KOKKOS_LDFLAGS  += -xMIC-AVX512
 	else
 		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 
 		else
 			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
 			else
 				# Assume that this is really a GNU compiler
 				KOKKOS_CXXFLAGS += -march=knl
 				KOKKOS_LDFLAGS  += -march=knl
 			endif
 		endif
 	endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -mmic
 	KOKKOS_LDFLAGS += -mmic
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_30
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_32
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_35
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_37
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_50
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_52
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_53
 endif
 endif
  
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
 ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
 KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
 else
 KOKKOS_INTERNAL_NEW_CONFIG := 1
 endif
 
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
 	tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
 endif
 
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
 
 KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
 KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
 	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
 	KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 
 	KOKKOS_LIBS += -lcudart -lcuda
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 	KOKKOS_LIBS += -lpthread
 	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
 	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
 	KOKKOS_LIBS += -lqthread
 	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
 	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
 	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
 	ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 		KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
 	else
 		KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
 	endif
 	KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
 endif
 
 #With Cygwin functions such as fdopen and fileno are not defined 
 #when strict ansi is enabled. strict ansi gets enabled with --std=c++11
 #though. So we hard undefine it here. Not sure if that has any bad side effects
 #This is needed for gtest actually, not for Kokkos itself!
 ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
   KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
 endif
 
 # Setting up dependencies
 
 KokkosCore_config.h:
 
 KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS)
 
 KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o)
 KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))
 
 include $(KOKKOS_PATH)/Makefile.targets
 
 kokkos-clean:
 	rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
 
 libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
 	ar cr libkokkos.a $(KOKKOS_OBJ_LINK)
 	ranlib libkokkos.a
 
 KOKKOS_LINK_DEPENDS=libkokkos.a
diff --git a/src/KOKKOS/pair_reax_c_kokkos.cpp b/src/KOKKOS/pair_reax_c_kokkos.cpp
index 9e0f06ef3..eba6fbb0b 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.cpp
+++ b/src/KOKKOS/pair_reax_c_kokkos.cpp
@@ -1,3934 +1,3936 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Ray Shan (SNL), Stan Moore (SNL)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_reax_c_kokkos.h"
 #include "kokkos.h"
 #include "atom_kokkos.h"
 #include "comm.h"
 #include "force.h"
 #include "neighbor.h"
 #include "neigh_request.h"
 #include "neigh_list_kokkos.h"
 #include "update.h"
 #include "integrate.h"
 #include "respa.h"
 #include "math_const.h"
 #include "math_special.h"
 #include "memory.h"
 #include "error.h"
 #include "atom_masks.h"
 #include "reaxc_defs.h"
 #include "reaxc_lookup.h"
 #include "reaxc_tool_box.h"
 
 
 #define TEAMSIZE 128
 
 /* ---------------------------------------------------------------------- */
 
 namespace LAMMPS_NS{
 using namespace MathConst;
 using namespace MathSpecial;
 
 template<class DeviceType>
 PairReaxCKokkos<DeviceType>::PairReaxCKokkos(LAMMPS *lmp) : PairReaxC(lmp)
 {
   // plain scalar state: respa flag off, no atoms allocated yet, and the
   // bond-order / hydrogen-bond list sizes start at one entry
   respa_enable = 0;
   nmax = 0;
   maxbo = 1;
   maxhb = 1;
 
   // squared cutoffs start at zero; setup() assigns the real values
   cut_nbsq = 0.0;
   cut_hbsq = 0.0;
   cut_bosq = 0.0;
 
   // Kokkos bookkeeping: atom class, execution space and data masks
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
   datamask_read = X_MASK | Q_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
 
   // dual-view scalars used as resize flags for the bond-order and
   // hydrogen-bond lists, plus their DeviceType-side views
   k_resize_bo = DAT::tdual_int_scalar("pair:resize_bo");
   d_resize_bo = k_resize_bo.view<DeviceType>();
 
   k_resize_hb = DAT::tdual_int_scalar("pair:resize_hb");
   d_resize_hb = k_resize_hb.view<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 PairReaxCKokkos<DeviceType>::~PairReaxCKokkos()
 {
   // nothing is released when the copymode flag is set
   if (copymode) return;
 
   // release the per-atom energy and virial arrays
   memory->destroy_kokkos(k_eatom,eatom);
   memory->destroy_kokkos(k_vatom,vatom);
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::allocate()
 {
   // every per-type table gets ntypes+1 entries per dimension
   const int dim = atom->ntypes + 1;
 
   // single-body parameters
   k_params_sing =
     Kokkos::DualView<params_sing*,typename DeviceType::array_layout,DeviceType>
     ("PairReaxC::params_sing",dim);
   paramssing = k_params_sing.d_view;
 
   // two-body parameters
   k_params_twbp =
     Kokkos::DualView<params_twbp**,typename DeviceType::array_layout,DeviceType>
     ("PairReaxC::params_twbp",dim,dim);
   paramstwbp = k_params_twbp.d_view;
 
   // three-body parameters
   k_params_thbp =
     Kokkos::DualView<params_thbp***,typename DeviceType::array_layout,DeviceType>
     ("PairReaxC::params_thbp",dim,dim,dim);
   paramsthbp = k_params_thbp.d_view;
 
   // four-body parameters
   k_params_fbp =
     Kokkos::DualView<params_fbp****,typename DeviceType::array_layout,DeviceType>
     ("PairReaxC::params_fbp",dim,dim,dim,dim);
   paramsfbp = k_params_fbp.d_view;
 
   // hydrogen-bond parameters
   k_params_hbp =
     Kokkos::DualView<params_hbp***,typename DeviceType::array_layout,DeviceType>
     ("PairReaxC::params_hbp",dim,dim,dim);
   paramshbp = k_params_hbp.d_view;
 
   // the eight taper coefficients, with both device and host views cached
   k_tap = DAT::tdual_ffloat_1d("pair:tap",8);
   h_tap = k_tap.h_view;
   d_tap = k_tap.d_view;
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::init_style()
 {
   PairReaxC::init_style();
 
   // the neighbor request made by the parent class is the last one in the
   // request table
   const int irequest = neighbor->nrequest - 1;
   neighflag = lmp->kokkos->neighflag;
 
   // flag the request as host-side or device-side depending on DeviceType
   const bool on_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
   const bool on_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value;
   neighbor->requests[irequest]->kokkos_host = on_host && !on_device;
   neighbor->requests[irequest]->kokkos_device = on_device;
 
   // only full and half (optionally threaded) lists are supported; both
   // variants request ghost neighbors
   const bool want_full = (neighflag == FULL);
   const bool want_half = (neighflag == HALF || neighflag == HALFTHREAD);
 
   if (!want_full && !want_half) {
     error->all(FLERR,"Cannot use chosen neighbor list style with reax/c/kk");
   } else {
     neighbor->requests[irequest]->full = want_full ? 1 : 0;
     neighbor->requests[irequest]->half = want_full ? 0 : 1;
     neighbor->requests[irequest]->full_cluster = 0;
     neighbor->requests[irequest]->ghost = 1;
   }
 
   // build the parameter tables, fill them, then prepare taper/lookup data
   allocate();
   setup();
   init_md();
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::setup()
 {
   const int ntypes = atom->ntypes;
 
   // copy the 39 general force-field parameters
   for (int ig = 0; ig < 39; ++ig)
     gp[ig] = system->reax_param.gp.l[ig];
 
   p_boc1 = gp[0];
   p_boc2 = gp[1];
 
   // van der Waals type and lg flag
   vdwflag = system->reax_param.gp.vdw_type;
   lgflag = control->lgflag;
 
   // fill the host side of every per-type table; LAMMPS types run 1..ntypes
   // and are translated to force-field indices through map[]
   for (int it = 1; it <= ntypes; ++it) {
     const int ti = map[it];
 
     // single-body: general
     k_params_sing.h_view(it).mass = system->reax_param.sbp[ti].mass;
 
     // single-body: polarization
     k_params_sing.h_view(it).chi = system->reax_param.sbp[ti].chi;
     k_params_sing.h_view(it).eta = system->reax_param.sbp[ti].eta;
 
     // single-body: bond order
     k_params_sing.h_view(it).r_s = system->reax_param.sbp[ti].r_s;
     k_params_sing.h_view(it).r_pi = system->reax_param.sbp[ti].r_pi;
     k_params_sing.h_view(it).r_pi2 = system->reax_param.sbp[ti].r_pi_pi;
     k_params_sing.h_view(it).valency = system->reax_param.sbp[ti].valency;
     k_params_sing.h_view(it).valency_val = system->reax_param.sbp[ti].valency_val;
     k_params_sing.h_view(it).valency_boc = system->reax_param.sbp[ti].valency_boc;
     k_params_sing.h_view(it).valency_e = system->reax_param.sbp[ti].valency_e;
     k_params_sing.h_view(it).nlp_opt = system->reax_param.sbp[ti].nlp_opt;
 
     // single-body: multibody
     k_params_sing.h_view(it).p_lp2 = system->reax_param.sbp[ti].p_lp2;
     k_params_sing.h_view(it).p_ovun2 = system->reax_param.sbp[ti].p_ovun2;
     k_params_sing.h_view(it).p_ovun5 = system->reax_param.sbp[ti].p_ovun5;
 
     // single-body: angular
     k_params_sing.h_view(it).p_val3 = system->reax_param.sbp[ti].p_val3;
     k_params_sing.h_view(it).p_val5 = system->reax_param.sbp[ti].p_val5;
 
     // single-body: hydrogen bond
     k_params_sing.h_view(it).p_hbond = system->reax_param.sbp[ti].p_hbond;
 
     for (int jt = 1; jt <= ntypes; ++jt) {
       const int tj = map[jt];
       two_body_parameters *twbp = &(system->reax_param.tbp[ti][tj]);
 
       // two-body: vdW
       k_params_twbp.h_view(it,jt).gamma = twbp->gamma;
       k_params_twbp.h_view(it,jt).gamma_w = twbp->gamma_w;
       k_params_twbp.h_view(it,jt).alpha = twbp->alpha;
       k_params_twbp.h_view(it,jt).r_vdw = twbp->r_vdW;
       k_params_twbp.h_view(it,jt).epsilon = twbp->D;
       k_params_twbp.h_view(it,jt).acore = twbp->acore;
       k_params_twbp.h_view(it,jt).ecore = twbp->ecore;
       k_params_twbp.h_view(it,jt).rcore = twbp->rcore;
       k_params_twbp.h_view(it,jt).lgre = twbp->lgre;
       k_params_twbp.h_view(it,jt).lgcij = twbp->lgcij;
 
       // two-body: bond order
       k_params_twbp.h_view(it,jt).r_s = twbp->r_s;
       k_params_twbp.h_view(it,jt).r_pi = twbp->r_p;
       k_params_twbp.h_view(it,jt).r_pi2 = twbp->r_pp;
       k_params_twbp.h_view(it,jt).p_bo1 = twbp->p_bo1;
       k_params_twbp.h_view(it,jt).p_bo2 = twbp->p_bo2;
       k_params_twbp.h_view(it,jt).p_bo3 = twbp->p_bo3;
       k_params_twbp.h_view(it,jt).p_bo4 = twbp->p_bo4;
       k_params_twbp.h_view(it,jt).p_bo5 = twbp->p_bo5;
       k_params_twbp.h_view(it,jt).p_bo6 = twbp->p_bo6;
       k_params_twbp.h_view(it,jt).p_boc3 = twbp->p_boc3;
       k_params_twbp.h_view(it,jt).p_boc4 = twbp->p_boc4;
       k_params_twbp.h_view(it,jt).p_boc5 = twbp->p_boc5;
       k_params_twbp.h_view(it,jt).ovc = twbp->ovc;
       k_params_twbp.h_view(it,jt).v13cor = twbp->v13cor;
 
       // two-body: bond energy
       k_params_twbp.h_view(it,jt).p_be1 = twbp->p_be1;
       k_params_twbp.h_view(it,jt).p_be2 = twbp->p_be2;
       k_params_twbp.h_view(it,jt).De_s = twbp->De_s;
       k_params_twbp.h_view(it,jt).De_p = twbp->De_p;
       k_params_twbp.h_view(it,jt).De_pp = twbp->De_pp;
 
       // two-body: multibody
       k_params_twbp.h_view(it,jt).p_ovun1 = twbp->p_ovun1;
 
       for (int kt = 1; kt <= ntypes; ++kt) {
         const int tk = map[kt];
 
         // valence angle (3-body): only the first parameter set (prm[0]) of
         // the header is copied, together with the header's count
         three_body_header *thbh = &(system->reax_param.thbp[ti][tj][tk]);
         three_body_parameters *thbp = &(thbh->prm[0]);
         k_params_thbp.h_view(it,jt,kt).cnt = thbh->cnt;
         k_params_thbp.h_view(it,jt,kt).theta_00 = thbp->theta_00;
         k_params_thbp.h_view(it,jt,kt).p_val1 = thbp->p_val1;
         k_params_thbp.h_view(it,jt,kt).p_val2 = thbp->p_val2;
         k_params_thbp.h_view(it,jt,kt).p_val4 = thbp->p_val4;
         k_params_thbp.h_view(it,jt,kt).p_val7 = thbp->p_val7;
         k_params_thbp.h_view(it,jt,kt).p_pen1 = thbp->p_pen1;
         k_params_thbp.h_view(it,jt,kt).p_coa1 = thbp->p_coa1;
 
         // hydrogen bond
         hbond_parameters *hbp = &(system->reax_param.hbp[ti][tj][tk]);
         k_params_hbp.h_view(it,jt,kt).p_hb1 = hbp->p_hb1;
         k_params_hbp.h_view(it,jt,kt).p_hb2 = hbp->p_hb2;
         k_params_hbp.h_view(it,jt,kt).p_hb3 = hbp->p_hb3;
         k_params_hbp.h_view(it,jt,kt).r0_hb = hbp->r0_hb;
 
         for (int mt = 1; mt <= ntypes; ++mt) {
           // torsion angle (4-body): again only prm[0] is used
           four_body_header *fbh =
             &(system->reax_param.fbp[ti][tj][tk][map[mt]]);
           four_body_parameters *fbp = &(fbh->prm[0]);
           k_params_fbp.h_view(it,jt,kt,mt).p_tor1 = fbp->p_tor1;
           k_params_fbp.h_view(it,jt,kt,mt).p_cot1 = fbp->p_cot1;
           k_params_fbp.h_view(it,jt,kt,mt).V1 = fbp->V1;
           k_params_fbp.h_view(it,jt,kt,mt).V2 = fbp->V2;
           k_params_fbp.h_view(it,jt,kt,mt).V3 = fbp->V3;
         }
       }
     }
   }
 
   // mark the host copies of all parameter tables as modified
   k_params_sing.template modify<LMPHostType>();
   k_params_twbp.template modify<LMPHostType>();
   k_params_thbp.template modify<LMPHostType>();
   k_params_fbp.template modify<LMPHostType>();
   k_params_hbp.template modify<LMPHostType>();
 
   // squared distance cutoffs
   cut_nbsq = control->nonb_cut * control->nonb_cut;
   cut_hbsq = control->hbond_cut * control->hbond_cut;
   cut_bosq = control->bond_cut * control->bond_cut;
 
   // bond order cutoffs; thb_cutsq is a fixed constant here rather than
   // thb_cut*thb_cut
   bo_cut = 0.01 * gp[29];
   thb_cut = control->thb_cut;
   thb_cutsq = 0.000010; //thb_cut*thb_cut;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::init_md()
 {
   // Part 1 (init_taper): compute the eight coefficients of the 7th-order
   // taper polynomial on the host from the lower/upper non-bonded radii and
   // copy them to the device.
   const F_FLOAT swa = control->nonb_low;
   const F_FLOAT swb = control->nonb_cut;
 
   if (fabs(swa) > 0.01 )
     error->warning(FLERR,"Warning: non-zero lower Taper-radius cutoff");
 
   if (swb < 0) {
     error->one(FLERR,"Negative upper Taper-radius cutoff");
   } else if (swb < 5) {
     // NOTE(review): the text says "Warning" but it is raised through
     // error->one(); confirm whether this was meant to be error->warning().
     char str[128];
     sprintf(str,"Warning: very low Taper-radius cutoff: %f\n", swb);
     error->one(FLERR,str);
   }
 
   // powers of the two radii reused by the coefficient formulas below
   const F_FLOAT swa2 = swa * swa;
   const F_FLOAT swa3 = swa * swa2;
   const F_FLOAT swb2 = swb * swb;
   const F_FLOAT swb3 = swb * swb2;
   const F_FLOAT width = swb - swa;
   const F_FLOAT d7 = powint(width,7);
 
   k_tap.h_view(7) = 20.0/d7;
   k_tap.h_view(6) = -70.0 * (swa + swb) / d7;
   k_tap.h_view(5) =  84.0 * (swa2 + 3.0*swa*swb + swb2) / d7;
   k_tap.h_view(4) = -35.0 * (swa3 + 9.0*swa2*swb + 9.0*swa*swb2 + swb3 ) / d7;
   k_tap.h_view(3) = 140.0 * (swa3*swb + 3.0*swa2*swb2 + swa*swb3 ) / d7;
   k_tap.h_view(2) =-210.0 * (swa3*swb2 + swa2*swb3) / d7;
   k_tap.h_view(1) = 140.0 * swa3 * swb3 / d7;
   k_tap.h_view(0) = (-35.0*swa3*swb2*swb2 + 21.0*swa2*swb3*swb2 +
                      7.0*swa*swb3*swb3 + swb3*swb3*swb ) / d7;
 
   k_tap.template modify<LMPHostType>();
   k_tap.template sync<DeviceType>();
 
   // Part 2: only when tabulation is requested, build the host lookup tables,
   // mirror them into Kokkos dual views and release the host copies again.
   if ( !control->tabulate ) return;
 
   const int ntypes = atom->ntypes;
 
   Init_Lookup_Tables();
   k_LR = tdual_LR_lookup_table_kk_2d("lookup:LR",ntypes+1,ntypes+1);
   d_LR = k_LR.d_view;
 
   // only the upper triangle (jtype >= itype) is populated
   for (int itype = 1; itype <= ntypes; ++itype) {
     for (int jtype = itype; jtype <= ntypes; ++jtype) {
       const LR_lookup_table &src = LR[itype][jtype];
       const int npts = src.n;
       if (npts == 0) continue;
 
       LR_lookup_table_kk &dst = k_LR.h_view(itype,jtype);
 
       // scalar table description
       dst.xmin   = src.xmin;
       dst.xmax   = src.xmax;
       dst.n      = src.n;
       dst.dx     = src.dx;
       dst.inv_dx = src.inv_dx;
       dst.a      = src.a;
       dst.m      = src.m;
       dst.c      = src.c;
 
       // per-table storage, npts entries each
       dst.k_y      = tdual_LR_data_1d("lookup:LR[i,j].y",npts);
       dst.k_H      = tdual_cubic_spline_coef_1d("lookup:LR[i,j].H",npts);
       dst.k_vdW    = tdual_cubic_spline_coef_1d("lookup:LR[i,j].vdW",npts);
       dst.k_CEvd   = tdual_cubic_spline_coef_1d("lookup:LR[i,j].CEvd",npts);
       dst.k_ele    = tdual_cubic_spline_coef_1d("lookup:LR[i,j].ele",npts);
       dst.k_CEclmb = tdual_cubic_spline_coef_1d("lookup:LR[i,j].CEclmb",npts);
 
       // device-side handles stored next to the dual views
       dst.d_y      = dst.k_y.d_view;
       dst.d_H      = dst.k_H.d_view;
       dst.d_vdW    = dst.k_vdW.d_view;
       dst.d_CEvd   = dst.k_CEvd.d_view;
       dst.d_ele    = dst.k_ele.d_view;
       dst.d_CEclmb = dst.k_CEclmb.d_view;
 
       // copy every entry of the host table into the host views
       for (int idx = 0; idx < npts; idx++) {
         dst.k_y.h_view(idx)      = src.y[idx];
         dst.k_H.h_view(idx)      = src.H[idx];
         dst.k_vdW.h_view(idx)    = src.vdW[idx];
         dst.k_CEvd.h_view(idx)   = src.CEvd[idx];
         dst.k_ele.h_view(idx)    = src.ele[idx];
         dst.k_CEclmb.h_view(idx) = src.CEclmb[idx];
       }
 
       // flag the host copies as modified ...
       dst.k_y.template modify<LMPHostType>();
       dst.k_H.template modify<LMPHostType>();
       dst.k_vdW.template modify<LMPHostType>();
       dst.k_CEvd.template modify<LMPHostType>();
       dst.k_ele.template modify<LMPHostType>();
       dst.k_CEclmb.template modify<LMPHostType>();
 
       // ... and transfer them to the device
       dst.k_y.template sync<DeviceType>();
       dst.k_H.template sync<DeviceType>();
       dst.k_vdW.template sync<DeviceType>();
       dst.k_CEvd.template sync<DeviceType>();
       dst.k_ele.template sync<DeviceType>();
       dst.k_CEclmb.template sync<DeviceType>();
     }
   }
 
   // finally publish the 2d table of descriptors itself
   k_LR.template modify<LMPHostType>();
   k_LR.template sync<DeviceType>();
 
   Deallocate_Lookup_Tables();
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 int PairReaxCKokkos<DeviceType>::Init_Lookup_Tables()
 {
   // Allocates the host-side LR[i][j] tables for every type pair with
   // 1 <= i <= j <= ntypes, samples LR_vdW_Coulomb() at r = dr, 2*dr, ...,
   // tabulate*dr and fits cubic splines through the samples.
   // Always returns 1.
   const int ntypes = atom->ntypes;
   const int ntab = control->tabulate;
   const int npts = ntab + 2;
   const double dr = control->nonb_cut / ntab;
 
   // scratch arrays shared by all type pairs (index 0 is never used)
   double *h       = (double*) smalloc( npts * sizeof(double), "lookup:h", world );
   double *fh      = (double*) smalloc( npts * sizeof(double), "lookup:fh", world );
   double *fvdw    = (double*) smalloc( npts * sizeof(double), "lookup:fvdw", world );
   double *fCEvd   = (double*) smalloc( npts * sizeof(double), "lookup:fCEvd", world );
   double *fele    = (double*) smalloc( npts * sizeof(double), "lookup:fele", world );
   double *fCEclmb = (double*) smalloc( npts * sizeof(double), "lookup:fCEclmb", world );
 
   // (ntypes+1) x (ntypes+1) zero-initialised table of descriptors
   LR = (LR_lookup_table**)
     scalloc( ntypes+1, sizeof(LR_lookup_table*), "lookup:LR", world );
   for (int row = 0; row < ntypes+1; ++row)
     LR[row] = (LR_lookup_table*)
       scalloc( ntypes+1, sizeof(LR_lookup_table), "lookup:LR[i]", world );
 
   for (int i = 1; i <= ntypes; ++i) {
     for (int j = i; j <= ntypes; ++j) {
       LR_lookup_table &tbl = LR[i][j];
 
       tbl.xmin = 0;
       tbl.xmax = control->nonb_cut;
       tbl.n = npts;
       tbl.dx = dr;
       tbl.inv_dx = ntab / control->nonb_cut;
 
       tbl.y = (LR_data*)
         smalloc( tbl.n * sizeof(LR_data), "lookup:LR[i,j].y", world );
       tbl.H = (cubic_spline_coef*)
         smalloc( tbl.n*sizeof(cubic_spline_coef),"lookup:LR[i,j].H" ,
                  world );
       tbl.vdW = (cubic_spline_coef*)
         smalloc( tbl.n*sizeof(cubic_spline_coef),"lookup:LR[i,j].vdW",
                  world);
       tbl.CEvd = (cubic_spline_coef*)
         smalloc( tbl.n*sizeof(cubic_spline_coef),"lookup:LR[i,j].CEvd",
                  world);
       tbl.ele = (cubic_spline_coef*)
         smalloc( tbl.n*sizeof(cubic_spline_coef),"lookup:LR[i,j].ele",
                  world );
       tbl.CEclmb = (cubic_spline_coef*)
         smalloc( tbl.n*sizeof(cubic_spline_coef),
                  "lookup:LR[i,j].CEclmb", world );
 
       // sample the analytic interaction on the regular grid
       int r;
       for (r = 1; r <= ntab; ++r) {
         LR_vdW_Coulomb(i, j, r * dr, &(tbl.y[r]) );
         const LR_data &sample = tbl.y[r];
         h[r] = tbl.dx;
         fh[r] = sample.H;
         fvdw[r] = sample.e_vdW;
         fCEvd[r] = sample.CEvd;
         fele[r] = sample.e_ele;
         fCEclmb[r] = sample.CEclmb;
       }
 
       // one extra knot past the last sample, duplicating the last values
       const int last = r;
       h[last] = tbl.dx;
       fh[last] = fh[last-1];
       fvdw[last] = fvdw[last-1];
       fCEvd[last] = fCEvd[last-1];
       fele[last] = fele[last-1];
       fCEclmb[last] = fCEclmb[last-1];
 
       // end-point values handed to the complete (clamped) splines
       const double v0_vdw = tbl.y[1].CEvd;
       const double v0_ele = tbl.y[1].CEclmb;
       const double vlast_vdw = fCEvd[last-1];
       const double vlast_ele = fele[last-1];
 
       Natural_Cubic_Spline( &h[1], &fh[1],
                             &(tbl.H[1]), control->tabulate+1, world );
 
       Complete_Cubic_Spline( &h[1], &fvdw[1], v0_vdw, vlast_vdw,
                              &(tbl.vdW[1]), control->tabulate+1,
                              world );
 
       Natural_Cubic_Spline( &h[1], &fCEvd[1],
                             &(tbl.CEvd[1]), control->tabulate+1,
                             world );
 
       Complete_Cubic_Spline( &h[1], &fele[1], v0_ele, vlast_ele,
                              &(tbl.ele[1]), control->tabulate+1,
                              world );
 
       Natural_Cubic_Spline( &h[1], &fCEclmb[1],
                             &(tbl.CEclmb[1]), control->tabulate+1,
                             world );
     }
   }
 
   free(h);
   free(fh);
   free(fvdw);
   free(fCEvd);
   free(fele);
   free(fCEclmb);
 
   return 1;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::Deallocate_Lookup_Tables()
 {
   // Releases everything Init_Lookup_Tables() allocated for LR.
   //
   // LR has ntypes+1 rows of ntypes+1 entries and the populated entries are
   // those with 1 <= i <= j <= ntypes, so both loops must include index
   // ntypes.  Stopping at ntypes-1 leaked row LR[ntypes] and the tables of
   // every pair (i,ntypes).  Entries that were never filled still have
   // n == 0 from scalloc() and are skipped.
   const int ntypes = atom->ntypes;
 
   for (int i = 0; i <= ntypes; ++i) {
     for (int j = i; j <= ntypes; ++j) {
       if ( !LR[i][j].n ) continue;
       sfree( LR[i][j].y, "LR[i,j].y" );
       sfree( LR[i][j].H, "LR[i,j].H" );
       sfree( LR[i][j].vdW, "LR[i,j].vdW" );
       sfree( LR[i][j].CEvd, "LR[i,j].CEvd" );
       sfree( LR[i][j].ele, "LR[i,j].ele" );
       sfree( LR[i][j].CEclmb, "LR[i,j].CEclmb" );
     }
     sfree( LR[i], "LR[i]" );
   }
   sfree( LR, "LR" );
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::LR_vdW_Coulomb( int i, int j, double r_ij, LR_data *lr )
 {
   // Evaluates, on the host, the tapered van der Waals and Coulomb terms for
   // the type pair (i,j) at distance r_ij and stores them in *lr:
   //   e_vdW / CEvd   - van der Waals energy and its force coefficient
   //   e_ele / CEclmb - Coulomb energy and force coefficient (without charges)
   //   H              - tapered shielded 1/r kernel scaled by EV_to_KCALpMOL
   const double p_vdW1 = system->reax_param.gp.l[28];
   const double p_vdW1i = 1.0 / p_vdW1;
   const int vdw_type = system->reax_param.gp.vdw_type;
   const two_body_parameters *twbp = &(system->reax_param.tbp[map[i]][map[j]]);
 
   /* taper polynomial (Horner form) */
   double Tap = k_tap.h_view[7];
   for (int k = 6; k >= 0; --k)
     Tap = Tap * r_ij + k_tap.h_view[k];
 
   /* taper derivative divided by r_ij */
   double dTap = 7*k_tap.h_view[7];
   for (int k = 6; k >= 2; --k)
     dTap = dTap * r_ij + k*k_tap.h_view[k];
   dTap += k_tap.h_view[1]/r_ij;
 
   /* van der Waals */
   const bool shielded   = (vdw_type == 1 || vdw_type == 3);
   const bool inner_wall = (vdw_type == 2 || vdw_type == 3);
 
   if (shielded) {
     const double powr_vdW1 = pow(r_ij, p_vdW1);
     const double powgi_vdW1 = pow( 1.0 / twbp->gamma_w, p_vdW1);
     const double pow_sum = powr_vdW1 + powgi_vdW1;
 
     const double fn13 = pow( pow_sum, p_vdW1i );
     const double scaled = 1.0 - fn13 / twbp->r_vdW;
     const double exp1 = exp( twbp->alpha * scaled );
     const double exp2 = exp( 0.5 * twbp->alpha * scaled );
     const double dfn13 = pow( pow_sum, p_vdW1i-1.0) * pow(r_ij, p_vdW1-2.0);
 
     lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
     lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
       Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) * dfn13;
   } else {
     // no shielding: the bare distance takes the place of fn13
     const double scaled = 1.0 - r_ij / twbp->r_vdW;
     const double exp1 = exp( twbp->alpha * scaled );
     const double exp2 = exp( 0.5 * twbp->alpha * scaled );
 
     lr->e_vdW = Tap * twbp->D * (exp1 - 2.0 * exp2);
     lr->CEvd = dTap * twbp->D * (exp1 - 2.0 * exp2) -
       Tap * twbp->D * (twbp->alpha / twbp->r_vdW) * (exp1 - exp2) / r_ij;
   }
 
   if (inner_wall) {
     const double e_core = twbp->ecore * exp(twbp->acore * (1.0-(r_ij/twbp->rcore)));
     const double de_core = -(twbp->acore/twbp->rcore) * e_core;
 
     lr->e_vdW += Tap * e_core;
     lr->CEvd += dTap * e_core + Tap * de_core / r_ij;
 
     //  lg correction, only if lgvdw is yes
     if (control->lgflag) {
       const double r_ij5 = powint( r_ij, 5 );
       const double r_ij6 = powint( r_ij, 6 );
       const double re6 = powint( twbp->lgre, 6 );
       const double e_lg = -(twbp->lgcij/( r_ij6 + re6 ));
       const double de_lg = -6.0 * e_lg *  r_ij5 / ( r_ij6 + re6 ) ;
 
       lr->e_vdW += Tap * e_lg;
       lr->CEvd += dTap * e_lg + Tap * de_lg/r_ij;
     }
   }
 
   /* Coulomb: shielded 1/r kernel */
   const double dr3gamij_1 = ( r_ij * r_ij * r_ij + twbp->gamma );
   const double dr3gamij_3 = pow( dr3gamij_1 , 0.33333333333333 );
   const double tmp = Tap / dr3gamij_3;
 
   lr->H = EV_to_KCALpMOL * tmp;
   lr->e_ele = C_ele * tmp;
   lr->CEclmb = C_ele * ( dTap -  Tap * r_ij / dr3gamij_1 ) / dr3gamij_3;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Runs one force/energy evaluation as a fixed sequence of Kokkos kernels,
 // each followed by a device fence:
 //   polarization -> (tabulated) LJ + Coulomb -> bond/hbond list build
 //   -> bond order -> bond energy -> multi-body -> angular -> torsion
 //   -> hydrogen bond -> bond force.
 // Stages launched with evflag set use parallel_reduce into `ev`; its
 // content is then added to `ev_all` and selected components are copied into
 // pvector[].  At the end ev_all is added to the global energy/virial
 // accumulators when the corresponding flags are set.
 //
 // eflag_in / vflag_in: energy and virial request flags, stored in
 // eflag/vflag and handed to ev_setup().
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
   // reset the bond / hydrogen-bond counters for this evaluation
   bocnt = hbcnt = 0;
 
   eflag = eflag_in;
   vflag = vflag_in;
 
   // with a full neighbor list the f-dot-r virial computation is disabled
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   // bring atom data and the force-field parameter tables to the device
   atomKK->sync(execution_space,datamask_read);
   k_params_sing.template sync<DeviceType>();
   k_params_twbp.template sync<DeviceType>();
   k_params_thbp.template sync<DeviceType>();
   k_params_fbp.template sync<DeviceType>();
   k_params_hbp.template sync<DeviceType>();
 
   // declare what this call modifies: only forces when no tallying is done
   if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
   else atomKK->modified(execution_space,F_MASK);
 
   // device views of the per-atom arrays used by the kernels
   x = atomKK->k_x.view<DeviceType>();
   f = atomKK->k_f.view<DeviceType>();
   q = atomKK->k_q.view<DeviceType>();
   tag = atomKK->k_tag.view<DeviceType>();
   type = atomKK->k_type.view<DeviceType>();
   nlocal = atomKK->nlocal;
   nall = atom->nlocal + atom->nghost;
   newton_pair = force->newton_pair;
 
   // inum: entries looped over by the energy kernels;
   // ignum: inum plus list->gnum, used by list-build / bond-order kernels
   const int inum = list->inum;
   const int ignum = inum + list->gnum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
   d_numneigh = k_list->d_numneigh;
   d_neighbors = k_list->d_neighbors;
   d_ilist = k_list->d_ilist;
 
   k_list->clean_copy();
 
   // the 14 per-term energies reported through pvector
   if (eflag_global) {
     for (int i = 0; i < 14; i++)
       pvector[i] = 0.0;
   }
 
   // stays set while *this is passed as functor to the kernels below;
   // cleared again at the end of the function
   copymode = 1;
 
   // ev: result of the most recent reduction; ev_all: running total
   EV_FLOAT_REAX ev;
   EV_FLOAT_REAX ev_all;
 
   // Polarization (self)
   // note: a FULL neighbor list also takes the HALFTHREAD branch here
   if (neighflag == HALF) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputePolar<HALF,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputePolar<HALF,0> >(0,inum),*this);
   } else { //if (neighflag == HALFTHREAD) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputePolar<HALFTHREAD,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputePolar<HALFTHREAD,0> >(0,inum),*this);
   }
   DeviceType::fence();
   ev_all += ev;
   pvector[13] = ev.ecoul;
 
   // LJ + Coulomb
   // tabulated or analytic kernel, selected by control->tabulate
   if (control->tabulate) {
     if (neighflag == HALF) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeTabulatedLJCoulomb<HALF,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeTabulatedLJCoulomb<HALF,0> >(0,inum),*this);
     } else if (neighflag == HALFTHREAD) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeTabulatedLJCoulomb<HALFTHREAD,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeTabulatedLJCoulomb<HALFTHREAD,0> >(0,inum),*this);
     } else if (neighflag == FULL) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeTabulatedLJCoulomb<FULL,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeTabulatedLJCoulomb<FULL,0> >(0,inum),*this);
     }
   } else {
     if (neighflag == HALF) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeLJCoulomb<HALF,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeLJCoulomb<HALF,0> >(0,inum),*this);
     } else if (neighflag == HALFTHREAD) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeLJCoulomb<HALFTHREAD,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeLJCoulomb<HALFTHREAD,0> >(0,inum),*this);
     } else if (neighflag == FULL) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeLJCoulomb<FULL,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeLJCoulomb<FULL,0> >(0,inum),*this);
     }
   }
   DeviceType::fence();
   ev_all += ev;
   pvector[10] = ev.evdwl;
   pvector[11] = ev.ecoul;
 
 
   // grow the per-atom work arrays when the atom arrays have grown
   if (atom->nmax > nmax) {
     nmax = atom->nmax;
     allocate_array();
   }
 
   // Neighbor lists for bond and hbond
 
   // try, resize if necessary
   // Each pass clears both resize flags, zeroes the work arrays, runs the
   // list-build kernel and reads the flags back.  A raised flag bumps
   // maxbo / maxhb by one, reallocates and repeats the pass.
   // NOTE(review): presumably the build kernels raise a flag when a per-atom
   // list overflows; they are defined outside this block - confirm there.
 
   int resize = 1;
   while (resize) {
     resize = 0;
 
     k_resize_bo.h_view() = 0;
     k_resize_bo.modify<LMPHostType>();
     k_resize_bo.sync<DeviceType>();
 
     k_resize_hb.h_view() = 0;
     k_resize_hb.modify<LMPHostType>();
     k_resize_hb.sync<DeviceType>();
 
     // zero
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxZero>(0,nmax),*this);
     DeviceType::fence();
 
     if (neighflag == HALF)
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsHalf<HALF> >(0,ignum),*this);
     else if (neighflag == HALFTHREAD)
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsHalf_LessAtomics<HALFTHREAD> >(0,ignum),*this);
     else //(neighflag == FULL)
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsFull>(0,ignum),*this);
     DeviceType::fence();
 
     // copy the flags back to the host
     k_resize_bo.modify<DeviceType>();
     k_resize_bo.sync<LMPHostType>();
     int resize_bo = k_resize_bo.h_view();
     if (resize_bo) maxbo++;
 
     k_resize_hb.modify<DeviceType>();
     k_resize_hb.sync<LMPHostType>();
     int resize_hb = k_resize_hb.h_view();
     if (resize_hb) maxhb++;
 
     resize = resize_bo || resize_hb;
     if (resize) allocate_array();
   }
 
   // Bond order
   // the first pass is only launched for the two half-list styles
   if (neighflag == HALF) {
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder1>(0,ignum),*this);
     DeviceType::fence();
   } else if (neighflag == HALFTHREAD) {
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder1_LessAtomics>(0,ignum),*this);
     DeviceType::fence();
   }
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder2>(0,ignum),*this);
   DeviceType::fence();
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder3>(0,ignum),*this);
   DeviceType::fence();
 
   // Bond energy
   // pvector[0] is set here and incremented again by the bond-force stage
   if (neighflag == HALF) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond1<HALF,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond1<HALF,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
     pvector[0] = ev.evdwl;
   } else { //if (neighflag == HALFTHREAD) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond1<HALFTHREAD,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond1<HALFTHREAD,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
     pvector[0] = ev.evdwl;
   }
 
   // Multi-body corrections
   // Multi1 is always a plain parallel_for; only Multi2 can reduce into ev
   if (neighflag == HALF) {
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeMulti1<HALF,0> >(0,inum),*this);
     DeviceType::fence();
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeMulti2<HALF,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeMulti2<HALF,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
   } else { //if (neighflag == HALFTHREAD) {
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeMulti1<HALFTHREAD,0> >(0,inum),*this);
     DeviceType::fence();
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeMulti2<HALFTHREAD,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeMulti2<HALFTHREAD,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
   }
   // the ereax[] components are reported per term and folded into evdwl
   pvector[2] = ev.ereax[0];
   pvector[1] = ev.ereax[1]+ev.ereax[2];
   pvector[3] = 0.0;
   ev_all.evdwl += ev.ereax[0] + ev.ereax[1] + ev.ereax[2];
 
   // Angular
   if (neighflag == HALF) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeAngular<HALF,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeAngular<HALF,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
   } else { //if (neighflag == HALFTHREAD) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeAngular<HALFTHREAD,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeAngular<HALFTHREAD,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
   }
   pvector[4] = ev.ereax[3];
   pvector[5] = ev.ereax[4];
   pvector[6] = ev.ereax[5];
   ev_all.evdwl += ev.ereax[3] + ev.ereax[4] + ev.ereax[5];
 
   // Torsion
   if (neighflag == HALF) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeTorsion<HALF,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeTorsion<HALF,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
   } else { //if (neighflag == HALFTHREAD) {
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeTorsion<HALFTHREAD,1> >(0,inum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeTorsion<HALFTHREAD,0> >(0,inum),*this);
     DeviceType::fence();
     ev_all += ev;
   }
   pvector[8] = ev.ereax[6];
   pvector[9] = ev.ereax[7];
   ev_all.evdwl += ev.ereax[6] + ev.ereax[7];
 
   // Hydrogen Bond
   // skipped entirely when the hydrogen-bond cutoff is zero; in that case
   // the two statements after the block read ev as left by the torsion stage
   if (cut_hbsq > 0.0) {
     if (neighflag == HALF) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeHydrogen<HALF,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeHydrogen<HALF,0> >(0,inum),*this);
       DeviceType::fence();
       ev_all += ev;
     } else { //if (neighflag == HALFTHREAD) {
       if (evflag)
         Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeHydrogen<HALFTHREAD,1> >(0,inum),*this,ev);
       else
         Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeHydrogen<HALFTHREAD,0> >(0,inum),*this);
       DeviceType::fence();
       ev_all += ev;
     }
   }
   pvector[7] = ev.ereax[8];
   ev_all.evdwl += ev.ereax[8];
 
   // Bond force
   // runs over ignum entries, unlike the energy stages above
   if (neighflag == HALF) {
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxUpdateBond<HALF> >(0,ignum),*this);
     DeviceType::fence();
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond2<HALF,1> >(0,ignum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond2<HALF,0> >(0,ignum),*this);
     DeviceType::fence();
     ev_all += ev;
     pvector[0] += ev.evdwl;
   } else { //if (neighflag == HALFTHREAD) {
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxUpdateBond<HALFTHREAD> >(0,ignum),*this);
     DeviceType::fence();
     if (evflag)
       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond2<HALFTHREAD,1> >(0,ignum),*this,ev);
     else
       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond2<HALFTHREAD,0> >(0,ignum),*this);
     DeviceType::fence();
     ev_all += ev;
     pvector[0] += ev.evdwl;
   }
 
   // add the accumulated totals to the global energy / virial
   if (eflag_global) {
     eng_vdwl += ev_all.evdwl;
     eng_coul += ev_all.ecoul;
   }
   if (vflag_global) {
     virial[0] += ev_all.v[0];
     virial[1] += ev_all.v[1];
     virial[2] += ev_all.v[2];
     virial[3] += ev_all.v[3];
     virial[4] += ev_all.v[4];
     virial[5] += ev_all.v[5];
   }
 
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
 
   // per-atom tallies were written on the device; copy them to the host
   if (eflag_atom) {
     k_eatom.template modify<DeviceType>();
     k_eatom.template sync<LMPHostType>();
   }
 
   if (vflag_atom) {
     k_vatom.template modify<DeviceType>();
     k_vatom.template sync<LMPHostType>();
   }
 
   copymode = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputePolar<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   // Self (polarization) energy of one atom: chi*q + (eta/2)*q^2, scaled by
   // KCALpMOL_to_EV.  It is added to ev.ecoul when eflag is set and tallied
   // per atom when eflag_atom is set; no force is produced.
   const int iatom = d_ilist[ii];
   const F_FLOAT charge = q(iatom);
 
   // single-body parameters of this atom's type
   const int itype = type(iatom);
   const F_FLOAT chi = paramssing(itype).chi;
   const F_FLOAT eta = paramssing(itype).eta;
 
   const F_FLOAT epol = KCALpMOL_to_EV*(chi*charge+(eta/2.0)*charge*charge);
 
   if (eflag) ev.ecoul += epol;
   if (eflag_atom) this->template e_tally_single<NEIGHFLAG>(ev,iatom,epol);
 
 }
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputePolar<NEIGHFLAG,EVFLAG>, const int &ii) const {
   // parallel_for entry point: forward to the reducing overload with a local
   // accumulator whose content is dropped on return
   EV_FLOAT_REAX ev_discard;
   this->template operator()<NEIGHFLAG,EVFLAG>(PairReaxComputePolar<NEIGHFLAG,EVFLAG>(), ii, ev_discard);
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Tapered van der Waals + shielded Coulomb interaction of atom d_ilist[ii]
 // with all of its neighbors (analytic, non-tabulated path).
 //
 // For every neighbor within cut_nbsq the kernel evaluates the taper
 // polynomial, the van der Waals term selected by vdwflag (shielding for
 // 1 and 3, inner wall for 2 and 3, optional lg correction) and the Coulomb
 // term.  fvdwl and fcoul are force coefficients already divided by the
 // distance, so they multiply the displacement components directly.
 //
 // ii: index into d_ilist.  ev: reduction accumulator; energies are added
 // when eflag is set (halved for a FULL list, where each pair is seen twice).
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeLJCoulomb<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   // The f array is atomic for Half/Thread neighbor style
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
   F_FLOAT powr_vdw, powgi_vdw, fn13, dfn13, exp1, exp2, etmp;
   F_FLOAT evdwl, fvdwl;
   evdwl = fvdwl = 0.0;
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const F_FLOAT qi = q(i);
   const int itype = type(i);
   const int itag = tag(i);
   const int jnum = d_numneigh[i];
 
   // force on atom i is accumulated locally and written once after the loop
   F_FLOAT fxtmp, fytmp, fztmp;
   fxtmp = fytmp = fztmp = 0.0;
 
   for (int jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
     j &= NEIGHMASK;
     const int jtype = type(j);
     const int jtag = tag(j);
     const F_FLOAT qj = q(j);
 
     if (NEIGHFLAG != FULL) {
       // skip half of the interactions
       // (only for neighbors with index >= nlocal: decided by tag parity,
       //  or by coordinate ordering when both tags are equal)
       if (j >= nlocal) {
         if (itag > jtag) {
           if ((itag+jtag) % 2 == 0) continue;
         } else if (itag < jtag) {
           if ((itag+jtag) % 2 == 1) continue;
         } else {
           if (x(j,2) < ztmp) continue;
           if (x(j,2) == ztmp && x(j,1)  < ytmp) continue;
           if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue;
         }
       }
     }
 
     const X_FLOAT delx = x(j,0) - xtmp;
     const X_FLOAT dely = x(j,1) - ytmp;
     const X_FLOAT delz = x(j,2) - ztmp;
     const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
 
     if (rsq > cut_nbsq) continue;
     const F_FLOAT rij = sqrt(rsq);
 
     // LJ energy/force
     // taper polynomial in Horner form
     F_FLOAT Tap = d_tap[7] * rij + d_tap[6];
     Tap = Tap * rij + d_tap[5];
     Tap = Tap * rij + d_tap[4];
     Tap = Tap * rij + d_tap[3];
     Tap = Tap * rij + d_tap[2];
     Tap = Tap * rij + d_tap[1];
     Tap = Tap * rij + d_tap[0];
 
     // taper derivative divided by rij
     F_FLOAT dTap = 7*d_tap[7] * rij + 6*d_tap[6];
     dTap = dTap * rij + 5*d_tap[5];
     dTap = dTap * rij + 4*d_tap[4];
     dTap = dTap * rij + 3*d_tap[3];
     dTap = dTap * rij + 2*d_tap[2];
     dTap += d_tap[1]/rij;
 
     const F_FLOAT gamma_w = paramstwbp(itype,jtype).gamma_w;
     const F_FLOAT alpha = paramstwbp(itype,jtype).alpha;
     const F_FLOAT r_vdw = paramstwbp(itype,jtype).r_vdw;
     const F_FLOAT epsilon = paramstwbp(itype,jtype).epsilon;
 
     // shielding
     if (vdwflag == 1 || vdwflag == 3) {
       powr_vdw = pow(rij,gp[28]);
       powgi_vdw = pow(1.0/gamma_w,gp[28]);
       fn13 = pow(powr_vdw+powgi_vdw,1.0/gp[28]);
       exp1 = exp(alpha*(1.0-fn13/r_vdw));
       exp2 = exp(0.5*alpha*(1.0-fn13/r_vdw));
       dfn13 = pow(powr_vdw+powgi_vdw,1.0/gp[28]-1.0)*pow(rij,gp[28]-2.0);
       etmp = epsilon*(exp1-2.0*exp2);
       evdwl = Tap*etmp;
       fvdwl = dTap*etmp-Tap*epsilon*(alpha/r_vdw)*(exp1-exp2)*dfn13;
     } else {
       exp1 = exp(alpha*(1.0-rij/r_vdw));
       exp2 = exp(0.5*alpha*(1.0-rij/r_vdw));
       etmp = epsilon*(exp1-2.0*exp2);
       evdwl = Tap*etmp;
       // The energy derivative must be divided by rij (not multiplied) to
       // give a force-over-distance coefficient, matching the host-side
       // LR_vdW_Coulomb() and every other term of this kernel.
       fvdwl = dTap*etmp-Tap*epsilon*(alpha/r_vdw)*(exp1-exp2)/rij;
     }
     // inner wall
     if (vdwflag == 2 || vdwflag == 3) {
       const F_FLOAT ecore = paramstwbp(itype,jtype).ecore;
       const F_FLOAT acore = paramstwbp(itype,jtype).acore;
       const F_FLOAT rcore = paramstwbp(itype,jtype).rcore;
       const F_FLOAT e_core = ecore*exp(acore*(1.0-(rij/rcore)));
       const F_FLOAT de_core = -(acore/rcore)*e_core;
       evdwl += Tap*e_core;
       fvdwl += dTap*e_core+Tap*de_core/rij;
 
       // lg correction, only when lgflag is set
       if (lgflag) {
         const F_FLOAT lgre = paramstwbp(itype,jtype).lgre;
         const F_FLOAT lgcij = paramstwbp(itype,jtype).lgcij;
         const F_FLOAT rij5 = rsq*rsq*rij;
         const F_FLOAT rij6 = rij5*rij;
         const F_FLOAT re6 = lgre*lgre*lgre*lgre*lgre*lgre;
         const F_FLOAT elg = -lgcij/(rij6+re6);
         const F_FLOAT delg = -6.0*elg*rij5/(rij6+re6);
         evdwl += Tap*elg;
         fvdwl += dTap*elg+Tap*delg/rij;
       }
     }
 
     // Coulomb energy/force
     // shielded kernel: (rij^3 + gamma)^(1/3) replaces the bare distance
     const F_FLOAT shld = paramstwbp(itype,jtype).gamma;
     const F_FLOAT denom1 = rij * rij * rij + shld;
     const F_FLOAT denom3 = pow(denom1,0.3333333333333);
     const F_FLOAT ecoul = C_ele * qi*qj*Tap/denom3;
     const F_FLOAT fcoul = C_ele * qi*qj*(dTap-Tap*rij/denom1)/denom3;
 
     const F_FLOAT ftotal = fvdwl + fcoul;
     fxtmp += delx*ftotal;
     fytmp += dely*ftotal;
     fztmp += delz*ftotal;
     // with a half list the reaction force on j is applied here as well
     if (NEIGHFLAG != FULL) {
       a_f(j,0) -= delx*ftotal;
       a_f(j,1) -= dely*ftotal;
       a_f(j,2) -= delz*ftotal;
     }
 
     if (NEIGHFLAG == FULL) {
       if (eflag) ev.evdwl += 0.5*evdwl;
       if (eflag) ev.ecoul += 0.5*ecoul;
     } else {
       if (eflag) ev.evdwl += evdwl;
       if (eflag) ev.ecoul += ecoul;
     }
 
     if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,j,evdwl+ecoul,-ftotal,delx,dely,delz);
   }
 
   a_f(i,0) += fxtmp;
   a_f(i,1) += fytmp;
   a_f(i,2) += fztmp;
 }
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeLJCoulomb<NEIGHFLAG,EVFLAG>, const int &ii) const {
   // parallel_for entry point: forward to the reducing overload with a local
   // accumulator whose content is dropped on return
   EV_FLOAT_REAX ev_discard;
   this->template operator()<NEIGHFLAG,EVFLAG>(PairReaxComputeLJCoulomb<NEIGHFLAG,EVFLAG>(), ii, ev_discard);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Tabulated van der Waals + Coulomb kernel for the atom at d_ilist[ii].
 // Pair energies and force factors are evaluated from the cubic-spline
 // coefficients stored in d_LR for the (smaller type, larger type) pair.
 // Energies are added to ev when eflag is set; ev_tally is called when
 // vflag_either or eflag_atom is set.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTabulatedLJCoulomb<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   // force view whose memory traits follow AtomicF<NEIGHFLAG>
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   const int itag = tag(i);
   const F_FLOAT qi = q(i);
   const X_FLOAT xi = x(i,0);
   const X_FLOAT yi = x(i,1);
   const X_FLOAT zi = x(i,2);
   const int num_neigh = d_numneigh[i];
 
   // force on atom i, summed locally and written back once after the loop
   F_FLOAT fi_x = 0.0;
   F_FLOAT fi_y = 0.0;
   F_FLOAT fi_z = 0.0;
 
   for (int jj = 0; jj < num_neigh; jj++) {
     const int j = d_neighbors(i,jj) & NEIGHMASK;
     const int jtype = type(j);
     const int jtag = tag(j);
     const F_FLOAT qj = q(j);
 
     // for non-full lists, drop half of the pairs whose partner index is >= nlocal
     if (NEIGHFLAG != FULL && j >= nlocal) {
       if (itag > jtag) {
         if ((itag+jtag) % 2 == 0) continue;
       } else if (itag < jtag) {
         if ((itag+jtag) % 2 == 1) continue;
       } else {
         // equal tags: decide by comparing coordinates z, then y, then x
         if (x(j,2) < zi) continue;
         if (x(j,2) == zi) {
           if (x(j,1) < yi) continue;
           if (x(j,1) == yi && x(j,0) < xi) continue;
         }
       }
     }
 
     const X_FLOAT del_x = x(j,0) - xi;
     const X_FLOAT del_y = x(j,1) - yi;
     const X_FLOAT del_z = x(j,2) - zi;
     const F_FLOAT rsq = del_x*del_x + del_y*del_y + del_z*del_z;
 
     if (rsq > cut_nbsq) continue;
     const F_FLOAT rij = sqrt(rsq);
 
     // table is indexed by (smaller type, larger type)
     const int type_lo = (itype < jtype) ? itype : jtype;
     const int type_hi = (itype < jtype) ? jtype : itype;
     const LR_lookup_table_kk tbl = d_LR(type_lo,type_hi);
 
     /* Cubic Spline Interpolation */
     int bin = (int)(rij * tbl.inv_dx);
     if (bin == 0) bin = 1;
     const F_FLOAT base = (double)(bin+1) * tbl.dx;
     const F_FLOAT dif = rij - base;
 
     const cubic_spline_coef c_vdW = tbl.d_vdW[bin];
     const cubic_spline_coef c_ele = tbl.d_ele[bin];
     const cubic_spline_coef c_CEvd = tbl.d_CEvd[bin];
     const cubic_spline_coef c_CEclmb = tbl.d_CEclmb[bin];
 
     // each cubic is evaluated in nested (Horner) form
     const F_FLOAT evdwl = ((c_vdW.d*dif + c_vdW.c)*dif + c_vdW.b)*dif +
       c_vdW.a;
 
     const F_FLOAT ecoul = (((c_ele.d*dif + c_ele.c)*dif + c_ele.b)*dif +
       c_ele.a)*qi*qj;
 
     const F_FLOAT fvdwl = ((c_CEvd.d*dif + c_CEvd.c)*dif + c_CEvd.b)*dif +
       c_CEvd.a;
 
     const F_FLOAT fcoul = (((c_CEclmb.d*dif+c_CEclmb.c)*dif+c_CEclmb.b)*dif +
       c_CEclmb.a)*qi*qj;
 
     const F_FLOAT ftotal = fvdwl + fcoul;
     fi_x += del_x*ftotal;
     fi_y += del_y*ftotal;
     fi_z += del_z*ftotal;
     if (NEIGHFLAG != FULL) {
       // non-full lists also apply the opposite force to atom j
       a_f(j,0) -= del_x*ftotal;
       a_f(j,1) -= del_y*ftotal;
       a_f(j,2) -= del_z*ftotal;
     }
 
     if (eflag) {
       if (NEIGHFLAG == FULL) {
         // with a full list the pair energy is added at half weight
         ev.evdwl += 0.5*evdwl;
         ev.ecoul += 0.5*ecoul;
       } else {
         ev.evdwl += evdwl;
         ev.ecoul += ecoul;
       }
     }
 
     if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,j,evdwl+ecoul,-ftotal,del_x,del_y,del_z);
   }
 
   a_f(i,0) += fi_x;
   a_f(i,1) += fi_y;
   a_f(i,2) += fi_z;
 }
 
 // Overload without an energy/virial accumulator argument: forwards to the
 // three-argument overload using a local EV_FLOAT_REAX that is not returned.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTabulatedLJCoulomb<NEIGHFLAG,EVFLAG>, const int &ii) const {
   typedef PairReaxComputeTabulatedLJCoulomb<NEIGHFLAG,EVFLAG> tag_type;
   EV_FLOAT_REAX local_ev;
   this->template operator()<NEIGHFLAG,EVFLAG>(tag_type(), ii, local_ev);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)creates the per-atom and per-bond device views used by the bond-order
 // kernels. Per-atom views have nmax entries; per-bond views are nmax x maxbo
 // (or nmax x 3*maxbo for the Cdbo* arrays). The hydrogen-bond views are only
 // created when cut_hbsq > 0.0.
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::allocate_array()
 {
   if (cut_hbsq > 0.0) {
     d_hb_first = typename AT::t_int_1d("reax/c/kk:hb_first",nmax);
     d_hb_num = typename AT::t_int_1d("reax/c/kk:hb_num",nmax);
     d_hb_list = typename AT::t_int_1d("reax/c/kk:hb_list",nmax*maxhb);
   }
   d_bo_first = typename AT::t_int_1d("reax/c/kk:bo_first",nmax);
   d_bo_num = typename AT::t_int_1d("reax/c/kk:bo_num",nmax);
   d_bo_list = typename AT::t_int_1d("reax/c/kk:bo_list",nmax*maxbo);
 
   d_BO = typename AT::t_ffloat_2d_dl("reax/c/kk:BO",nmax,maxbo);
   // label fixed: previously duplicated the "reax/c/kk:BO" label of d_BO
   d_BO_s = typename AT::t_ffloat_2d_dl("reax/c/kk:BO_s",nmax,maxbo);
   d_BO_pi = typename AT::t_ffloat_2d_dl("reax/c/kk:BO_pi",nmax,maxbo);
   d_BO_pi2 = typename AT::t_ffloat_2d_dl("reax/c/kk:BO_pi2",nmax,maxbo);
 
   d_dln_BOp_pix = typename AT::t_ffloat_2d_dl("reax/c/kk:d_dln_BOp_pix",nmax,maxbo);
   d_dln_BOp_piy = typename AT::t_ffloat_2d_dl("reax/c/kk:d_dln_BOp_piy",nmax,maxbo);
   d_dln_BOp_piz = typename AT::t_ffloat_2d_dl("reax/c/kk:d_dln_BOp_piz",nmax,maxbo);
 
   d_dln_BOp_pi2x = typename AT::t_ffloat_2d_dl("reax/c/kk:d_dln_BOp_pi2x",nmax,maxbo);
   d_dln_BOp_pi2y = typename AT::t_ffloat_2d_dl("reax/c/kk:d_dln_BOp_pi2y",nmax,maxbo);
   d_dln_BOp_pi2z = typename AT::t_ffloat_2d_dl("reax/c/kk:d_dln_BOp_pi2z",nmax,maxbo);
 
   d_C1dbo = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C1dbo",nmax,maxbo);
   d_C2dbo = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C2dbo",nmax,maxbo);
   d_C3dbo = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C3dbo",nmax,maxbo);
 
   d_C1dbopi = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C1dbopi",nmax,maxbo);
   d_C2dbopi = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C2dbopi",nmax,maxbo);
   d_C3dbopi = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C3dbopi",nmax,maxbo);
   d_C4dbopi = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C4dbopi",nmax,maxbo);
 
   d_C1dbopi2 = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C1dbopi2",nmax,maxbo);
   d_C2dbopi2 = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C2dbopi2",nmax,maxbo);
   d_C3dbopi2 = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C3dbopi2",nmax,maxbo);
   d_C4dbopi2 = typename AT::t_ffloat_2d_dl("reax/c/kk:d_C4dbopi2",nmax,maxbo);
 
   d_dBOpx = typename AT::t_ffloat_2d_dl("reax/c/kk:dBOpx",nmax,maxbo);
   d_dBOpy = typename AT::t_ffloat_2d_dl("reax/c/kk:dBOpy",nmax,maxbo);
   d_dBOpz = typename AT::t_ffloat_2d_dl("reax/c/kk:dBOpz",nmax,maxbo);
 
   d_dDeltap_self = typename AT::t_ffloat_2d_dl("reax/c/kk:dDeltap_self",nmax,3);
   d_Deltap_boc = typename AT::t_ffloat_1d("reax/c/kk:Deltap_boc",nmax);
   d_Deltap = typename AT::t_ffloat_1d("reax/c/kk:Deltap",nmax);
   d_total_bo = typename AT::t_ffloat_1d("reax/c/kk:total_bo",nmax);
 
   d_Cdbo = typename AT::t_ffloat_2d_dl("reax/c/kk:Cdbo",nmax,3*maxbo);
   d_Cdbopi = typename AT::t_ffloat_2d_dl("reax/c/kk:Cdbopi",nmax,3*maxbo);
   d_Cdbopi2 = typename AT::t_ffloat_2d_dl("reax/c/kk:Cdbopi2",nmax,3*maxbo);
 
   d_Delta = typename AT::t_ffloat_1d("reax/c/kk:Delta",nmax);
   d_Delta_boc = typename AT::t_ffloat_1d("reax/c/kk:Delta_boc",nmax);
   d_dDelta_lp = typename AT::t_ffloat_1d("reax/c/kk:dDelta_lp",nmax);
   d_Delta_lp = typename AT::t_ffloat_1d("reax/c/kk:Delta_lp",nmax);
   d_Delta_lp_temp = typename AT::t_ffloat_1d("reax/c/kk:Delta_lp_temp",nmax);
   d_CdDelta = typename AT::t_ffloat_1d("reax/c/kk:CdDelta",nmax);
   d_sum_ovun = typename AT::t_ffloat_2d_dl("reax/c/kk:sum_ovun",nmax,3);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resets the per-atom accumulators of atom n before the bond lists are built:
 // total bond order, CdDelta and the three components of dDeltap_self.
 // When the neighbor list is not FULL, the per-atom bond and hydrogen-bond
 // counters are reset as well.
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxZero, const int &n) const {
   d_total_bo(n) = 0.0;
   d_CdDelta(n) = 0.0;
   if (neighflag != FULL) {
     d_bo_num(n) = 0;
     // d_hb_num is only allocated by allocate_array() when cut_hbsq > 0.0,
     // so it must not be touched otherwise
     if (cut_hbsq > 0.0) d_hb_num(n) = 0;
   }
   for (int j = 0; j < 3; j++)
     d_dDeltap_self(n,j) = 0.0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Builds the bond list (and, when cut_hbsq > 0.0, the hydrogen-bond list) of
 // the atom at d_ilist[ii] from its neighbor list, and stores the uncorrected
 // bond orders and their derivatives for every accepted bond.
 // Each atom writes only into its own slots [i*maxbo, (i+1)*maxbo) of the
 // bond arrays. If a list would overflow, the matching d_resize_* flag is set
 // and the kernel returns early.
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBuildListsFull, const int &ii) const {
 
   // a resize has already been requested: nothing useful can be done
   if (d_resize_bo() || d_resize_hb())
     return;
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const int itype = type(i);
   const int jnum = d_numneigh[i];
 
   F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
   F_FLOAT total_bo = 0.0;
 
   // j_index walks through atom i's slots of d_bo_list
   int j_index = i*maxbo;
   d_bo_first[i] = j_index;
   const int bo_first_i = j_index;
 
   int ihb = -1;
   int jhb = -1;
   int hb_index = i*maxhb;
 
   // hb_first_i is only assigned (and only used) when ihb == 1
   int hb_first_i;
   if (cut_hbsq > 0.0) {
     ihb = paramssing(itype).p_hbond;
     if (ihb == 1) {
       d_hb_first[i] = hb_index;
       hb_first_i = hb_index;
     }
   }
 
   for (int jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
     j &= NEIGHMASK;
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
 
     // atoms with index < nlocal use the larger of the bond and hbond cutoffs
     double cutoffsq;
     if(i < nlocal) cutoffsq = MAX(cut_bosq,cut_hbsq);
     else cutoffsq = cut_bosq;
     if (rsq > cutoffsq) continue;
 
     const int jtype = type(j);
 
     // hbond list
     if (i < nlocal && cut_hbsq > 0.0 && (ihb == 1 || ihb == 2) && rsq <= cut_hbsq) {
       jhb = paramssing(jtype).p_hbond;
       if( ihb == 1 && jhb == 2) {
         const int jj_index = hb_index - hb_first_i;
         if (jj_index >= maxhb) {
           d_resize_hb() = 1;
           return;
         }
         d_hb_list[hb_index] = j;
         hb_index++;
       }
     }
 
     // bond_list
     const F_FLOAT rij = sqrt(rsq);
     const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
     const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
     const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
     const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
     const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
     const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
     const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
     const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
     const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
 
     // each bond-order term is only evaluated when both atom types have a
     // positive radius for it; otherwise the term and its exponent are zero
     if (paramssing(itype).r_s > 0.0  && paramssing(jtype).r_s > 0.0) {
       C12 = p_bo1*pow(rij/r_s,p_bo2);
       BO_s = (1.0+bo_cut)*exp(C12);
     }
     else BO_s = C12 = 0.0;
 
     if (paramssing(itype).r_pi > 0.0  && paramssing(jtype).r_pi > 0.0) {
       C34 = p_bo3*pow(rij/r_pi,p_bo4);
       BO_pi = exp(C34);
     }
     else BO_pi = C34 = 0.0;
 
     if (paramssing(itype).r_pi2 > 0.0  && paramssing(jtype).r_pi2 > 0.0) {
       C56 = p_bo5*pow(rij/r_pi2,p_bo6);
       BO_pi2 = exp(C56);
     }
     else BO_pi2 = C56 = 0.0;
 
     // pairs whose summed bond order is below bo_cut are not stored
     BO = BO_s + BO_pi + BO_pi2;
     if (BO < bo_cut) continue;
 
     // position of this bond within atom i's row of the per-bond arrays
     const int jj_index = j_index - bo_first_i;
 
     if (jj_index >= maxbo) {
       d_resize_bo() = 1;
       return;
     }
 
     d_bo_list[j_index] = j;
 
     // from BondOrder1
 
     d_BO(i,jj_index) = BO;
     d_BO_s(i,jj_index) = BO_s;
     d_BO_pi(i,jj_index) = BO_pi;
     d_BO_pi2(i,jj_index) = BO_pi2;
 
     F_FLOAT Cln_BOp_s = p_bo2 * C12 / rij / rij;
     F_FLOAT Cln_BOp_pi = p_bo4 * C34 / rij / rij;
     F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 / rij / rij;
 
     if (nlocal == 0)
       Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
 
     for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
     for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
     for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
     for (int d = 0; d < 3; d++) d_dDeltap_self(i,d) += dBOp_i[d];
 
     d_dln_BOp_pix(i,jj_index) = dln_BOp_pi_i[0];
     d_dln_BOp_piy(i,jj_index) = dln_BOp_pi_i[1];
     d_dln_BOp_piz(i,jj_index) = dln_BOp_pi_i[2];
 
     d_dln_BOp_pi2x(i,jj_index) = dln_BOp_pi2_i[0];
     d_dln_BOp_pi2y(i,jj_index) = dln_BOp_pi2_i[1];
     d_dln_BOp_pi2z(i,jj_index) = dln_BOp_pi2_i[2];
 
     d_dBOpx(i,jj_index) = dBOp_i[0];
     d_dBOpy(i,jj_index) = dBOp_i[1];
     d_dBOpz(i,jj_index) = dBOp_i[2];
 
     // the stored total and sigma bond orders are shifted down by bo_cut
     d_BO(i,jj_index) -= bo_cut;
     d_BO_s(i,jj_index) -= bo_cut;
     total_bo += d_BO(i,jj_index);
 
     j_index++;
   }
 
   d_bo_num[i] = j_index - d_bo_first[i];
   if (cut_hbsq > 0.0 && ihb == 1) d_hb_num[i] = hb_index - d_hb_first[i];
 
   d_total_bo[i] += total_bo;
 
   // deviations of the total bond order from the type's valency parameters
   const F_FLOAT val_i = paramssing(itype).valency;
   d_Deltap[i] = d_total_bo[i] - val_i;
   d_Deltap_boc[i] = d_total_bo[i] - paramssing(itype).valency_val;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Builds bond and hydrogen-bond lists from a non-full neighbor list.
 // Every accepted pair (i,j) is recorded in the lists of BOTH atoms, together
 // with the uncorrected bond orders and their derivatives (sign-flipped for j).
 // List slots are claimed through the per-atom counters d_bo_num / d_hb_num:
 // with plain increments when NEIGHFLAG == HALF, with Kokkos::atomic_fetch_add
 // otherwise. If a list would overflow, the matching d_resize_* flag is set
 // and the kernel returns early.
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBuildListsHalf<NEIGHFLAG>, const int &ii) const {
 
   // a resize has already been requested: nothing useful can be done
   if (d_resize_bo() || d_resize_hb())
     return;
 
   // views of the accumulators that are also written for the partner atom j;
   // their memory traits follow AtomicF<NEIGHFLAG>
   Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_dDeltap_self = d_dDeltap_self;
   Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_total_bo = d_total_bo;
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const int itype = type(i);
   const int jnum = d_numneigh[i];
 
   F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
   F_FLOAT total_bo = 0.0;
 
   int j_index,i_index;
   d_bo_first[i] = i*maxbo;
   const int bo_first_i = d_bo_first[i];
 
   int ihb = -1;
   int jhb = -1;
 
   // hb_first_i is only assigned (and only used) when ihb == 1
   int hb_first_i;
   if (cut_hbsq > 0.0) {
     ihb = paramssing(itype).p_hbond;
     if (ihb == 1) {
       d_hb_first[i] = i*maxhb;
       hb_first_i = d_hb_first[i];
     }
   }
 
   for (int jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
     j &= NEIGHMASK;
 
     d_bo_first[j] = j*maxbo;
     // d_hb_first is only allocated by allocate_array() when cut_hbsq > 0.0,
     // and is only read below under the same condition
     if (cut_hbsq > 0.0) d_hb_first[j] = j*maxhb;
     const int jtype = type(j);
 
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
 
     // atoms with index < nlocal use the larger of the bond and hbond cutoffs
     double cutoffsq;
     if(i < nlocal) cutoffsq = MAX(cut_bosq,cut_hbsq);
     else cutoffsq = cut_bosq;
     if (rsq > cutoffsq) continue;
 
     // hbond list
     if (i < nlocal && cut_hbsq > 0.0 && (ihb == 1 || ihb == 2) && rsq <= cut_hbsq) {
       jhb = paramssing(jtype).p_hbond;
       if( ihb == 1 && jhb == 2) {
         // j goes into atom i's hbond list
         if (NEIGHFLAG == HALF) {
           j_index = hb_first_i + d_hb_num[i];
           d_hb_num[i]++;
         } else {
           j_index = hb_first_i + Kokkos::atomic_fetch_add(&d_hb_num[i],1);
         }
 
         const int jj_index = j_index - hb_first_i;
 
         if (jj_index >= maxhb) {
           d_resize_hb() = 1;
           return;
         }
 
         d_hb_list[j_index] = j;
       } else if ( j < nlocal && ihb == 2 && jhb == 1) {
         // reversed roles: i goes into atom j's hbond list
         if (NEIGHFLAG == HALF) {
           i_index = d_hb_first[j] + d_hb_num[j];
           d_hb_num[j]++;
         } else {
           i_index = d_hb_first[j] + Kokkos::atomic_fetch_add(&d_hb_num[j],1);
         }
 
         const int ii_index = i_index - d_hb_first[j];
 
         if (ii_index >= maxhb) {
           d_resize_hb() = 1;
           return;
         }
 
         d_hb_list[i_index] = i;
       }
     }
 
     // bond_list
     const F_FLOAT rij = sqrt(rsq);
     const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
     const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
     const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
     const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
     const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
     const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
     const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
     const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
     const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
 
     // each bond-order term is only evaluated when both atom types have a
     // positive radius for it; otherwise the term and its exponent are zero
     if (paramssing(itype).r_s > 0.0  && paramssing(jtype).r_s > 0.0) {
       C12 = p_bo1*pow(rij/r_s,p_bo2);
       BO_s = (1.0+bo_cut)*exp(C12);
     }
     else BO_s = C12 = 0.0;
 
     if (paramssing(itype).r_pi > 0.0  && paramssing(jtype).r_pi > 0.0) {
       C34 = p_bo3*pow(rij/r_pi,p_bo4);
       BO_pi = exp(C34);
     }
     else BO_pi = C34 = 0.0;
 
     if (paramssing(itype).r_pi2 > 0.0  && paramssing(jtype).r_pi2 > 0.0) {
       C56 = p_bo5*pow(rij/r_pi2,p_bo6);
       BO_pi2 = exp(C56);
     }
     else BO_pi2 = C56 = 0.0;
 
     // pairs whose summed bond order is below bo_cut are not stored
     BO = BO_s + BO_pi + BO_pi2;
     if (BO < bo_cut) continue;
 
     // claim one bond slot in each atom's list
     if (NEIGHFLAG == HALF) {
       j_index = bo_first_i + d_bo_num[i];
       i_index = d_bo_first[j] + d_bo_num[j];
       d_bo_num[i]++;
       d_bo_num[j]++;
     } else {
       j_index = bo_first_i + Kokkos::atomic_fetch_add(&d_bo_num[i],1);
       i_index = d_bo_first[j] + Kokkos::atomic_fetch_add(&d_bo_num[j],1);
     }
 
     // positions within row i and row j of the per-bond arrays
     const int jj_index = j_index - bo_first_i;
     const int ii_index = i_index - d_bo_first[j];
 
     if (jj_index >= maxbo || ii_index >= maxbo) {
       d_resize_bo() = 1;
       return;
     }
 
     d_bo_list[j_index] = j;
     d_bo_list[i_index] = i;
 
     // from BondOrder1
 
     d_BO(i,jj_index) = BO;
     d_BO_s(i,jj_index) = BO_s;
     d_BO_pi(i,jj_index) = BO_pi;
     d_BO_pi2(i,jj_index) = BO_pi2;
 
     d_BO(j,ii_index) = BO;
     d_BO_s(j,ii_index) = BO_s;
     d_BO_pi(j,ii_index) = BO_pi;
     d_BO_pi2(j,ii_index) = BO_pi2;
 
     F_FLOAT Cln_BOp_s = p_bo2 * C12 / rij / rij;
     F_FLOAT Cln_BOp_pi = p_bo4 * C34 / rij / rij;
     F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 / rij / rij;
 
     if (nlocal == 0)
       Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
 
     for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
     for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
     for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
     for (int d = 0; d < 3; d++) a_dDeltap_self(i,d) += dBOp_i[d];
     for (int d = 0; d < 3; d++) a_dDeltap_self(j,d) += -dBOp_i[d];
 
     // derivatives stored for j are the negatives of those stored for i
     d_dln_BOp_pix(i,jj_index) = dln_BOp_pi_i[0];
     d_dln_BOp_piy(i,jj_index) = dln_BOp_pi_i[1];
     d_dln_BOp_piz(i,jj_index) = dln_BOp_pi_i[2];
 
     d_dln_BOp_pix(j,ii_index) = -dln_BOp_pi_i[0];
     d_dln_BOp_piy(j,ii_index) = -dln_BOp_pi_i[1];
     d_dln_BOp_piz(j,ii_index) = -dln_BOp_pi_i[2];
 
     d_dln_BOp_pi2x(i,jj_index) = dln_BOp_pi2_i[0];
     d_dln_BOp_pi2y(i,jj_index) = dln_BOp_pi2_i[1];
     d_dln_BOp_pi2z(i,jj_index) = dln_BOp_pi2_i[2];
 
     d_dln_BOp_pi2x(j,ii_index) = -dln_BOp_pi2_i[0];
     d_dln_BOp_pi2y(j,ii_index) = -dln_BOp_pi2_i[1];
     d_dln_BOp_pi2z(j,ii_index) = -dln_BOp_pi2_i[2];
 
     d_dBOpx(i,jj_index) = dBOp_i[0];
     d_dBOpy(i,jj_index) = dBOp_i[1];
     d_dBOpz(i,jj_index) = dBOp_i[2];
 
     d_dBOpx(j,ii_index) = -dBOp_i[0];
     d_dBOpy(j,ii_index) = -dBOp_i[1];
     d_dBOpz(j,ii_index) = -dBOp_i[2];
 
     // the stored total and sigma bond orders are shifted down by bo_cut
     d_BO(i,jj_index) -= bo_cut;
     d_BO(j,ii_index) -= bo_cut;
     d_BO_s(i,jj_index) -= bo_cut;
     d_BO_s(j,ii_index) -= bo_cut;
     total_bo += d_BO(i,jj_index);
     a_total_bo[j] += d_BO(j,ii_index);
   }
   a_total_bo[i] += total_bo;
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 // For the atom at d_ilist[ii], stores the difference between its total bond
 // order and the valency / valency_val parameters of its type.
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBondOrder1, const int &ii) const {
 
   const int atom = d_ilist[ii];
   const int atom_type = type(atom);
   const F_FLOAT valency = paramssing(atom_type).valency;
 
   d_Deltap[atom] = d_total_bo[atom] - valency;
   d_Deltap_boc[atom] = d_total_bo[atom] - paramssing(atom_type).valency_val;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Variant of the non-full list builder that only fills the bond and
 // hydrogen-bond index lists (for both atoms of every accepted pair) and
 // does not store any bond-order values.
 // List slots are claimed through the per-atom counters d_bo_num / d_hb_num:
 // with plain increments when NEIGHFLAG == HALF, with Kokkos::atomic_fetch_add
 // otherwise. If a list would overflow, the matching d_resize_* flag is set
 // and the kernel returns early.
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBuildListsHalf_LessAtomics<NEIGHFLAG>, const int &ii) const {
 
   // a resize has already been requested: nothing useful can be done
   if (d_resize_bo() || d_resize_hb())
     return;
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const int itype = type(i);
   const int jnum = d_numneigh[i];
 
   F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3];
 
   int j_index,i_index;
   d_bo_first[i] = i*maxbo;
   const int bo_first_i = d_bo_first[i];
 
   int ihb = -1;
   int jhb = -1;
 
   // hb_first_i is only assigned (and only used) when ihb == 1
   int hb_first_i;
   if (cut_hbsq > 0.0) {
     ihb = paramssing(itype).p_hbond;
     if (ihb == 1) {
       d_hb_first[i] = i*maxhb;
       hb_first_i = d_hb_first[i];
     }
   }
 
   for (int jj = 0; jj < jnum; jj++) {
     int j = d_neighbors(i,jj);
     j &= NEIGHMASK;
 
     d_bo_first[j] = j*maxbo;
     // d_hb_first is only allocated by allocate_array() when cut_hbsq > 0.0,
     // and is only read below under the same condition
     if (cut_hbsq > 0.0) d_hb_first[j] = j*maxhb;
     const int jtype = type(j);
 
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
 
     // atoms with index < nlocal use the larger of the bond and hbond cutoffs
     double cutoffsq;
     if(i < nlocal) cutoffsq = MAX(cut_bosq,cut_hbsq);
     else cutoffsq = cut_bosq;
     if (rsq > cutoffsq) continue;
 
     // hbond list
     if (i < nlocal && cut_hbsq > 0.0 && (ihb == 1 || ihb == 2) && rsq <= cut_hbsq) {
       jhb = paramssing(jtype).p_hbond;
       if( ihb == 1 && jhb == 2) {
         // j goes into atom i's hbond list
         if (NEIGHFLAG == HALF) {
           j_index = hb_first_i + d_hb_num[i];
           d_hb_num[i]++;
         } else {
           j_index = hb_first_i + Kokkos::atomic_fetch_add(&d_hb_num[i],1);
         }
 
         const int jj_index = j_index - hb_first_i;
 
         if (jj_index >= maxhb) {
           d_resize_hb() = 1;
           return;
         }
 
         d_hb_list[j_index] = j;
       } else if ( j < nlocal && ihb == 2 && jhb == 1) {
         // reversed roles: i goes into atom j's hbond list
         if (NEIGHFLAG == HALF) {
           i_index = d_hb_first[j] + d_hb_num[j];
           d_hb_num[j]++;
         } else {
           i_index = d_hb_first[j] + Kokkos::atomic_fetch_add(&d_hb_num[j],1);
         }
 
         const int ii_index = i_index - d_hb_first[j];
 
         if (ii_index >= maxhb) {
           d_resize_hb() = 1;
           return;
         }
 
         d_hb_list[i_index] = i;
       }
     }
 
     // bond_list
     const F_FLOAT rij = sqrt(rsq);
     const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
     const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
     const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
     const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
     const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
     const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
     const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
     const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
     const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
 
     // each bond-order term is only evaluated when both atom types have a
     // positive radius for it; otherwise the term and its exponent are zero
     if (paramssing(itype).r_s > 0.0  && paramssing(jtype).r_s > 0.0) {
       C12 = p_bo1*pow(rij/r_s,p_bo2);
       BO_s = (1.0+bo_cut)*exp(C12);
     }
     else BO_s = C12 = 0.0;
 
     if (paramssing(itype).r_pi > 0.0  && paramssing(jtype).r_pi > 0.0) {
       C34 = p_bo3*pow(rij/r_pi,p_bo4);
       BO_pi = exp(C34);
     }
     else BO_pi = C34 = 0.0;
 
     if (paramssing(itype).r_pi2 > 0.0  && paramssing(jtype).r_pi2 > 0.0) {
       C56 = p_bo5*pow(rij/r_pi2,p_bo6);
       BO_pi2 = exp(C56);
     }
     else BO_pi2 = C56 = 0.0;
 
     // pairs whose summed bond order is below bo_cut are not stored
     BO = BO_s + BO_pi + BO_pi2;
     if (BO < bo_cut) continue;
 
     // claim one bond slot in each atom's list
     if (NEIGHFLAG == HALF) {
       j_index = bo_first_i + d_bo_num[i];
       i_index = d_bo_first[j] + d_bo_num[j];
       d_bo_num[i]++;
       d_bo_num[j]++;
     } else {
       j_index = bo_first_i + Kokkos::atomic_fetch_add(&d_bo_num[i],1);
       i_index = d_bo_first[j] + Kokkos::atomic_fetch_add(&d_bo_num[j],1);
     }
 
     const int jj_index = j_index - bo_first_i;
     const int ii_index = i_index - d_bo_first[j];
 
     if (jj_index >= maxbo || ii_index >= maxbo) {
       d_resize_bo() = 1;
       return;
     }
 
     d_bo_list[j_index] = j;
     d_bo_list[i_index] = i;
   }
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 // For the atom at d_ilist[ii], walks its already-built bond list and computes
 // the uncorrected bond orders and their derivatives for each entry, then
 // updates the atom's total bond order and the Deltap / Deltap_boc values.
 // Only row i of the per-bond arrays is written.
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBondOrder1_LessAtomics, const int &ii) const {
 
   F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const int itype = type(i);
 
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   F_FLOAT total_bo = 0.0;
 
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
     const F_FLOAT rij = sqrt(rsq);
     const int jtype = type(j);
     const int j_index = jj - j_start;
 
     // calculate uncorrected BO and total bond order
 
     const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
     const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
     const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
     const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
     const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
     const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
     const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
     const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
     const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
 
     // each bond-order term is only evaluated when both atom types have a
     // positive radius for it; otherwise the term and its exponent are zero
     if (paramssing(itype).r_s > 0.0  && paramssing(jtype).r_s > 0.0) {
       C12 = p_bo1*pow(rij/r_s,p_bo2);
       BO_s = (1.0+bo_cut)*exp(C12);
     }
     else BO_s = C12 = 0.0;
 
     if (paramssing(itype).r_pi > 0.0  && paramssing(jtype).r_pi > 0.0) {
       C34 = p_bo3*pow(rij/r_pi,p_bo4);
       BO_pi = exp(C34);
     }
     else BO_pi = C34 = 0.0;
 
     if (paramssing(itype).r_pi2 > 0.0  && paramssing(jtype).r_pi2 > 0.0) {
       C56 = p_bo5*pow(rij/r_pi2,p_bo6);
       BO_pi2 = exp(C56);
     }
     else BO_pi2 = C56 = 0.0;
 
     BO = BO_s + BO_pi + BO_pi2;
     if (BO < bo_cut) continue;
 
     d_BO(i,j_index) = BO;
     // store the sigma contribution (previously the total BO was stored here,
     // unlike in the other list-building kernels)
     d_BO_s(i,j_index) = BO_s;
     d_BO_pi(i,j_index) = BO_pi;
     d_BO_pi2(i,j_index) = BO_pi2;
 
     F_FLOAT Cln_BOp_s = p_bo2 * C12 / rij / rij;
     F_FLOAT Cln_BOp_pi = p_bo4 * C34 / rij / rij;
     F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 / rij / rij;
 
     if (nlocal == 0)
       Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
 
     for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
     for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
     for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
     for (int d = 0; d < 3; d++) d_dDeltap_self(i,d) += dBOp_i[d];
 
     d_dln_BOp_pix(i,j_index) = dln_BOp_pi_i[0];
     d_dln_BOp_piy(i,j_index) = dln_BOp_pi_i[1];
     d_dln_BOp_piz(i,j_index) = dln_BOp_pi_i[2];
 
     d_dln_BOp_pi2x(i,j_index) = dln_BOp_pi2_i[0];
     d_dln_BOp_pi2y(i,j_index) = dln_BOp_pi2_i[1];
     d_dln_BOp_pi2z(i,j_index) = dln_BOp_pi2_i[2];
 
     d_dBOpx(i,j_index) = dBOp_i[0];
     d_dBOpy(i,j_index) = dBOp_i[1];
     d_dBOpz(i,j_index) = dBOp_i[2];
 
     // the stored total and sigma bond orders are shifted down by bo_cut
     d_BO(i,j_index) -= bo_cut;
     d_BO_s(i,j_index) -= bo_cut;
     total_bo += d_BO(i,j_index);
   }
   d_total_bo[i] += total_bo;
 
   // deviations of the total bond order from the type's valency parameters
   const F_FLOAT val_i = paramssing(itype).valency;
   d_Deltap[i] = d_total_bo[i] - val_i;
   d_Deltap_boc[i] = d_total_bo[i] - paramssing(itype).valency_val;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // For the atom at d_ilist[ii], applies the bond-order corrections to every
 // entry of its bond list: rescales d_BO / d_BO_pi / d_BO_pi2 / d_BO_s in
 // place, fills the C*dbo, C*dbopi and C*dbopi2 coefficient arrays, zeroes the
 // Cdbo* and CdDelta accumulators, and recomputes d_total_bo[i] from the
 // corrected bond orders.
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBondOrder2, const int &ii) const {
 
   F_FLOAT delij[3];
   F_FLOAT exp_p1i, exp_p2i, exp_p1j, exp_p2j, f1, f2, f3, u1_ij, u1_ji, Cf1A_ij, Cf1B_ij, Cf1_ij, Cf1_ji;
   F_FLOAT f4, f5, exp_f4, exp_f5, f4f5, Cf45_ij, Cf45_ji;
   F_FLOAT A0_ij, A1_ij, A2_ij, A3_ij, A2_ji, A3_ji;
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
 
   const F_FLOAT val_i = paramssing(itype).valency;
 
   // the total bond order is rebuilt from the corrected values below
   d_total_bo[i] = 0.0;
   F_FLOAT total_bo = 0.0;
 
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     // NOTE(review): delij, rsq and rij are computed but not used later in this loop
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
     const F_FLOAT rij = sqrt(rsq);
     const int jtype = type(j);
     const int j_index = jj - j_start;
     const int i_index = maxbo+j_index;
 
     // calculate corrected BO and total bond order
 
     const F_FLOAT val_j = paramssing(jtype).valency;
     const F_FLOAT ovc = paramstwbp(itype,jtype).ovc;
     const F_FLOAT v13cor = paramstwbp(itype,jtype).v13cor;
     const F_FLOAT p_boc3 = paramstwbp(itype,jtype).p_boc3;
     const F_FLOAT p_boc4 = paramstwbp(itype,jtype).p_boc4;
     const F_FLOAT p_boc5 = paramstwbp(itype,jtype).p_boc5;
 
     // both corrections switched off for this type pair: bond orders are
     // left unchanged and only the coefficients are set
     if (ovc < 0.001 && v13cor < 0.001) {
       d_C1dbo(i,j_index) = 1.0;
       d_C2dbo(i,j_index) = 0.0;
       d_C3dbo(i,j_index) = 0.0;
       d_C1dbopi(i,j_index) = d_BO_pi(i,j_index);
       d_C2dbopi(i,j_index) = 0.0;
       d_C3dbopi(i,j_index) = 0.0;
       d_C4dbopi(i,j_index) = 0.0;
       // NOTE(review): this assigns from d_BO_pi, not d_BO_pi2 -- possibly a
       // copy/paste slip; confirm against the reference implementation
       d_C1dbopi2(i,j_index) = d_BO_pi(i,j_index);
       d_C2dbopi2(i,j_index) = 0.0;
       d_C3dbopi2(i,j_index) = 0.0;
       d_C4dbopi2(i,j_index) = 0.0;
     } else {
       // f1 correction term, built from Deltap of both atoms; active when ovc >= 0.001
       if (ovc >= 0.001) {
         exp_p1i = exp(-p_boc1 * d_Deltap[i]);
         exp_p2i = exp(-p_boc2 * d_Deltap[i]);
         exp_p1j = exp(-p_boc1 * d_Deltap[j]);
         exp_p2j = exp(-p_boc2 * d_Deltap[j]);
 
         f2 = exp_p1i + exp_p1j;
         f3 = -1.0/p_boc2*log(0.5*(exp_p2i+exp_p2j));
         f1 = 0.5 * ((val_i + f2)/(val_i + f2 + f3) + (val_j + f2)/(val_j + f2 + f3));
         u1_ij = val_i + f2 + f3;
         u1_ji = val_j + f2 + f3;
         Cf1A_ij = 0.5 * f3 * (1.0/(u1_ij*u1_ij)+1.0/(u1_ji*u1_ji));
         Cf1B_ij = -0.5 * ((u1_ij - f3)/(u1_ij*u1_ij)+(u1_ji - f3)/(u1_ji*u1_ji));
         Cf1_ij = 0.5 * (-p_boc1 * exp_p1i / u1_ij - ((val_i+f2) / (u1_ij*u1_ij)) *
                        (-p_boc1 * exp_p1i + exp_p2i / (exp_p2i + exp_p2j)) +
                         -p_boc1 * exp_p1i / u1_ji - ((val_j+f2) / (u1_ji*u1_ji)) *
                        (-p_boc1 * exp_p1i + exp_p2i / (exp_p2i + exp_p2j)));
         Cf1_ji = -Cf1A_ij * p_boc1 * exp_p1j + Cf1B_ij * exp_p2j / ( exp_p2i + exp_p2j );
       } else {
         f1 = 1.0;
         Cf1_ij = Cf1_ji = 0.0;
       }
 
       // f4/f5 correction terms, built from Deltap_boc of both atoms; active
       // when v13cor >= 0.001
       if (v13cor >= 0.001) {
         exp_f4 =exp(-(p_boc4*(d_BO(i,j_index)*d_BO(i,j_index))-d_Deltap_boc[i])*p_boc3+p_boc5);
         exp_f5 =exp(-(p_boc4*(d_BO(i,j_index)*d_BO(i,j_index))-d_Deltap_boc[j])*p_boc3+p_boc5);
         f4 = 1. / (1. + exp_f4);
         f5 = 1. / (1. + exp_f5);
         f4f5 = f4 * f5;
 
         Cf45_ij = -f4 * exp_f4;
         Cf45_ji = -f5 * exp_f5;
       } else {
         f4 = f5 = f4f5 = 1.0;
         Cf45_ij = Cf45_ji = 0.0;
       }
 
       A0_ij = f1 * f4f5;
       A1_ij = -2 * p_boc3 * p_boc4 * d_BO(i,j_index) * (Cf45_ij + Cf45_ji);
       A2_ij = Cf1_ij / f1 + p_boc3 * Cf45_ij;
       A2_ji = Cf1_ji / f1 + p_boc3 * Cf45_ji;
       A3_ij = A2_ij + Cf1_ij / f1;
       A3_ji = A2_ji + Cf1_ji / f1;
 
       // rescale the stored bond orders; the sigma part is recomputed as
       // total minus the two pi contributions
       d_BO(i,j_index) = d_BO(i,j_index) * A0_ij;
       d_BO_pi(i,j_index) = d_BO_pi(i,j_index) * A0_ij * f1;
       d_BO_pi2(i,j_index) = d_BO_pi2(i,j_index) * A0_ij * f1;
       d_BO_s(i,j_index) = d_BO(i,j_index)-(d_BO_pi(i,j_index)+d_BO_pi2(i,j_index));
 
       d_C1dbo(i,j_index) = A0_ij + d_BO(i,j_index) * A1_ij;
       d_C2dbo(i,j_index) = d_BO(i,j_index) * A2_ij;
       d_C3dbo(i,j_index) = d_BO(i,j_index) * A2_ji;
 
       d_C1dbopi(i,j_index) = f1*f1*f4*f5;
       d_C2dbopi(i,j_index) = d_BO_pi(i,j_index) * A1_ij;
       d_C3dbopi(i,j_index) = d_BO_pi(i,j_index) * A3_ij;
       d_C4dbopi(i,j_index) = d_BO_pi(i,j_index) * A3_ji;
 
       d_C1dbopi2(i,j_index) = f1*f1*f4*f5;
       d_C2dbopi2(i,j_index) = d_BO_pi2(i,j_index) * A1_ij;
       d_C3dbopi2(i,j_index) = d_BO_pi2(i,j_index) * A3_ij;
       d_C4dbopi2(i,j_index) = d_BO_pi2(i,j_index) * A3_ji;
     }
 
     // values below 1e-10 are flushed to exactly zero
     if(d_BO(i,j_index) < 1e-10) d_BO(i,j_index) = 0.0;
     if(d_BO_s(i,j_index) < 1e-10) d_BO_s(i,j_index) = 0.0;
     if(d_BO_pi(i,j_index) < 1e-10) d_BO_pi(i,j_index) = 0.0;
     if(d_BO_pi2(i,j_index) < 1e-10) d_BO_pi2(i,j_index) = 0.0;
 
     total_bo += d_BO(i,j_index);
 
     // reset accumulators; note that entries of row j and d_CdDelta[j] are
     // also zeroed from here
     d_Cdbo(i,j_index) = 0.0;
     d_Cdbopi(i,j_index) = 0.0;
     d_Cdbopi2(i,j_index) = 0.0;
     d_Cdbo(j,i_index) = 0.0;
     d_Cdbopi(j,i_index) = 0.0;
     d_Cdbopi2(j,i_index) = 0.0;
 
     d_CdDelta[j] = 0.0;
   }
   d_CdDelta[i] = 0.0;
   d_total_bo[i] += total_bo;
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 // For the atom at d_ilist[ii], derives the per-atom quantities that depend on
 // the total bond order: Delta, Delta_boc, Delta_lp, dDelta_lp and
 // Delta_lp_temp. Also clears columns 1 and 2 of d_sum_ovun for this atom.
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxBondOrder3, const int &ii) const {
 // bot part of BO()
 
   const int i = d_ilist[ii];
   const int itype = type(i);
 
   // differences between the total bond order and the type's valency parameters
   d_Delta[i] = d_total_bo[i] - paramssing(itype).valency;
   const F_FLOAT Delta_e = d_total_bo[i] - paramssing(itype).valency_e;
   d_Delta_boc[i] = d_total_bo[i] - paramssing(itype).valency_boc;
 
   // vlpex is Delta_e minus twice the truncated value of Delta_e/2
   const int half_trunc = (int)(Delta_e/2.0);
   const F_FLOAT vlpex = Delta_e - 2.0 * half_trunc;
   const F_FLOAT explp1 = exp(-gp[15] * SQR(2.0 + vlpex));
   const F_FLOAT nlp = explp1 - half_trunc;
 
   d_Delta_lp[i] = paramssing(itype).nlp_opt - nlp;
   d_dDelta_lp[i] = 2.0 * gp[15] * explp1 * (2.0 + vlpex);
 
   // types with mass above 21.0 use a value derived from the valency
   // parameters instead of nlp
   const F_FLOAT nlp_temp = (paramssing(itype).mass > 21.0)
     ? 0.5 * (paramssing(itype).valency_e - paramssing(itype).valency)
     : nlp;
   d_Delta_lp_temp[i] = paramssing(itype).nlp_opt - nlp_temp;
 
   d_sum_ovun(i,1) = 0.0;
   d_sum_ovun(i,2) = 0.0;
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti1<NEIGHFLAG,EVFLAG>, const int &ii) const {
   // Walks the bond list of atom i and adds two per-atom sums to
   // d_sum_ovun(i,1) and d_sum_ovun(i,2).
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   const F_FLOAT imass = paramssing(itype).mass;
 
   // dfvl removes the d_Delta_lp_temp term when the central type has mass > 21
   const F_FLOAT dfvl = (imass > 21.0) ? 0.0 : 1.0;
 
   const int bond_begin = d_bo_first[i];
   const int nbonds = d_bo_num[i];
 
   F_FLOAT acc_ovun1 = 0.0;
   F_FLOAT acc_ovun2 = 0.0;
 
   // j_index is the position of the bond inside row i of the bond-order arrays
   for (int j_index = 0; j_index < nbonds; j_index++) {
     const int j = d_bo_list[bond_begin + j_index] & NEIGHMASK;
     const int jtype = type(j);
 
     acc_ovun1 += paramstwbp(itype,jtype).p_ovun1 * paramstwbp(itype,jtype).De_s * d_BO(i,j_index);
     acc_ovun2 += (d_Delta[j] - dfvl * d_Delta_lp_temp[j]) * (d_BO_pi(i,j_index) + d_BO_pi2(i,j_index));
   }
 
   d_sum_ovun(i,1) += acc_ovun1;
   d_sum_ovun(i,2) += acc_ovun2;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti2<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
   // Per-atom kernel for the lone-pair, over-coordination and
   // under-coordination terms of atom i = d_ilist[ii].
   //   ii : index into d_ilist
   //   ev : accumulator; ereax[0], ereax[1], ereax[2] receive the lone-pair,
   //        over- and under-coordination energies when eflag is set
   // Derivative coefficients are added to CdDelta and to row i of
   // d_Cdbo / d_Cdbopi / d_Cdbopi2.
 
   // CdDelta entries of neighbor atoms j are updated below, so it is accessed
   // through a view whose memory traits are selected by AtomicF<NEIGHFLAG>.
   // The bond-order coefficient arrays are only written in row i here and are
   // therefore used directly.
   Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta;
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   const F_FLOAT imass = paramssing(itype).mass;
   const F_FLOAT val_i = paramssing(itype).valency;
 
   // dfvl switches the d_Delta_lp_temp / d_dDelta_lp terms off for types
   // with mass > 21
   F_FLOAT dfvl;
   if (imass > 21.0) dfvl = 0.0;
   else dfvl = 1.0;
 
   F_FLOAT e_lp, e_ov, e_un;
   F_FLOAT CEover1, CEover2, CEover3, CEover4;
   F_FLOAT CEunder1, CEunder2, CEunder3, CEunder4;
   const F_FLOAT p_lp3 = gp[5];
   const F_FLOAT p_ovun2 = paramssing(itype).p_ovun2;
   const F_FLOAT p_ovun3 = gp[32];
   const F_FLOAT p_ovun4 = gp[31];
   const F_FLOAT p_ovun5 = paramssing(itype).p_ovun5;
   const F_FLOAT p_ovun6 = gp[6];
   const F_FLOAT p_ovun7 = gp[8];
   const F_FLOAT p_ovun8 = gp[9];
 
   // lone pair
   const F_FLOAT p_lp2 = paramssing(itype).p_lp2;
   const F_FLOAT expvd2 = exp( -75 * d_Delta_lp[i]);
   const F_FLOAT inv_expvd2 = 1.0 / (1.0+expvd2);
 
   int numbonds = d_bo_num[i];
 
   // the lone-pair energy and its CdDelta contribution are only applied to
   // atoms that have at least one bond
   e_lp = 0.0;
   if (numbonds > 0)
     e_lp = p_lp2 * d_Delta_lp[i] * inv_expvd2;
   const F_FLOAT dElp = p_lp2 * inv_expvd2 + 75.0 * p_lp2 * d_Delta_lp[i] * expvd2 * inv_expvd2*inv_expvd2;
   const F_FLOAT CElp = dElp * d_dDelta_lp[i];
 
   if (numbonds > 0)
     a_CdDelta[i] += CElp;
 
   if (eflag) ev.ereax[0] += e_lp;
   //if (vflag_either || eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,i,e_lp,0.0,0.0,0.0,0.0);
   //if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,i,e_lp);
 
   // over coordination
   const F_FLOAT exp_ovun1 = p_ovun3 * exp( p_ovun4 * d_sum_ovun(i,2) );
   const F_FLOAT inv_exp_ovun1 = 1.0 / (1 + exp_ovun1);
   const F_FLOAT Delta_lpcorr  = d_Delta[i] - (dfvl * d_Delta_lp_temp[i]) * inv_exp_ovun1;
 
   const F_FLOAT exp_ovun2 = exp( p_ovun2 * Delta_lpcorr );
   const F_FLOAT inv_exp_ovun2 = 1.0 / (1.0 + exp_ovun2);
   // the 1e-8 offset keeps the denominator away from exactly zero
   const F_FLOAT DlpVi = 1.0 / (Delta_lpcorr + val_i + 1e-8);
 
   CEover1 = Delta_lpcorr * DlpVi * inv_exp_ovun2;
   e_ov = d_sum_ovun(i,1) * CEover1;
 
   if (eflag) ev.ereax[1] += e_ov;
   //if (eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,i,e_ov,0.0,0.0,0.0,0.0);
   //if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,i,e_ov);
 
   CEover2 = d_sum_ovun(i,1) * DlpVi * inv_exp_ovun2 *
     (1.0 - Delta_lpcorr * ( DlpVi + p_ovun2 * exp_ovun2 * inv_exp_ovun2 ));
   CEover3 = CEover2 * (1.0 - dfvl * d_dDelta_lp[i] * inv_exp_ovun1 );
   CEover4 = CEover2 * (dfvl * d_Delta_lp_temp[i]) * p_ovun4 * exp_ovun1 * SQR(inv_exp_ovun1);
 
   // under coordination
 
   const F_FLOAT exp_ovun2n = 1.0 / exp_ovun2;
   const F_FLOAT exp_ovun6 = exp( p_ovun6 * Delta_lpcorr );
   const F_FLOAT exp_ovun8 = p_ovun7 * exp(p_ovun8 * d_sum_ovun(i,2));
   const F_FLOAT inv_exp_ovun2n = 1.0 / (1.0 + exp_ovun2n);
   const F_FLOAT inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
   // like the lone-pair term, the under-coordination energy is zero for
   // atoms without bonds
   e_un = 0;
   if (numbonds > 0)
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
   if (eflag) ev.ereax[2] += e_un;
   //if (eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,i,e_un,0.0,0.0,0.0,0.0);
   //if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,i,e_un);
 
   CEunder1 = inv_exp_ovun2n *
     ( p_ovun5 * p_ovun6 * exp_ovun6 * inv_exp_ovun8 + p_ovun2 * e_un * exp_ovun2n );
   CEunder2 = -e_un * p_ovun8 * exp_ovun8 * inv_exp_ovun8;
   CEunder3 = CEunder1 * (1.0 - dfvl * d_dDelta_lp[i] * inv_exp_ovun1);
   CEunder4 = CEunder1 * (dfvl * d_Delta_lp_temp[i]) *
       p_ovun4 * exp_ovun1 * inv_exp_ovun1 * inv_exp_ovun1 + CEunder2;
 
   // per-atom energy tally uses the sum of the three single-atom terms
   const F_FLOAT eng_tmp = e_lp + e_ov + e_un;
   if (eflag_atom) this->template e_tally_single<NEIGHFLAG>(ev,i,eng_tmp);
 
   // multibody forces
 
   a_CdDelta[i] += CEover3;
   if (numbonds > 0)
     a_CdDelta[i] += CEunder3;
 
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   // contributions to CdDelta[i] from the bond loop are gathered locally and
   // flushed once after the loop
   F_FLOAT CdDelta_i = 0.0;
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int jtype = type(j);
     const F_FLOAT jmass = paramssing(jtype).mass;
     const int j_index = jj - j_start;
     const F_FLOAT De_s = paramstwbp(itype,jtype).De_s;
 
     // multibody lone pair: correction for C2
     // (applied only when both types have mass exactly 12.0)
     if (p_lp3 > 0.001 && imass == 12.0 && jmass == 12.0) {
       const F_FLOAT Di = d_Delta[i];
       const F_FLOAT vov3 = d_BO(i,j_index) - Di - 0.040*pow(Di,4.0);
       if (vov3 > 3.0) {
         const F_FLOAT e_lph = p_lp3 * (vov3-3.0)*(vov3-3.0);
         const F_FLOAT deahu2dbo = 2.0 * p_lp3 * (vov3 - 3.0);
         const F_FLOAT deahu2dsbo = 2.0 * p_lp3 * (vov3 - 3.0) * (-1.0 - 0.16 * pow(Di,3.0));
         d_Cdbo(i,j_index) += deahu2dbo;
         CdDelta_i += deahu2dsbo;
 
         if (eflag) ev.ereax[0] += e_lph;
         if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,j,e_lph);
       }
     }
 
     // over/under coordination forces merged together
     const F_FLOAT p_ovun1 = paramstwbp(itype,jtype).p_ovun1;
     a_CdDelta[j] += (CEover4 + CEunder4) * (1.0 - dfvl * d_dDelta_lp[j]) * (d_BO_pi(i,j_index) + d_BO_pi2(i,j_index));
     d_Cdbo(i,j_index) += CEover1 * p_ovun1 * De_s;
     d_Cdbopi(i,j_index) += (CEover4 + CEunder4) * (d_Delta[j] - dfvl*d_Delta_lp_temp[j]);
     d_Cdbopi2(i,j_index) += (CEover4 + CEunder4) * (d_Delta[j] - dfvl*d_Delta_lp_temp[j]);
   }
   a_CdDelta[i] += CdDelta_i;
 
 }
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti2<NEIGHFLAG,EVFLAG>, const int &ii) const {
   // Overload without an accumulator argument: run the three-argument
   // kernel with a local EV_FLOAT_REAX whose contents are dropped on return.
   typedef PairReaxComputeMulti2<NEIGHFLAG,EVFLAG> kernel_tag;
   EV_FLOAT_REAX discarded_ev;
   this->template operator()<NEIGHFLAG,EVFLAG>(kernel_tag(), ii, discarded_ev);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Per-atom kernel for the three-body terms centered on atom i = d_ilist[ii].
 // For every pair (j,k) of bonded neighbors of i it evaluates the angle,
 // penalty and "ConjAngle" energies (added to ev.ereax[3], ev.ereax[4] and
 // ev.ereax[5] when eflag is set), accumulates the matching derivative
 // coefficients into Cdbo / Cdbopi / Cdbopi2 / CdDelta, and applies the
 // direct forces coming from the derivative of cos(theta).
 //   ii : index into d_ilist
 //   ev : energy/virial accumulator passed on to the tally helpers
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeAngular<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   // Views onto f, d_Cdbo and d_CdDelta whose memory traits are selected by
   // AtomicF<NEIGHFLAG>; entries belonging to atoms other than i are updated
   // through them below.
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
   Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbo = d_Cdbo;
   Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta;
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
 
   F_FLOAT temp, temp_bo_jt, pBOjt7;
   F_FLOAT p_val1, p_val2, p_val3, p_val4, p_val5;
   F_FLOAT p_val6, p_val7, p_val8, p_val9, p_val10;
   F_FLOAT p_pen1, p_pen2, p_pen3, p_pen4;
   F_FLOAT p_coa1, p_coa2, p_coa3, p_coa4;
   F_FLOAT trm8, expval6, expval7, expval2theta, expval12theta, exp3ij, exp3jk;
   F_FLOAT exp_pen2ij, exp_pen2jk, exp_pen3, exp_pen4, trm_pen34, exp_coa2;
   F_FLOAT dSBO1, dSBO2, SBO, SBO2, CSBO2, SBOp, prod_SBO, vlpadj;
   F_FLOAT CEval1, CEval2, CEval3, CEval4, CEval5, CEval6, CEval7, CEval8;
   F_FLOAT CEpen1, CEpen2, CEpen3;
   F_FLOAT e_ang, e_coa, e_pen;
   F_FLOAT CEcoa1, CEcoa2, CEcoa3, CEcoa4, CEcoa5;
   F_FLOAT Cf7ij, Cf7jk, Cf8j, Cf9j;
   F_FLOAT f7_ij, f7_jk, f8_Dj, f9_Dj;
   F_FLOAT Ctheta_0, theta_0, theta_00, theta, cos_theta, sin_theta;
   F_FLOAT BOA_ij, BOA_ik, rij, bo_ij, bo_ik;
   F_FLOAT dcos_theta_di[3], dcos_theta_dj[3], dcos_theta_dk[3];
   F_FLOAT eng_tmp, fi_tmp[3], fj_tmp[3], fk_tmp[3];
   F_FLOAT delij[3], delik[3];
 
   // parameters read from the global table gp
   p_val6 = gp[14];
   p_val8 = gp[33];
   p_val9 = gp[16];
   p_val10 = gp[17];
 
   p_pen2 = gp[19];
   p_pen3 = gp[20];
   p_pen4 = gp[21];
 
   p_coa2 = gp[2];
   p_coa3 = gp[38];
   p_coa4 = gp[30];
 
   // parameters of the central atom's type
   p_val3 = paramssing(itype).p_val3;
   p_val5 = paramssing(itype).p_val5;
 
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   const F_FLOAT Delta_val = d_total_bo[i] - paramssing(itype).valency_val;
 
   SBOp = 0.0, prod_SBO = 1.0;
 
   // First pass over the bonds of i:
   //   SBOp     = sum of (BO_pi + BO_pi2)
   //   prod_SBO = product of exp(-BO^8)   (temp is squared three times)
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int j_index = jj - j_start;
     bo_ij = d_BO(i,j_index);
 
     SBOp += (d_BO_pi(i,j_index) + d_BO_pi2(i,j_index));
     temp = SQR(bo_ij);
     temp *= temp;
     temp *= temp;
     prod_SBO *= exp( -temp );
   }
 
   // lone-pair quantities of atom i, recomputed locally from d_total_bo[i]
   const F_FLOAT Delta_e = d_total_bo[i] - paramssing(itype).valency_e;
   const F_FLOAT vlpex = Delta_e - 2.0 * (int)(Delta_e/2.0);
   const F_FLOAT explp1 = exp(-gp[15] * SQR(2.0 + vlpex));
   const F_FLOAT nlp = explp1 - (int)(Delta_e / 2.0);
   // the lone-pair adjustment only enters SBO when vlpex is negative
   if( vlpex >= 0.0 ){
     vlpadj = 0.0;
     dSBO2 = prod_SBO - 1.0;
   } else{
     vlpadj = nlp;
     dSBO2 = (prod_SBO - 1.0) * (1.0 - p_val8 * d_dDelta_lp[i]);
   }
 
   SBO = SBOp + (1.0 - prod_SBO) * (-d_Delta_boc[i] - p_val8 * vlpadj);
   dSBO1 = -8.0 * prod_SBO * ( d_Delta_boc[i] + p_val8 * vlpadj );
 
   // Piecewise map SBO -> SBO2 (with CSBO2 the matching derivative factor):
   // clamped to 0 for SBO <= 0 and to 2 for SBO >= 2, power laws in between.
   if( SBO <= 0.0 ) {
     SBO2 = 0.0;
     CSBO2 = 0.0;
   } else if( SBO > 0.0 && SBO <= 1.0 ) {
     SBO2 = pow( SBO, p_val9 );
     CSBO2 = p_val9 * pow( SBO, p_val9 - 1.0 );
   } else if( SBO > 1.0 && SBO < 2.0 ) {
     SBO2 = 2.0 - pow( 2.0-SBO, p_val9 );
     CSBO2 = p_val9 * pow( 2.0 - SBO, p_val9 - 1.0 );
   } else {
     SBO2 = 2.0;
     CSBO2 = 0.0;
   }
   expval6 = exp( p_val6 * d_Delta_boc[i] );
 
   // Local accumulators for atom i; flushed once after the outer loop.
   F_FLOAT CdDelta_i = 0.0;
   F_FLOAT fitmp[3],fjtmp[3];
   for (int j = 0; j < 3; j++) fitmp[j] = 0.0;
 
   // Outer loop: first neighbor j of the angle j-i-k.
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int j_index = jj - j_start;
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsqij = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
     rij = sqrt(rsqij);
     bo_ij = d_BO(i,j_index);
     // column used for the mirrored a_Cdbo(j,.) / a_Cdbo(k,.) updates below
     // NOTE(review): the same i_index (derived from j_index) is also used for
     // the k row further down -- confirm this is intended.
     const int i_index = maxbo+j_index;
 
     // skip bonds at or below the three-body cutoff, and pairs where both
     // i and j have index >= nlocal (presumably both ghost atoms -- confirm)
     BOA_ij = bo_ij - thb_cut;
     if (BOA_ij <= 0.0) continue;
     if (i >= nlocal && j >= nlocal) continue;
 
     const int jtype = type(j);
 
     // Local accumulators for atom j; flushed after the inner kk loop.
     F_FLOAT CdDelta_j = 0.0;
     for (int k = 0; k < 3; k++) fjtmp[k] = 0.0;
 
     // Inner loop starts at jj+1, so each unordered pair (j,k) is visited once.
     for (int kk = jj+1; kk < j_end; kk++ ) {
     //for (int kk = j_start; kk < j_end; kk++ ) {
       int k = d_bo_list[kk];
       k &= NEIGHMASK;
       if (k == j) continue;
 
       const int k_index = kk - j_start;
       delik[0] = x(k,0) - xtmp;
       delik[1] = x(k,1) - ytmp;
       delik[2] = x(k,2) - ztmp;
       const F_FLOAT rsqik = delik[0]*delik[0] + delik[1]*delik[1] + delik[2]*delik[2];
       const F_FLOAT rik = sqrt(rsqik);
       bo_ik = d_BO(i,k_index);
       BOA_ik   = bo_ik - thb_cut;
 
       // both bonds must exceed thb_cut and their product must exceed thb_cutsq
       if (BOA_ik <= 0.0 || bo_ij <= thb_cut || bo_ik <= thb_cut || bo_ij * bo_ik <= thb_cutsq) continue;
 
       const int ktype = type(k);
 
       // theta and derivatives
 
       // cos_theta is clamped to [-1,1] before acos to absorb rounding error
       cos_theta = (delij[0]*delik[0]+delij[1]*delik[1]+delij[2]*delik[2])/(rij*rik);
       if( cos_theta > 1.0 ) cos_theta  = 1.0;
       if( cos_theta < -1.0 ) cos_theta  = -1.0;
       theta = acos(cos_theta);
 
       const F_FLOAT inv_dists = 1.0 / (rij * rik);
       const F_FLOAT Cdot_inv3 = cos_theta * inv_dists * inv_dists;
 
       // derivatives of cos_theta with respect to the positions of i, j and k
       for( int t = 0; t < 3; t++ ) {
         dcos_theta_di[t] = -(delik[t] + delij[t]) * inv_dists + Cdot_inv3 * (rsqik * delij[t] + rsqij * delik[t]);
         dcos_theta_dj[t] = delik[t] * inv_dists - Cdot_inv3 * rsqik * delij[t];
         dcos_theta_dk[t] = delij[t] * inv_dists - Cdot_inv3 * rsqij * delik[t];
       }
 
       // sin_theta is floored at 1e-5 because CEval8 divides by it
       sin_theta = sin(theta);
       if (sin_theta < 1.0e-5) sin_theta = 1.0e-5;
       p_val1 = paramsthbp(jtype,itype,ktype).p_val1;
 
       // type triplets with |p_val1| <= 0.001 contribute nothing
       if (fabs(p_val1) <= 0.001) continue;
 
       // ANGLE ENERGY
 
       p_val1 = paramsthbp(jtype,itype,ktype).p_val1;
       p_val2 = paramsthbp(jtype,itype,ktype).p_val2;
       p_val4 = paramsthbp(jtype,itype,ktype).p_val4;
       p_val7 = paramsthbp(jtype,itype,ktype).p_val7;
       theta_00 = paramsthbp(jtype,itype,ktype).theta_00;
 
       // f7: bond-order factors of the two bonds, Cf7*: their derivative factors
       exp3ij = exp( -p_val3 * pow( BOA_ij, p_val4 ) );
       f7_ij = 1.0 - exp3ij;
       Cf7ij = p_val3 * p_val4 * pow( BOA_ij, p_val4 - 1.0 ) * exp3ij;
       exp3jk = exp( -p_val3 * pow( BOA_ik, p_val4 ) );
       f7_jk = 1.0 - exp3jk;
       Cf7jk = p_val3 * p_val4 * pow( BOA_ik, p_val4 - 1.0 ) * exp3jk;
       // f8: factor depending on d_Delta_boc[i], Cf8j: its derivative factor
       expval7 = exp( -p_val7 * d_Delta_boc[i] );
       trm8 = 1.0 + expval6 + expval7;
       f8_Dj = p_val5 - ( (p_val5 - 1.0) * (2.0 + expval6) / trm8 );
       Cf8j = ((1.0 - p_val5) / (trm8*trm8)) *
        (p_val6 * expval6 * trm8 - (2.0 + expval6) * ( p_val6*expval6 - p_val7*expval7));
       // theta_0 is built from the 180 / theta_00 values and then scaled by
       // constPI/180 (degrees to radians, assuming constPI is pi)
       theta_0 = 180.0 - theta_00 * (1.0 - exp(-p_val10 * (2.0 - SBO2)));
       theta_0 = theta_0*constPI/180.0;
 
       expval2theta  = exp( -p_val2 * (theta_0-theta)*(theta_0-theta) );
       if( p_val1 >= 0 )
         expval12theta = p_val1 * (1.0 - expval2theta);
       else // To avoid linear Me-H-Me angles (6/6/06)
         expval12theta = p_val1 * -expval2theta;
 
       // derivative coefficients of the angle energy
       CEval1 = Cf7ij * f7_jk * f8_Dj * expval12theta;
       CEval2 = Cf7jk * f7_ij * f8_Dj * expval12theta;
       CEval3 = Cf8j  * f7_ij * f7_jk * expval12theta;
       CEval4 = -2.0 * p_val1 * p_val2 * f7_ij * f7_jk * f8_Dj * expval2theta * (theta_0 - theta);
       Ctheta_0 = p_val10 * theta_00*constPI/180.0 * exp( -p_val10 * (2.0 - SBO2) );
       CEval5 = -CEval4 * Ctheta_0 * CSBO2;
       CEval6 = CEval5 * dSBO1;
       CEval7 = CEval5 * dSBO2;
       CEval8 = -CEval4 / sin_theta;
 
       e_ang = f7_ij * f7_jk * f8_Dj * expval12theta;
       if (eflag) ev.ereax[3] += e_ang;
 
       // Penalty energy
 
       p_pen1 = paramsthbp(jtype,itype,ktype).p_pen1;
 
       exp_pen2ij = exp( -p_pen2 * (BOA_ij - 2.0)*(BOA_ij - 2.0) );
       exp_pen2jk = exp( -p_pen2 * (BOA_ik - 2.0)*(BOA_ik - 2.0) );
       exp_pen3 = exp( -p_pen3 * d_Delta[i] );
       exp_pen4 = exp(  p_pen4 * d_Delta[i] );
       trm_pen34 = 1.0 + exp_pen3 + exp_pen4;
       f9_Dj = (2.0 + exp_pen3 ) / trm_pen34;
       Cf9j = (-p_pen3 * exp_pen3 * trm_pen34 - (2.0 + exp_pen3) *
        (-p_pen3 * exp_pen3 + p_pen4 * exp_pen4 ) )/(trm_pen34*trm_pen34);
 
       e_pen = p_pen1 * f9_Dj * exp_pen2ij * exp_pen2jk;
       if (eflag) ev.ereax[4] += e_pen;
 
       // derivative coefficients of the penalty energy
       CEpen1 = e_pen * Cf9j / f9_Dj;
       temp   = -2.0 * p_pen2 * e_pen;
       CEpen2 = temp * (BOA_ij - 2.0);
       CEpen3 = temp * (BOA_ik - 2.0);
 
       // ConjAngle energy
 
       p_coa1 = paramsthbp(jtype,itype,ktype).p_coa1;
       exp_coa2 = exp( p_coa2 * Delta_val );
       e_coa = p_coa1 / (1. + exp_coa2) *
               exp( -p_coa3 * SQR(d_total_bo[j]-BOA_ij) ) *
               exp( -p_coa3 * SQR(d_total_bo[k]-BOA_ik) ) *
               exp( -p_coa4 * SQR(BOA_ij - 1.5) ) *
               exp( -p_coa4 * SQR(BOA_ik - 1.5) );
 
       // derivative coefficients of the ConjAngle energy
       CEcoa1 = -2 * p_coa4 * (BOA_ij - 1.5) * e_coa;
       CEcoa2 = -2 * p_coa4 * (BOA_ik - 1.5) * e_coa;
       CEcoa3 = -p_coa2 * exp_coa2 * e_coa / (1 + exp_coa2);
       CEcoa4 = -2 * p_coa3 * (d_total_bo[j]-BOA_ij) * e_coa;
       CEcoa5 = -2 * p_coa3 * (d_total_bo[k]-BOA_ik) * e_coa;
 
       if (eflag) ev.ereax[5] += e_coa;
 
       // Forces
 
       // bond-order coefficients of the i-j and i-k bonds; each value is
       // written both to row i and to a mirrored entry in the partner's row
       a_Cdbo(i,j_index) += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
       a_Cdbo(j,i_index) += (CEval1 + CEpen2 + (CEcoa1 - CEcoa4));
       a_Cdbo(i,k_index) += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
       a_Cdbo(k,i_index) += (CEval2 + CEpen3 + (CEcoa2 - CEcoa5));
 
       // i and j contributions are buffered locally; k is updated right away
       CdDelta_i += ((CEval3 + CEval7) + CEpen1 + CEcoa3);
       CdDelta_j += CEcoa4;
       a_CdDelta[k] += CEcoa5;
 
       // SBO depends on every bond of i, so each bond l of i receives
       // CEval6 * BO_il^7 in Cdbo and CEval5 in both pi coefficient arrays
       for (int ll = j_start; ll < j_end; ll++) {
         int l = d_bo_list[ll];
         l &= NEIGHMASK;
         const int l_index = ll - j_start;
 
         temp_bo_jt = d_BO(i,l_index);
         temp = temp_bo_jt * temp_bo_jt * temp_bo_jt;
         pBOjt7 = temp * temp * temp_bo_jt;
 
         a_Cdbo(i,l_index) += (CEval6 * pBOjt7);
         d_Cdbopi(i,l_index) += CEval5;
         d_Cdbopi2(i,l_index) += CEval5;
       }
 
       // direct forces: CEval8 times the derivative of cos_theta, subtracted
       // from i and j (buffered) and from k (written immediately)
       for (int d = 0; d < 3; d++) fi_tmp[d] = CEval8 * dcos_theta_di[d];
       for (int d = 0; d < 3; d++) fj_tmp[d] = CEval8 * dcos_theta_dj[d];
       for (int d = 0; d < 3; d++) fk_tmp[d] = CEval8 * dcos_theta_dk[d];
       for (int d = 0; d < 3; d++) fitmp[d] -= fi_tmp[d];
       for (int d = 0; d < 3; d++) fjtmp[d] -= fj_tmp[d];
       for (int d = 0; d < 3; d++) a_f(k,d) -= fk_tmp[d];
 
       // energy/virial tally
       if (EVFLAG) {
         eng_tmp = e_ang + e_pen + e_coa;
         //if (eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,j,eng_tmp,0.0,0.0,0.0,0.0);
         if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,j,eng_tmp);
         if (vflag_either) this->template v_tally3<NEIGHFLAG>(ev,i,j,k,fj_tmp,fk_tmp,delij,delik);
       }
 
     }
     // flush the buffered contributions for atom j
     a_CdDelta[j] += CdDelta_j;
     for (int d = 0; d < 3; d++) a_f(j,d) += fjtmp[d];
   }
   // flush the buffered contributions for atom i
   a_CdDelta[i] += CdDelta_i;
   for (int d = 0; d < 3; d++) a_f(i,d) += fitmp[d];
 }
 
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeAngular<NEIGHFLAG,EVFLAG>, const int &ii) const {
   // Overload without an accumulator argument: run the three-argument
   // kernel with a local EV_FLOAT_REAX whose contents are dropped on return.
   typedef PairReaxComputeAngular<NEIGHFLAG,EVFLAG> kernel_tag;
   EV_FLOAT_REAX discarded_ev;
   this->template operator()<NEIGHFLAG,EVFLAG>(kernel_tag(), ii, discarded_ev);
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTorsion<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
   Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta;
   Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbo = d_Cdbo;
 
   // in reaxc_torsion_angles: j = i, k = j, i = k;
 
   F_FLOAT Delta_i, Delta_j, bo_ij, bo_ik, bo_jl, BOA_ij, BOA_ik, BOA_jl;
   F_FLOAT p_tor1, p_cot1, V1, V2, V3;
   F_FLOAT exp_tor2_ij, exp_tor2_ik, exp_tor2_jl, exp_tor1, exp_tor3_DiDj, exp_tor4_DiDj, exp_tor34_inv;
   F_FLOAT exp_cot2_ij, exp_cot2_ik, exp_cot2_jl, fn10, f11_DiDj, dfn11, fn12;
   F_FLOAT theta_ijk, theta_jil, sin_ijk, sin_jil, cos_ijk, cos_jil, tan_ijk_i, tan_jil_i;
   F_FLOAT cos_omega, cos2omega, cos3omega;
   F_FLOAT CV, cmn, CEtors1, CEtors2, CEtors3, CEtors4;
   F_FLOAT CEtors5, CEtors6, CEtors7, CEtors8, CEtors9;
   F_FLOAT Cconj, CEconj1, CEconj2, CEconj3, CEconj4, CEconj5, CEconj6;
   F_FLOAT e_tor, e_con, eng_tmp;
 
   F_FLOAT delij[3], delik[3], deljl[3], dellk[3], delil[3], delkl[3];
   F_FLOAT fi_tmp[3], fj_tmp[3], fk_tmp[3], fl_tmp[3];
   F_FLOAT dcos_omega_di[3], dcos_omega_dj[3], dcos_omega_dk[3], dcos_omega_dl[3];
   F_FLOAT dcos_ijk_di[3], dcos_ijk_dj[3], dcos_ijk_dk[3], dcos_jil_di[3], dcos_jil_dj[3], dcos_jil_dk[3];
 
   F_FLOAT p_tor2 = gp[23];
   F_FLOAT p_tor3 = gp[24];
   F_FLOAT p_tor4 = gp[25];
   F_FLOAT p_cot2 = gp[27];
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   const int itag = tag(i);
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   Delta_i = d_Delta_boc[i];
 
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   F_FLOAT fitmp[3], fjtmp[3], fktmp[3];
   for(int j = 0; j < 3; j++) fitmp[j] = 0.0;
   F_FLOAT CdDelta_i = 0.0;
 
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int jtag = tag(j);
     const int jtype = type(j);
     const int j_index = jj - j_start;
 
     // skip half of the interactions
     if (itag > jtag) {
       if ((itag+jtag) % 2 == 0) continue;
     } else if (itag < jtag) {
       if ((itag+jtag) % 2 == 1) continue;
     } else {
       if (x(j,2)  < ztmp) continue;
       if (x(j,2) == ztmp && x(j,1)  < ytmp) continue;
       if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue;
     }
 
     bo_ij = d_BO(i,j_index);
     if (bo_ij < thb_cut) continue;
 
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
     const F_FLOAT rsqij = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
     const F_FLOAT rij = sqrt(rsqij);
 
     BOA_ij = bo_ij - thb_cut;
     Delta_j = d_Delta_boc[j];
     exp_tor2_ij = exp( -p_tor2 * BOA_ij );
     exp_cot2_ij = exp( -p_cot2 * SQR(BOA_ij - 1.5) );
     exp_tor3_DiDj = exp( -p_tor3 * (Delta_i + Delta_j) );
     exp_tor4_DiDj = exp( p_tor4  * (Delta_i + Delta_j) );
     exp_tor34_inv = 1.0 / (1.0 + exp_tor3_DiDj + exp_tor4_DiDj);
     f11_DiDj = (2.0 + exp_tor3_DiDj) * exp_tor34_inv;
 
     const int l_start = d_bo_first[j];
     const int l_end = l_start + d_bo_num[j];
 
     for(int k = 0; k < 3; k++) fjtmp[k] = 0.0;
     F_FLOAT CdDelta_j = 0.0;
 
     for (int kk = j_start; kk < j_end; kk++) {
       int k = d_bo_list[kk];
       k &= NEIGHMASK;
       if (k == j) continue;
       const int ktype = type(k);
       const int k_index = kk - j_start;
 
       bo_ik = d_BO(i,k_index);
       if (bo_ik < thb_cut) continue;
 
       BOA_ik = bo_ik - thb_cut;
       for (int d = 0; d < 3; d ++) delik[d] = x(k,d) - x(i,d);
       const F_FLOAT rsqik = delik[0]*delik[0] + delik[1]*delik[1] + delik[2]*delik[2];
       const F_FLOAT rik = sqrt(rsqik);
 
       cos_ijk = (delij[0]*delik[0]+delij[1]*delik[1]+delij[2]*delik[2])/(rij*rik);
       if( cos_ijk > 1.0 ) cos_ijk  = 1.0;
       if( cos_ijk < -1.0 ) cos_ijk  = -1.0;
       theta_ijk = acos(cos_ijk);
 
       // dcos_ijk
       const F_FLOAT inv_dists = 1.0 / (rij * rik);
       const F_FLOAT cos_ijk_tmp = cos_ijk / ((rij*rik)*(rij*rik));
 
       for( int d = 0; d < 3; d++ ) {
         dcos_ijk_di[d] = -(delik[d] + delij[d]) * inv_dists + cos_ijk_tmp * (rsqik * delij[d] + rsqij * delik[d]);
         dcos_ijk_dj[d] = delik[d] * inv_dists - cos_ijk_tmp * rsqik * delij[d];
         dcos_ijk_dk[d] = delij[d] * inv_dists - cos_ijk_tmp * rsqij * delik[d];
       }
 
       sin_ijk = sin( theta_ijk );
       if( sin_ijk >= 0 && sin_ijk <= 1e-10 )
         tan_ijk_i = cos_ijk / 1e-10;
       else if( sin_ijk <= 0 && sin_ijk >= -1e-10 )
         tan_ijk_i = -cos_ijk / 1e-10;
       else tan_ijk_i = cos_ijk / sin_ijk;
 
       exp_tor2_ik = exp( -p_tor2 * BOA_ik );
       exp_cot2_ik = exp( -p_cot2 * SQR(BOA_ik -1.5) );
 
       for(int l = 0; l < 3; l++) fktmp[l] = 0.0;
 
       for (int ll = l_start; ll < l_end; ll++) {
         int l = d_bo_list[ll];
         l &= NEIGHMASK;
         if (l == i) continue;
         const int ltype = type(l);
         const int l_index = ll - l_start;
 
         bo_jl = d_BO(j,l_index);
         if (l == k || bo_jl < thb_cut || bo_ij*bo_ik*bo_jl < thb_cut) continue;
 
         for (int d = 0; d < 3; d ++) deljl[d] = x(l,d) - x(j,d);
         const F_FLOAT rsqjl = deljl[0]*deljl[0] + deljl[1]*deljl[1] + deljl[2]*deljl[2];
         const F_FLOAT rjl = sqrt(rsqjl);
         BOA_jl = bo_jl - thb_cut;
 
         cos_jil = -(delij[0]*deljl[0]+delij[1]*deljl[1]+delij[2]*deljl[2])/(rij*rjl);
         if( cos_jil > 1.0 ) cos_jil  = 1.0;
         if( cos_jil < -1.0 ) cos_jil  = -1.0;
         theta_jil = acos(cos_jil);
 
         // dcos_jil
         const F_FLOAT inv_distjl = 1.0 / (rij * rjl);
         const F_FLOAT inv_distjl3 = pow( inv_distjl, 3.0 );
         const F_FLOAT cos_jil_tmp = cos_jil / ((rij*rjl)*(rij*rjl));
 
         for( int d = 0; d < 3; d++ ) {
           dcos_jil_di[d] = deljl[d] * inv_distjl - cos_jil_tmp * rsqjl * -delij[d];
           dcos_jil_dj[d] = (-deljl[d] + delij[d]) * inv_distjl - cos_jil_tmp * (rsqjl * delij[d] + rsqij * -deljl[d]);
           dcos_jil_dk[d] = -delij[d] * inv_distjl - cos_jil_tmp * rsqij * deljl[d];
         }
 
         sin_jil = sin( theta_jil );
         if( sin_jil >= 0 && sin_jil <= 1e-10 )
           tan_jil_i = cos_jil / 1e-10;
         else if( sin_jil <= 0 && sin_jil >= -1e-10 )
           tan_jil_i = -cos_jil / 1e-10;
         else tan_jil_i = cos_jil / sin_jil;
 
         for (int d = 0; d < 3; d ++) dellk[d] = x(k,d) - x(l,d);
         const F_FLOAT rsqlk = dellk[0]*dellk[0] + dellk[1]*dellk[1] + dellk[2]*dellk[2];
         const F_FLOAT rlk = sqrt(rsqlk);
 
         F_FLOAT unnorm_cos_omega, unnorm_sin_omega, omega;
         F_FLOAT htra, htrb, htrc, hthd, hthe, hnra, hnrc, hnhd, hnhe;
         F_FLOAT arg, poem, tel;
         F_FLOAT cross_ij_jl[3];
 
         // omega
 
         F_FLOAT dot_ij_jk = -(delij[0]*delik[0]+delij[1]*delik[1]+delij[2]*delik[2]);
         F_FLOAT dot_ij_lj = delij[0]*deljl[0]+delij[1]*deljl[1]+delij[2]*deljl[2];
         F_FLOAT dot_ik_jl = delik[0]*deljl[0]+delik[1]*deljl[1]+delik[2]*deljl[2];
         unnorm_cos_omega = dot_ij_jk * dot_ij_lj + rsqij * dot_ik_jl;
 
         cross_ij_jl[0] = delij[1]*deljl[2] - delij[2]*deljl[1];
         cross_ij_jl[1] = delij[2]*deljl[0] - delij[0]*deljl[2];
         cross_ij_jl[2] = delij[0]*deljl[1] - delij[1]*deljl[0];
 
         unnorm_sin_omega = -rij*(delik[0]*cross_ij_jl[0]+delik[1]*cross_ij_jl[1]+delik[2]*cross_ij_jl[2]);
         omega = atan2( unnorm_sin_omega, unnorm_cos_omega );
 
         htra = rik + cos_ijk * ( rjl * cos_jil - rij );
         htrb = rij - rik * cos_ijk - rjl * cos_jil;
         htrc = rjl + cos_jil * ( rik * cos_ijk - rij );
         hthd = rik * sin_ijk * ( rij - rjl * cos_jil );
         hthe = rjl * sin_jil * ( rij - rik * cos_ijk );
         hnra = rjl * sin_ijk * sin_jil;
         hnrc = rik * sin_ijk * sin_jil;
         hnhd = rik * rjl * cos_ijk * sin_jil;
         hnhe = rik * rjl * sin_ijk * cos_jil;
 
         poem = 2.0 * rik * rjl * sin_ijk * sin_jil;
         if( poem < 1e-20 ) poem = 1e-20;
 
         tel = SQR(rik) + SQR(rij) + SQR(rjl) - SQR(rlk) -
               2.0 * (rik * rij * cos_ijk - rik * rjl * cos_ijk * cos_jil + rij * rjl * cos_jil);
 
         arg = tel / poem;
         if( arg >  1.0 ) arg =  1.0;
         if( arg < -1.0 ) arg = -1.0;
 
         if( sin_ijk >= 0 && sin_ijk <= 1e-10 ) sin_ijk = 1e-10;
         else if( sin_ijk <= 0 && sin_ijk >= -1e-10 ) sin_ijk = -1e-10;
         if( sin_jil >= 0 && sin_jil <= 1e-10 ) sin_jil = 1e-10;
         else if( sin_jil <= 0 && sin_jil >= -1e-10 ) sin_jil = -1e-10;
 
         // dcos_omega_di
         for (int d = 0; d < 3; d++) dcos_omega_dk[d] = ((htra-arg*hnra)/rik) * delik[d] - dellk[d];
         for (int d = 0; d < 3; d++) dcos_omega_dk[d] += (hthd-arg*hnhd)/sin_ijk * -dcos_ijk_dk[d];
         for (int d = 0; d < 3; d++) dcos_omega_dk[d] *= 2.0/poem;
 
         // dcos_omega_dj
         for (int d = 0; d < 3; d++) dcos_omega_di[d] = -((htra-arg*hnra)/rik) * delik[d] - htrb/rij * delij[d];
         for (int d = 0; d < 3; d++) dcos_omega_di[d] += -(hthd-arg*hnhd)/sin_ijk * dcos_ijk_di[d];
         for (int d = 0; d < 3; d++) dcos_omega_di[d] += -(hthe-arg*hnhe)/sin_jil * dcos_jil_di[d];
         for (int d = 0; d < 3; d++) dcos_omega_di[d] *= 2.0/poem;
 
         // dcos_omega_dk
         for (int d = 0; d < 3; d++) dcos_omega_dj[d] = -((htrc-arg*hnrc)/rjl) * deljl[d] + htrb/rij * delij[d];
         for (int d = 0; d < 3; d++) dcos_omega_dj[d] += -(hthd-arg*hnhd)/sin_ijk * dcos_ijk_dj[d];
         for (int d = 0; d < 3; d++) dcos_omega_dj[d] += -(hthe-arg*hnhe)/sin_jil * dcos_jil_dj[d];
         for (int d = 0; d < 3; d++) dcos_omega_dj[d] *= 2.0/poem;
 
         // dcos_omega_dl
         for (int d = 0; d < 3; d++) dcos_omega_dl[d] = ((htrc-arg*hnrc)/rjl) * deljl[d] + dellk[d];
         for (int d = 0; d < 3; d++) dcos_omega_dl[d] += (hthe-arg*hnhe)/sin_jil * -dcos_jil_dk[d];
         for (int d = 0; d < 3; d++) dcos_omega_dl[d] *= 2.0/poem;
 
         cos_omega = cos( omega );
         cos2omega = cos( 2. * omega );
         cos3omega = cos( 3. * omega );
 
         // torsion energy
 
         p_tor1 = paramsfbp(ktype,itype,jtype,ltype).p_tor1;
         p_cot1 = paramsfbp(ktype,itype,jtype,ltype).p_cot1;
         V1 = paramsfbp(ktype,itype,jtype,ltype).V1;
         V2 = paramsfbp(ktype,itype,jtype,ltype).V2;
         V3 = paramsfbp(ktype,itype,jtype,ltype).V3;
 
         exp_tor1 = exp(p_tor1 * SQR(2.0 - d_BO_pi(i,j_index) - f11_DiDj));
         exp_tor2_jl = exp(-p_tor2 * BOA_jl);
         exp_cot2_jl = exp(-p_cot2 * SQR(BOA_jl - 1.5) );
         fn10 = (1.0 - exp_tor2_ik) * (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jl);
 
         CV = 0.5 * (V1 * (1.0 + cos_omega) + V2 * exp_tor1 * (1.0 - cos2omega) + V3 * (1.0 + cos3omega) );
 
         e_tor = fn10 * sin_ijk * sin_jil * CV;
         if (eflag) ev.ereax[6] += e_tor;
 
         dfn11 = (-p_tor3 * exp_tor3_DiDj + (p_tor3 * exp_tor3_DiDj - p_tor4 * exp_tor4_DiDj) *
                 (2.0 + exp_tor3_DiDj) * exp_tor34_inv) * exp_tor34_inv;
 
         CEtors1 = sin_ijk * sin_jil * CV;
 
         CEtors2 = -fn10 * 2.0 * p_tor1 * V2 * exp_tor1 * (2.0 - d_BO_pi(i,j_index) - f11_DiDj) *
                   (1.0 - SQR(cos_omega)) * sin_ijk * sin_jil;
         CEtors3 = CEtors2 * dfn11;
 
         CEtors4 = CEtors1 * p_tor2 * exp_tor2_ik * (1.0 - exp_tor2_ij) * (1.0 - exp_tor2_jl);
         CEtors5 = CEtors1 * p_tor2 * (1.0 - exp_tor2_ik) * exp_tor2_ij * (1.0 - exp_tor2_jl);
         CEtors6 = CEtors1 * p_tor2 * (1.0 - exp_tor2_ik) * (1.0 - exp_tor2_ij) * exp_tor2_jl;
 
         cmn = -fn10 * CV;
         CEtors7 = cmn * sin_jil * tan_ijk_i;
         CEtors8 = cmn * sin_ijk * tan_jil_i;
 
         CEtors9 = fn10 * sin_ijk * sin_jil *
           (0.5 * V1 - 2.0 * V2 * exp_tor1 * cos_omega + 1.5 * V3 * (cos2omega + 2.0 * SQR(cos_omega)));
 
         // 4-body conjugation energy
 
         fn12 = exp_cot2_ik * exp_cot2_ij * exp_cot2_jl;
         e_con = p_cot1 * fn12 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jil);
         if (eflag) ev.ereax[7] += e_con;
 
         Cconj = -2.0 * fn12 * p_cot1 * p_cot2 * (1.0 + (SQR(cos_omega) - 1.0) * sin_ijk * sin_jil);
 
         CEconj1 = Cconj * (BOA_ik - 1.5e0);
         CEconj2 = Cconj * (BOA_ij - 1.5e0);
         CEconj3 = Cconj * (BOA_jl - 1.5e0);
 
         CEconj4 = -p_cot1 * fn12 * (SQR(cos_omega) - 1.0) * sin_jil * tan_ijk_i;
         CEconj5 = -p_cot1 * fn12 * (SQR(cos_omega) - 1.0) * sin_ijk * tan_jil_i;
         CEconj6 = 2.0 * p_cot1 * fn12 * cos_omega * sin_ijk * sin_jil;
 
         // forces
 
         // contribution to bond order
 
         d_Cdbopi(i,j_index) += CEtors2;
         CdDelta_i += CEtors3;
         CdDelta_j += CEtors3;
 
         a_Cdbo(i,k_index) += CEtors4 + CEconj1;
         a_Cdbo(i,j_index) += CEtors5 + CEconj2;
         a_Cdbo(j,l_index) += CEtors6 + CEconj3; // trouble
 
         // dcos_theta_ijk
         const F_FLOAT coeff74 = CEtors7 + CEconj4;
         for (int d = 0; d < 3; d++) fi_tmp[d] = (coeff74) * dcos_ijk_di[d];
         for (int d = 0; d < 3; d++) fj_tmp[d] = (coeff74) * dcos_ijk_dj[d];
         for (int d = 0; d < 3; d++) fk_tmp[d] = (coeff74) * dcos_ijk_dk[d];
 
         const F_FLOAT coeff85 = CEtors8 + CEconj5;
         // dcos_theta_jil
         for (int d = 0; d < 3; d++) fi_tmp[d] += (coeff85) * dcos_jil_di[d];
         for (int d = 0; d < 3; d++) fj_tmp[d] += (coeff85) * dcos_jil_dj[d];
         for (int d = 0; d < 3; d++) fl_tmp[d] =  (coeff85) * dcos_jil_dk[d];
 
         // dcos_omega
         const F_FLOAT coeff96 = CEtors9 + CEconj6;
         for (int d = 0; d < 3; d++) fi_tmp[d] += (coeff96) * dcos_omega_di[d];
         for (int d = 0; d < 3; d++) fj_tmp[d] += (coeff96) * dcos_omega_dj[d];
         for (int d = 0; d < 3; d++) fk_tmp[d] += (coeff96) * dcos_omega_dk[d];
         for (int d = 0; d < 3; d++) fl_tmp[d] += (coeff96) * dcos_omega_dl[d];
 
         // total forces
 
         for (int d = 0; d < 3; d++) fitmp[d] -= fi_tmp[d];
         for (int d = 0; d < 3; d++) fjtmp[d] -= fj_tmp[d];
         for (int d = 0; d < 3; d++) fktmp[d] -= fk_tmp[d];
         for (int d = 0; d < 3; d++) a_f(l,d) -= fl_tmp[d];
 
         // per-atom energy/virial tally
 
         if (EVFLAG) {
           eng_tmp = e_tor + e_con;
           //if (eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,j,eng_tmp,0.0,0.0,0.0,0.0);
           if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,j,eng_tmp);
           if (vflag_either) {
               for (int d = 0; d < 3; d ++) delil[d] = x(l,d) - x(i,d);
               for (int d = 0; d < 3; d ++) delkl[d] = x(l,d) - x(k,d);
               this->template v_tally4<NEIGHFLAG>(ev,k,i,j,l,fk_tmp,fi_tmp,fj_tmp,delkl,delil,deljl);
           }
         }
 
       }
       for (int d = 0; d < 3; d++) a_f(k,d) += fktmp[d];
     }
     a_CdDelta[j] += CdDelta_j;
     for (int d = 0; d < 3; d++) a_f(j,d) += fjtmp[d];
   }
   a_CdDelta[i] += CdDelta_i;
   for (int d = 0; d < 3; d++) a_f(i,d) += fitmp[d];
 }
 
 // Torsion kernel entry point without an external accumulator: forwards to
 // the three-argument overload with a local EV_FLOAT_REAX that is dropped on
 // return.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTorsion<NEIGHFLAG,EVFLAG>, const int &ii) const
 {
   typedef PairReaxComputeTorsion<NEIGHFLAG,EVFLAG> TorsionTag;
 
   EV_FLOAT_REAX discarded;
   this->template operator()<NEIGHFLAG,EVFLAG>(TorsionTag(), ii, discarded);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Hydrogen-bond kernel for the atom i = d_ilist[ii].
 //
 // Runs only when paramssing(type(i)).p_hbond == 1. Every pairing of a
 // selected bonded neighbor j (p_hbond == 2 and bond order >= HB_THRESHOLD)
 // with an entry k of i's d_hb_list contributes:
 //   - e_hb to ev.ereax[8] (when eflag is set),
 //   - CEhb1 to d_Cdbo(i, slot of j),
 //   - forces on i, j and k,
 //   - optional per-atom energy and virial tallies.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeHydrogen<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
   const int i = d_ilist[ii];
   const int itype = type(i);
   if (paramssing(itype).p_hbond != 1) return;
 
   const X_FLOAT xi = x(i,0);
   const X_FLOAT yi = x(i,1);
   const X_FLOAT zi = x(i,2);
 
   const int bond_begin = d_bo_first[i];
   const int bond_end = bond_begin + d_bo_num[i];
   const int hb_begin = d_hb_first[i];
   const int hb_end = hb_begin + d_hb_num[i];
 
   // remember the bond-list positions of the neighbors that qualify
   int hblist[MAX_BONDS];
   int nselected = 0;
   for (int jj = bond_begin; jj < bond_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int jtype = type(j);
     const F_FLOAT bo_ij = d_BO(i,jj - bond_begin);
 
     if (paramssing(jtype).p_hbond == 2 && bo_ij >= HB_THRESHOLD) {
       hblist[nselected] = jj;
       nselected++;
     }
   }
 
   // work arrays
   F_FLOAT r_ij[3], r_ji[3], r_ik[3], r_ki[3];
   F_FLOAT dcos_di[3], dcos_dj[3], dcos_dk[3];
   F_FLOAT force_i[3], force_j[3], force_k[3];
   for (int d = 0; d < 3; d++) force_i[d] = force_j[d] = force_k[d] = 0.0;
 
   // force on i is summed locally and written back once after the loops
   F_FLOAT f_i_sum[3];
   for (int d = 0; d < 3; d++) f_i_sum[d] = 0.0;
 
   for (int kk = hb_begin; kk < hb_end; kk++) {
     int k = d_hb_list[kk];
     k &= NEIGHMASK;
     const int ktag = tag(k);
     const int ktype = type(k);
 
     r_ik[0] = x(k,0) - xi;
     r_ik[1] = x(k,1) - yi;
     r_ik[2] = x(k,2) - zi;
     const F_FLOAT rik_sq = r_ik[0]*r_ik[0] + r_ik[1]*r_ik[1] + r_ik[2]*r_ik[2];
     const F_FLOAT rik = sqrt(rik_sq);
 
     for (int n = 0; n < nselected; n++) {
       const int jj = hblist[n];
       int j = d_bo_list[jj];
       j &= NEIGHMASK;
       const int jtag = tag(j);
       // j and k carrying the same tag are not paired
       if (jtag == ktag) continue;
 
       const int jtype = type(j);
       const int j_index = jj - bond_begin;
       const F_FLOAT bo_ij = d_BO(i,j_index);
 
       r_ij[0] = x(j,0) - xi;
       r_ij[1] = x(j,1) - yi;
       r_ij[2] = x(j,2) - zi;
       const F_FLOAT rij_sq = r_ij[0]*r_ij[0] + r_ij[1]*r_ij[1] + r_ij[2]*r_ij[2];
       const F_FLOAT rij = sqrt(rij_sq);
 
       // angle j-i-k: cosine clamped to [-1,1] before acos
       F_FLOAT cos_theta = (r_ij[0]*r_ik[0]+r_ij[1]*r_ik[1]+r_ij[2]*r_ik[2])/(rij*rik);
       if (cos_theta > 1.0) cos_theta = 1.0;
       if (cos_theta < -1.0) cos_theta = -1.0;
       const F_FLOAT theta = acos(cos_theta);
 
       // derivatives of cos(theta) with respect to the positions of i, j, k
       const F_FLOAT inv_dists = 1.0 / (rij * rik);
       const F_FLOAT Cdot_inv3 = cos_theta * inv_dists * inv_dists;
 
       for (int d = 0; d < 3; d++) {
         dcos_di[d] = -(r_ik[d] + r_ij[d]) * inv_dists + Cdot_inv3 * (rik_sq * r_ij[d] + rij_sq * r_ik[d]);
         dcos_dj[d] = r_ik[d] * inv_dists - Cdot_inv3 * rik_sq * r_ij[d];
         dcos_dk[d] = r_ij[d] * inv_dists - Cdot_inv3 * rij_sq * r_ik[d];
       }
 
       // hydrogen bond parameters for the (j,i,k) type triple
       const F_FLOAT p_hb1 = paramshbp(jtype,itype,ktype).p_hb1;
       const F_FLOAT p_hb2 = paramshbp(jtype,itype,ktype).p_hb2;
       const F_FLOAT p_hb3 = paramshbp(jtype,itype,ktype).p_hb3;
       const F_FLOAT r0_hb = paramshbp(jtype,itype,ktype).r0_hb;
 
       // sin^4(theta/2), built by squaring twice
       const F_FLOAT sin_half = sin(theta/2.0);
       F_FLOAT sin_xhz4 = SQR(sin_half);
       sin_xhz4 *= sin_xhz4;
       const F_FLOAT cos_xhz1 = (1.0 - cos_theta);
       const F_FLOAT exp_hb2 = exp(-p_hb2 * bo_ij);
       const F_FLOAT exp_hb3 = exp(-p_hb3 * (r0_hb/rik + rik/r0_hb - 2.0));
 
       // hydrogen bond energy
       const F_FLOAT e_hb = p_hb1 * (1.0 - exp_hb2) * exp_hb3 * sin_xhz4;
       if (eflag) ev.ereax[8] += e_hb;
 
       // prefactors: bond-order term, angle term, distance term
       const F_FLOAT CEhb1 = p_hb1 * p_hb2 * exp_hb2 * exp_hb3 * sin_xhz4;
       const F_FLOAT CEhb2 = -p_hb1/2.0 * (1.0 - exp_hb2) * exp_hb3 * cos_xhz1;
       const F_FLOAT CEhb3 = -p_hb3 * (-r0_hb/SQR(rik) + 1.0/r0_hb) * e_hb;
 
       d_Cdbo(i,j_index) += CEhb1; // dbo term
 
       // dcos terms
       for (int d = 0; d < 3; d++) {
         force_i[d] = CEhb2 * dcos_di[d];
         force_j[d] = CEhb2 * dcos_dj[d];
         force_k[d] = CEhb2 * dcos_dk[d];
       }
 
       // dr terms (act along i-k only)
       for (int d = 0; d < 3; d++) {
         force_i[d] -= CEhb3/rik * r_ik[d];
         force_k[d] += CEhb3/rik * r_ik[d];
       }
 
       for (int d = 0; d < 3; d++) f_i_sum[d] -= force_i[d];
       for (int d = 0; d < 3; d++) a_f(j,d) -= force_j[d];
       for (int d = 0; d < 3; d++) a_f(k,d) -= force_k[d];
 
       // tallies take the separation vectors with reversed sign
       for (int d = 0; d < 3; d++) {
         r_ki[d] = -1.0 * r_ik[d];
         r_ji[d] = -1.0 * r_ij[d];
       }
       if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,j,e_hb);
       if (vflag_either) this->template v_tally3<NEIGHFLAG>(ev,i,j,k,force_j,force_k,r_ji,r_ki);
     }
   }
   for (int d = 0; d < 3; d++) a_f(i,d) += f_i_sum[d];
 }
 
 // Hydrogen-bond kernel entry point without an external accumulator: forwards
 // to the three-argument overload with a local EV_FLOAT_REAX that is dropped
 // on return.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeHydrogen<NEIGHFLAG,EVFLAG>, const int &ii) const
 {
   typedef PairReaxComputeHydrogen<NEIGHFLAG,EVFLAG> HydrogenTag;
 
   EV_FLOAT_REAX discarded;
   this->template operator()<NEIGHFLAG,EVFLAG>(HydrogenTag(), ii, discarded);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // For atom i = d_ilist[ii], adds the Cdbo / Cdbopi / Cdbopi2 values stored
 // for bond (i,j) onto j's entry for the same bond, i.e. the slot of j's bond
 // list that points back at i. A bond is forwarded only when the tag-parity
 // test below selects it; bonds between equal tags are never forwarded.
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxUpdateBond<NEIGHFLAG>, const int &ii) const {
 
   typedef Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > t_accum_2d;
   t_accum_2d a_Cdbo = d_Cdbo;
   t_accum_2d a_Cdbopi = d_Cdbopi;
   t_accum_2d a_Cdbopi2 = d_Cdbopi2;
 
   const int i = d_ilist[ii];
   const int itag = tag(i);
   const int first_i = d_bo_first[i];
   const int last_i = first_i + d_bo_num[i];
 
   for (int jj = first_i; jj < last_i; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int jtag = tag(j);
 
     // the decision depends on (i,j) only, so it is made before scanning j's bonds
     bool forward = false;
     if (itag > jtag) forward = ((itag+jtag) % 2 == 0);
     else if (itag < jtag) forward = ((itag+jtag) % 2 == 1);
     if (!forward) continue;
 
     const int slot_ij = jj - first_i;
     const F_FLOAT Cdbo_i = d_Cdbo(i,slot_ij);
     const F_FLOAT Cdbopi_i = d_Cdbopi(i,slot_ij);
     const F_FLOAT Cdbopi2_i = d_Cdbopi2(i,slot_ij);
 
     const int first_j = d_bo_first[j];
     const int last_j = first_j + d_bo_num[j];
 
     // every slot of j's bond list that refers back to i receives the values
     for (int kk = first_j; kk < last_j; kk++) {
       int k = d_bo_list[kk];
       k &= NEIGHMASK;
       if (k != i) continue;
 
       const int slot_ji = kk - first_j;
       a_Cdbo(j,slot_ji) += Cdbo_i;
       a_Cdbopi(j,slot_ji) += Cdbopi_i;
       a_Cdbopi2(j,slot_ji) += Cdbopi2_i;
     }
   }
 
 }
 
 // Bond-energy kernel for the atom i = d_ilist[ii].
 //
 // For every bond (i,j) that passes the tag-parity test this
 //   - adds the bond energy, and when applicable the triple-bond
 //     stabilisation energy, to ev.evdwl (when eflag is set),
 //   - accumulates the bond-order derivative prefactors into
 //     d_Cdbo / d_Cdbopi / d_Cdbopi2 at (i, slot of j),
 //   - accumulates CdDelta contributions for i and j,
 //   - tallies the per-atom energy when eflag_atom is set.
 //
 // EVFLAG is not consulted in this body. No positions, distances or forces
 // are needed here: coordinates are read only for the equal-tag tie-break.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeBond1<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   // CdDelta of the neighbor j is updated from this work item, hence the
   // AtomicF<NEIGHFLAG>-controlled alias
   Kokkos::View<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta;
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const int itype = type(i);
   const int itag = tag(i);
   const F_FLOAT imass = paramssing(itype).mass;
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   // contribution to CdDelta[i] is summed locally and flushed once at the end
   F_FLOAT CdDelta_i = 0.0;
 
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int jtag = tag(j);
 
     // tag-parity test deciding whether this end of the bond evaluates it;
     // for equal tags the decision falls back to comparing z, then y, then x
     // NOTE(review): presumably this makes each bond get evaluated from only
     // one of its two ends -- confirm against the bond-list construction
     if (itag > jtag) {
       if ((itag+jtag) % 2 == 0) continue;
     } else if (itag < jtag) {
       if ((itag+jtag) % 2 == 1) continue;
     } else {
       if (x(j,2)  < ztmp) continue;
       if (x(j,2) == ztmp && x(j,1)  < ytmp) continue;
       if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue;
     }
 
     const int jtype = type(j);
     const int j_index = jj - j_start;
     const F_FLOAT jmass = paramssing(jtype).mass;
 
     // bond energy (nlocal only)
     const F_FLOAT p_be1 = paramstwbp(itype,jtype).p_be1;
     const F_FLOAT p_be2 = paramstwbp(itype,jtype).p_be2;
     const F_FLOAT De_s = paramstwbp(itype,jtype).De_s;
     const F_FLOAT De_p = paramstwbp(itype,jtype).De_p;
     const F_FLOAT De_pp = paramstwbp(itype,jtype).De_pp;
 
     const F_FLOAT BO_i = d_BO(i,j_index);
     const F_FLOAT BO_s_i = d_BO_s(i,j_index);
     const F_FLOAT BO_pi_i = d_BO_pi(i,j_index);
     const F_FLOAT BO_pi2_i = d_BO_pi2(i,j_index);
 
     const F_FLOAT pow_BOs_be2 = pow(BO_s_i,p_be2);
     const F_FLOAT exp_be12 = exp(p_be1*(1.0-pow_BOs_be2));
     const F_FLOAT CEbo = -De_s*exp_be12*(1.0-p_be1*p_be2*pow_BOs_be2);
     const F_FLOAT ebond = -De_s*BO_s_i*exp_be12
                               -De_p*BO_pi_i
                           -De_pp*BO_pi2_i;
 
     if (eflag) ev.evdwl += ebond;
 
     // calculate derivatives of Bond Orders
     d_Cdbo(i,j_index) += CEbo;
     d_Cdbopi(i,j_index) -= (CEbo + De_p);
     d_Cdbopi2(i,j_index) -= (CEbo + De_pp);
 
     // Stabilisation terminal triple bond
     F_FLOAT estriph = 0.0;
 
     if( BO_i >= 1.00 ) {
       // applied when gp[37] == 2, or when the two tabulated masses are
       // exactly 12.0000 and 15.9990 in either order
       // NOTE(review): the mass test presumably identifies a C-O pair by its
       // force-field masses -- confirm; it relies on exact float equality
       if( gp[37] == 2 || (imass == 12.0000 && jmass == 15.9990) ||
                          (jmass == 12.0000 && imass == 15.9990) ) {
         const F_FLOAT exphu = exp(-gp[7] * SQR(BO_i - 2.50) );
         const F_FLOAT exphua1 = exp(-gp[3] * (d_total_bo[i]-BO_i));
         const F_FLOAT exphub1 = exp(-gp[3] * (d_total_bo[j]-BO_i));
         const F_FLOAT exphuov = exp(gp[4] * (d_Delta[i] + d_Delta[j]));
         const F_FLOAT hulpov = 1.0 / (1.0 + 25.0 * exphuov);
         estriph = gp[10] * exphu * hulpov * (exphua1 + exphub1);
 
         if (eflag) ev.evdwl += estriph;
 
         // prefactors for the bond order of (i,j) and for Delta of i and j
         const F_FLOAT decobdbo = gp[10] * exphu * hulpov * (exphua1 + exphub1) *
             ( gp[3] - 2.0 * gp[7] * (BO_i-2.50) );
         const F_FLOAT decobdboua = -gp[10] * exphu * hulpov *
             (gp[3]*exphua1 + 25.0*gp[4]*exphuov*hulpov*(exphua1+exphub1));
         const F_FLOAT decobdboub = -gp[10] * exphu * hulpov *
             (gp[3]*exphub1 + 25.0*gp[4]*exphuov*hulpov*(exphua1+exphub1));
 
         d_Cdbo(i,j_index) += decobdbo;
         CdDelta_i += decobdboua;
         a_CdDelta[j] += decobdboub;
       }
     }
 
     // per-atom energy: bond and stabilisation energy are tallied together
     const F_FLOAT eng_tmp = ebond + estriph;
     if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,j,eng_tmp);
   }
   a_CdDelta[i] += CdDelta_i;
 }
 
 // Bond-energy kernel entry point without an external accumulator: forwards
 // to the three-argument overload with a local EV_FLOAT_REAX that is dropped
 // on return.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeBond1<NEIGHFLAG,EVFLAG>, const int &ii) const
 {
   typedef PairReaxComputeBond1<NEIGHFLAG,EVFLAG> Bond1Tag;
 
   EV_FLOAT_REAX discarded;
   this->template operator()<NEIGHFLAG,EVFLAG>(Bond1Tag(), ii, discarded);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Bond-force kernel for the atom i = d_ilist[ii].
 //
 // For every bond (i,j) that passes the tag-parity test, the accumulated
 // prefactors (d_Cdbo, d_Cdbopi, d_Cdbopi2 and CdDelta of i and j) are
 // combined with the stored bond-order derivative data and turned into
 // forces on
 //   - i (summed locally, written back once at the end),
 //   - j,
 //   - every atom k in i's bond list,
 //   - every atom k in j's bond list.
 // When EVFLAG and vflag_either are set, each force is also passed to
 // v_tally together with a separation vector.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeBond2<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const {
 
   Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
 
   F_FLOAT delij[3], delik[3], deljk[3], tmpvec[3];
   F_FLOAT dBOp_i[3], dBOp_k[3], dln_BOp_pi[3], dln_BOp_pi2[3];
 
   const int i = d_ilist[ii];
   const X_FLOAT xtmp = x(i,0);
   const X_FLOAT ytmp = x(i,1);
   const X_FLOAT ztmp = x(i,2);
   const int itag = tag(i);
   const int j_start = d_bo_first[i];
   const int j_end = j_start + d_bo_num[i];
 
   const F_FLOAT CdDelta_i = d_CdDelta[i];
 
   // force on i is summed locally and written back once after the bond loop
   F_FLOAT fitmp[3];
   for (int d = 0; d < 3; d++) fitmp[d] = 0.0;
 
   for (int jj = j_start; jj < j_end; jj++) {
     int j = d_bo_list[jj];
     j &= NEIGHMASK;
     const int jtag = tag(j);
 
     // tag-parity test deciding whether this end of the bond evaluates it;
     // for equal tags the decision falls back to comparing z, then y, then x
     if (itag > jtag) {
       if ((itag+jtag) % 2 == 0) continue;
     } else if (itag < jtag) {
       if ((itag+jtag) % 2 == 1) continue;
     } else {
       if (x(j,2)  < ztmp) continue;
       if (x(j,2) == ztmp && x(j,1)  < ytmp) continue;
       if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue;
     }
 
     const int j_index = jj - j_start;
     const F_FLOAT CdDelta_j = d_CdDelta[j];
 
     // separation vector, needed only by the virial tallies
     delij[0] = x(j,0) - xtmp;
     delij[1] = x(j,1) - ytmp;
     delij[2] = x(j,2) - ztmp;
 
     const int k_start = d_bo_first[j];
     const int k_end = k_start + d_bo_num[j];
 
     // total forces on i, j, k (nlocal + nghost, from Add_dBond_to_Forces))
     const F_FLOAT Cdbo_ij = d_Cdbo(i,j_index);
     const F_FLOAT coef_C1dbo = d_C1dbo(i,j_index) * (Cdbo_ij);
     const F_FLOAT coef_C2dbo = d_C2dbo(i,j_index) * (Cdbo_ij);
     const F_FLOAT coef_C3dbo = d_C3dbo(i,j_index) * (Cdbo_ij);
 
     const F_FLOAT Cdbopi_ij = d_Cdbopi(i,j_index);
     const F_FLOAT coef_C1dbopi = d_C1dbopi(i,j_index) * (Cdbopi_ij);
     const F_FLOAT coef_C2dbopi = d_C2dbopi(i,j_index) * (Cdbopi_ij);
     const F_FLOAT coef_C3dbopi = d_C3dbopi(i,j_index) * (Cdbopi_ij);
     const F_FLOAT coef_C4dbopi = d_C4dbopi(i,j_index) * (Cdbopi_ij);
 
     const F_FLOAT Cdbopi2_ij = d_Cdbopi2(i,j_index);
     const F_FLOAT coef_C1dbopi2 = d_C1dbopi2(i,j_index) * (Cdbopi2_ij);
     const F_FLOAT coef_C2dbopi2 = d_C2dbopi2(i,j_index) * (Cdbopi2_ij);
     const F_FLOAT coef_C3dbopi2 = d_C3dbopi2(i,j_index) * (Cdbopi2_ij);
     const F_FLOAT coef_C4dbopi2 = d_C4dbopi2(i,j_index) * (Cdbopi2_ij);
 
     // the Delta terms reuse the C1/C2/C3dbo tables, scaled by CdDelta of i plus j
     const F_FLOAT coeff_CdDelta_ij = CdDelta_i + CdDelta_j;
     const F_FLOAT coef_C1dDelta = d_C1dbo(i,j_index) * (coeff_CdDelta_ij);
     const F_FLOAT coef_C2dDelta = d_C2dbo(i,j_index) * (coeff_CdDelta_ij);
     const F_FLOAT coef_C3dDelta = d_C3dbo(i,j_index) * (coeff_CdDelta_ij);
 
     // prefactors of the two neighbor loops below; they depend on the bond
     // (i,j) only, so they are evaluated once per bond
     const F_FLOAT coef_k_of_i = -coef_C2dbo - coef_C2dDelta - coef_C3dbopi - coef_C3dbopi2;
     const F_FLOAT coef_k_of_j = -coef_C3dbo - coef_C3dDelta - coef_C4dbopi - coef_C4dbopi2;
 
     // temp is reused for the force on i, then j, then each k
     F_FLOAT temp[3];
 
     dln_BOp_pi[0] = d_dln_BOp_pix(i,j_index);
     dln_BOp_pi[1] = d_dln_BOp_piy(i,j_index);
     dln_BOp_pi[2] = d_dln_BOp_piz(i,j_index);
 
     dln_BOp_pi2[0] = d_dln_BOp_pi2x(i,j_index);
     dln_BOp_pi2[1] = d_dln_BOp_pi2y(i,j_index);
     dln_BOp_pi2[2] = d_dln_BOp_pi2z(i,j_index);
 
     dBOp_i[0] = d_dBOpx(i,j_index);
     dBOp_i[1] = d_dBOpy(i,j_index);
     dBOp_i[2] = d_dBOpz(i,j_index);
 
     // forces on i
     for (int d = 0; d < 3; d++) temp[d] =  coef_C1dbo * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C2dbo * d_dDeltap_self(i,d);
     for (int d = 0; d < 3; d++) temp[d] += coef_C1dDelta * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C2dDelta * d_dDeltap_self(i,d);
     for (int d = 0; d < 3; d++) temp[d] += coef_C1dbopi * dln_BOp_pi[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C2dbopi * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C3dbopi * d_dDeltap_self(i,d);
     for (int d = 0; d < 3; d++) temp[d] += coef_C1dbopi2 * dln_BOp_pi2[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C2dbopi2 * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C3dbopi2 * d_dDeltap_self(i,d);
 
     if (EVFLAG)
       if (vflag_either) this->template v_tally<NEIGHFLAG>(ev,i,temp,delij);
 
     fitmp[0] -= temp[0];
     fitmp[1] -= temp[1];
     fitmp[2] -= temp[2];
 
     // forces on j
     for (int d = 0; d < 3; d++) temp[d] = -coef_C1dbo * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C3dbo * d_dDeltap_self(j,d);
     for (int d = 0; d < 3; d++) temp[d] -= coef_C1dDelta * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C3dDelta * d_dDeltap_self(j,d);
     for (int d = 0; d < 3; d++) temp[d] -= coef_C1dbopi * dln_BOp_pi[d];
     for (int d = 0; d < 3; d++) temp[d] -= coef_C2dbopi * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C4dbopi * d_dDeltap_self(j,d);
     for (int d = 0; d < 3; d++) temp[d] -= coef_C1dbopi2 * dln_BOp_pi2[d];
     for (int d = 0; d < 3; d++) temp[d] -= coef_C2dbopi2 * dBOp_i[d];
     for (int d = 0; d < 3; d++) temp[d] += coef_C4dbopi2 * d_dDeltap_self(j,d);
 
     a_f(j,0) -= temp[0];
     a_f(j,1) -= temp[1];
     a_f(j,2) -= temp[2];
 
     if (EVFLAG)
       if (vflag_either) {
         for (int d = 0; d < 3; d++) tmpvec[d] = -delij[d];
         this->template v_tally<NEIGHFLAG>(ev,j,temp,tmpvec);
       }
 
     // forces on k: i neighbor
     for (int kk = j_start; kk < j_end; kk++) {
       int k = d_bo_list[kk];
       k &= NEIGHMASK;
       const int k_index = kk - j_start;
 
       dBOp_k[0] = d_dBOpx(i,k_index);
       dBOp_k[1] = d_dBOpy(i,k_index);
       dBOp_k[2] = d_dBOpz(i,k_index);
       for (int d = 0; d < 3; d++) temp[d] = coef_k_of_i * dBOp_k[d];
 
       a_f(k,0) -= temp[0];
       a_f(k,1) -= temp[1];
       a_f(k,2) -= temp[2];
 
       if (EVFLAG)
         if (vflag_either) {
           delik[0] = x(k,0) - xtmp;
           delik[1] = x(k,1) - ytmp;
           delik[2] = x(k,2) - ztmp;
           for (int d = 0; d < 3; d++) tmpvec[d] = x(j,d) - x(k,d) - delik[d];
           this->template v_tally<NEIGHFLAG>(ev,k,temp,tmpvec);
         }
 
     }
 
     // forces on k: j neighbor
     for (int kk = k_start; kk < k_end; kk++) {
       int k = d_bo_list[kk];
       k &= NEIGHMASK;
       const int k_index = kk - k_start;
 
       dBOp_k[0] = d_dBOpx(j,k_index);
       dBOp_k[1] = d_dBOpy(j,k_index);
       dBOp_k[2] = d_dBOpz(j,k_index);
       for (int d = 0; d < 3; d++) temp[d] = coef_k_of_j * dBOp_k[d];
 
       a_f(k,0) -= temp[0];
       a_f(k,1) -= temp[1];
       a_f(k,2) -= temp[2];
 
       if (EVFLAG) {
         if (vflag_either) {
           for (int d = 0; d < 3; d++) deljk[d] = x(k,d) - x(j,d);
           for (int d = 0; d < 3; d++) tmpvec[d] = x(i,d) - x(k,d) - deljk[d];
           this->template v_tally<NEIGHFLAG>(ev,k,temp,tmpvec);
         }
       }
 
     }
   }
   for (int d = 0; d < 3; d++) a_f(i,d) += fitmp[d];
 }
 
 // Bond-force kernel entry point without an external accumulator: forwards to
 // the three-argument overload with a local EV_FLOAT_REAX that is dropped on
 // return.
 template<class DeviceType>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeBond2<NEIGHFLAG,EVFLAG>, const int &ii) const
 {
   typedef PairReaxComputeBond2<NEIGHFLAG,EVFLAG> Bond2Tag;
 
   EV_FLOAT_REAX discarded;
   this->template operator()<NEIGHFLAG,EVFLAG>(Bond2Tag(), ii, discarded);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Pairwise energy/virial tally for the pair (i,j).
 //   epair          pair energy, split in halves between i and j
 //   fpair          scalar multiplying the del* products
 //   delx/dely/delz components of the separation vector
 // With NEIGHFLAG == FULL only atom i receives per-atom shares and the
 // global virial gets half of each component.
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::ev_tally(EV_FLOAT_REAX &ev, const int &i, const int &j,
       const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
                 const F_FLOAT &dely, const F_FLOAT &delz) const
 {
   // The eatom and vatom arrays are atomic for Half/Thread neighbor style
   Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_eatom = v_eatom;
   Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom;
 
   if (eflag_atom) {
     const E_FLOAT half_energy = 0.5 * epair;
     a_eatom[i] += half_energy;
     if (NEIGHFLAG != FULL) a_eatom[j] += half_energy;
   }
 
   const int want_virial = vflag_either;
   if (!want_virial) return;
 
   // components in the order xx, yy, zz, xy, xz, yz
   E_FLOAT vir[6];
   vir[0] = delx*delx*fpair;
   vir[1] = dely*dely*fpair;
   vir[2] = delz*delz*fpair;
   vir[3] = delx*dely*fpair;
   vir[4] = delx*delz*fpair;
   vir[5] = dely*delz*fpair;
 
   if (vflag_global) {
     if (NEIGHFLAG != FULL) {
       for (int n = 0; n < 6; n++) ev.v[n] += vir[n];
     } else {
       for (int n = 0; n < 6; n++) ev.v[n] += 0.5*vir[n];
     }
   }
 
   if (vflag_atom) {
     for (int n = 0; n < 6; n++) a_vatom(i,n) += 0.5*vir[n];
 
     if (NEIGHFLAG != FULL) {
       for (int n = 0; n < 6; n++) a_vatom(j,n) += 0.5*vir[n];
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Per-atom energy tally: when eflag_atom is set, atoms i and j each receive
 // half of epair. ev is not touched.
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::e_tally(EV_FLOAT_REAX &ev, const int &i, const int &j,
       const F_FLOAT &epair) const
 {
   if (!eflag_atom) return;
 
   // The eatom array is atomic for Half/Thread neighbor style
   typedef Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > t_eatom_accum;
   t_eatom_accum a_eatom = v_eatom;
 
   const E_FLOAT half_energy = 0.5 * epair;
   a_eatom[i] += half_energy;
   a_eatom[j] += half_energy;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Single-atom energy tally: the whole of epair is added to atom i. No flag
 // is checked here and ev is not touched.
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::e_tally_single(EV_FLOAT_REAX &ev, const int &i,
       const F_FLOAT &epair) const
 {
   // The eatom array is atomic for Half/Thread neighbor style
   typedef Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > t_eatom_accum;
   t_eatom_accum a_eatom = v_eatom;
 
   a_eatom[i] += epair;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Single-atom virial tally: builds 0.5 * drij (x) fi in the component order
 // xx, yy, zz, xy, xz, yz and adds it to the global virial in ev (when
 // vflag_global) and to the per-atom virial of i (when vflag_atom).
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::v_tally(EV_FLOAT_REAX &ev, const int &i,
   F_FLOAT *fi, F_FLOAT *drij) const
 {
   F_FLOAT vir[6];
 
   // diagonal components
   for (int d = 0; d < 3; d++) vir[d] = 0.5*drij[d]*fi[d];
   // off-diagonal components
   vir[3] = 0.5*drij[0]*fi[1];
   vir[4] = 0.5*drij[0]*fi[2];
   vir[5] = 0.5*drij[1]*fi[2];
 
   if (vflag_global) {
     for (int n = 0; n < 6; n++) ev.v[n] += vir[n];
   }
 
   if (vflag_atom) {
     Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom;
     for (int n = 0; n < 6; n++) a_vatom(i,n) += vir[n];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Three-body virial tally: builds drij (x) fj + drik (x) fk in the component
 // order xx, yy, zz, xy, xz, yz. The full value goes to the global virial in
 // ev (when vflag_global); each of i, j and k receives THIRD of it in the
 // per-atom virial (when vflag_atom).
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::v_tally3(EV_FLOAT_REAX &ev, const int &i, const int &j, const int &k,
   F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drij, F_FLOAT *drik) const
 {
   F_FLOAT vir[6];
 
   // diagonal components
   for (int d = 0; d < 3; d++) vir[d] = (drij[d]*fj[d] + drik[d]*fk[d]);
   // off-diagonal components
   vir[3] = (drij[0]*fj[1] + drik[0]*fk[1]);
   vir[4] = (drij[0]*fj[2] + drik[0]*fk[2]);
   vir[5] = (drij[1]*fj[2] + drik[1]*fk[2]);
 
   if (vflag_global) {
     for (int n = 0; n < 6; n++) ev.v[n] += vir[n];
   }
 
   if (vflag_atom) {
     // The vatom array is atomic for Half/Thread neighbor style
     Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom;
 
     const int members[3] = {i, j, k};
     for (int m = 0; m < 3; m++) {
       const int atom_index = members[m];
       for (int n = 0; n < 6; n++) a_vatom(atom_index,n) += THIRD*vir[n];
     }
   }
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Four-body virial tally: builds
 // 0.25 * (dril (x) fi + drjl (x) fj + drkl (x) fk) in the component order
 // xx, yy, zz, xy, xz, yz. That value is added once to the global virial in
 // ev (when vflag_global) and once to the per-atom virial of each of
 // i, j, k and l (when vflag_atom).
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::v_tally4(EV_FLOAT_REAX &ev, const int &i, const int &j, const int &k,
   const int &l, F_FLOAT *fi, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *dril, F_FLOAT *drjl, F_FLOAT *drkl) const
 {
   F_FLOAT vir[6];
 
   // diagonal components
   for (int d = 0; d < 3; d++)
     vir[d] = 0.25 * (dril[d]*fi[d] + drjl[d]*fj[d] + drkl[d]*fk[d]);
   // off-diagonal components
   vir[3] = 0.25 * (dril[0]*fi[1] + drjl[0]*fj[1] + drkl[0]*fk[1]);
   vir[4] = 0.25 * (dril[0]*fi[2] + drjl[0]*fj[2] + drkl[0]*fk[2]);
   vir[5] = 0.25 * (dril[1]*fi[2] + drjl[1]*fj[2] + drkl[1]*fk[2]);
 
   if (vflag_global) {
     for (int n = 0; n < 6; n++) ev.v[n] += vir[n];
   }
 
   if (vflag_atom) {
     // The vatom array is atomic for Half/Thread neighbor style
     Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom;
 
     const int members[4] = {i, j, k, l};
     for (int m = 0; m < 4; m++) {
       const int atom_index = members[m];
       for (int n = 0; n < 6; n++) a_vatom(atom_index,n) += vir[n];
     }
   }
 
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairReaxCKokkos<DeviceType>::v_tally3_atom(EV_FLOAT_REAX &ev, const int &i, const int &j, const int &k,
         F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drji, F_FLOAT *drjk) const
 {
   F_FLOAT v[6];
 
   v[0] = THIRD * (drji[0]*fj[0] + drjk[0]*fk[0]);
   v[1] = THIRD * (drji[1]*fj[1] + drjk[1]*fk[1]);
   v[2] = THIRD * (drji[2]*fj[2] + drjk[2]*fk[2]);
   v[3] = THIRD * (drji[0]*fj[1] + drjk[0]*fk[1]);
   v[4] = THIRD * (drji[0]*fj[2] + drjk[0]*fk[2]);
   v[5] = THIRD * (drji[1]*fj[2] + drjk[1]*fk[2]);
 
   if (vflag_global) {
     ev.v[0] += v[0];
     ev.v[1] += v[1];
     ev.v[2] += v[2];
     ev.v[3] += v[3];
     ev.v[4] += v[4];
     ev.v[5] += v[5];
   }
 
   if (vflag_atom) {
     d_vatom(i,0) += v[0]; d_vatom(i,1) += v[1]; d_vatom(i,2) += v[2];
     d_vatom(i,3) += v[3]; d_vatom(i,4) += v[4]; d_vatom(i,5) += v[5];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 void *PairReaxCKokkos<DeviceType>::extract(const char *str, int &dim)
 {
   dim = 1;
   if (strcmp(str,"chi") == 0 && chi) {
     for (int i = 1; i <= atom->ntypes; i++)
       if (map[i] >= 0) chi[i] = system->reax_param.sbp[map[i]].chi;
       else chi[i] = 0.0;
     return (void *) chi;
   }
   if (strcmp(str,"eta") == 0 && eta) {
     for (int i = 1; i <= atom->ntypes; i++)
       if (map[i] >= 0) eta[i] = system->reax_param.sbp[map[i]].eta;
       else eta[i] = 0.0;
     return (void *) eta;
   }
   if (strcmp(str,"gamma") == 0 && gamma) {
     for (int i = 1; i <= atom->ntypes; i++)
       if (map[i] >= 0) gamma[i] = system->reax_param.sbp[map[i]].gamma;
       else gamma[i] = 0.0;
     return (void *) gamma;
   }
   return NULL;
 }
 
 /* ----------------------------------------------------------------------
    setup for energy, virial computation
    see integrate::ev_set() for values of eflag (0-3) and vflag (0-6)
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::ev_setup(int eflag, int vflag)
 {
   int i;
 
   evflag = 1;
 
   eflag_either = eflag;
   eflag_global = eflag % 2;
   eflag_atom = eflag / 2;
 
   vflag_either = vflag;
   vflag_global = vflag % 4;
   vflag_atom = vflag / 4;
 
   // reallocate per-atom arrays if necessary
 
   if (eflag_atom && atom->nmax > maxeatom) {
-      memory->destroy_kokkos(k_eatom,eatom);
-      memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-      v_eatom = k_eatom.view<DeviceType>();
+    maxeatom = atom->nmax;
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    v_eatom = k_eatom.view<DeviceType>();
   }
   if (vflag_atom && atom->nmax > maxvatom) {
-      memory->destroy_kokkos(k_vatom,vatom);
-      memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
-      v_vatom = k_vatom.view<DeviceType>();
+    maxvatom = atom->nmax;
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    v_vatom = k_vatom.view<DeviceType>();
   }
 
   // zero accumulators
 
   if (eflag_global) eng_vdwl = eng_coul = 0.0;
   if (vflag_global) for (i = 0; i < 6; i++) virial[i] = 0.0;
 
   // if vflag_global = 2 and pair::compute() calls virial_fdotr_compute()
   // compute global virial via (F dot r) instead of via pairwise summation
   // unset other flags as appropriate
 
   if (vflag_global == 2 && no_virial_fdotr_compute == 0) {
     vflag_fdotr = 1;
     vflag_global = 0;
     if (vflag_atom == 0) vflag_either = 0;
     if (vflag_either == 0 && eflag_either == 0) evflag = 0;
   } else vflag_fdotr = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 double PairReaxCKokkos<DeviceType>::memory_usage()
 {
   double bytes = 0.0;
 
   if (cut_hbsq > 0.0) {
     bytes += nmax*3*sizeof(int);
     bytes += maxhb*nmax*sizeof(int);
   }
   bytes += nmax*2*sizeof(int);
   bytes += maxbo*nmax*sizeof(int);
 
   bytes += nmax*17*sizeof(F_FLOAT);
   bytes += maxbo*nmax*34*sizeof(F_FLOAT);
 
   return bytes;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template class PairReaxCKokkos<LMPDeviceType>;
 #ifdef KOKKOS_HAVE_CUDA
 template class PairReaxCKokkos<LMPHostType>;
 #endif
 }
diff --git a/src/compute_omega_chunk.cpp b/src/compute_omega_chunk.cpp
index 58acc19c3..50f7db7aa 100644
--- a/src/compute_omega_chunk.cpp
+++ b/src/compute_omega_chunk.cpp
@@ -1,333 +1,334 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include <string.h>
 #include "compute_omega_chunk.h"
 #include "atom.h"
 #include "update.h"
 #include "modify.h"
 #include "compute_chunk_atom.h"
 #include "domain.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 ComputeOmegaChunk::ComputeOmegaChunk(LAMMPS *lmp, int narg, char **arg) :
   Compute(lmp, narg, arg),
-  idchunk(NULL), massproc(NULL), masstotal(NULL), com(NULL), comall(NULL), angmom(NULL), angmomall(NULL)
+  idchunk(NULL), massproc(NULL), masstotal(NULL), com(NULL), comall(NULL),
+  inertia(NULL), inertiaall(NULL), angmom(NULL), angmomall(NULL), omega(NULL)
 {
   if (narg != 4) error->all(FLERR,"Illegal compute omega/chunk command");
 
   array_flag = 1;
   size_array_cols = 3;
   size_array_rows = 0;
   size_array_rows_variable = 1;
   extarray = 0;
 
   // ID of compute chunk/atom
 
   int n = strlen(arg[3]) + 1;
   idchunk = new char[n];
   strcpy(idchunk,arg[3]);
 
   init();
 
-  // chunk-based data
+  // chunk-based data; allocate() destroys each array first, so all start NULL
 
   nchunk = 1;
   maxchunk = 0;
   allocate();
 }
 
 /* ---------------------------------------------------------------------- */
 
 ComputeOmegaChunk::~ComputeOmegaChunk()
 {
   delete [] idchunk;
-  memory->destroy(massproc);
-  memory->destroy(masstotal);
-  memory->destroy(com);
-  memory->destroy(comall);
-  memory->destroy(angmom);
-  memory->destroy(angmomall);
+  // free every per-chunk array created in allocate()
+  memory->destroy(massproc); memory->destroy(masstotal);
+  memory->destroy(com); memory->destroy(comall);
+  memory->destroy(inertia); memory->destroy(inertiaall);
+  memory->destroy(angmom); memory->destroy(angmomall);
+  memory->destroy(omega);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::init()
 {
   int icompute = modify->find_compute(idchunk);
   if (icompute < 0)
     error->all(FLERR,"Chunk/atom compute does not exist for "
                "compute omega/chunk");
   cchunk = (ComputeChunkAtom *) modify->compute[icompute];
   if (strcmp(cchunk->style,"chunk/atom") != 0)
     error->all(FLERR,"Compute omega/chunk does not use chunk/atom compute");
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::compute_array()
 {
-  int i,j,index;
+  int i,j,m,index;
   double dx,dy,dz,massone;
   double unwrap[3];
 
   invoked_array = update->ntimestep;
 
   // compute chunk/atom assigns atoms to chunk IDs
   // extract ichunk index vector from compute
   // ichunk = 1 to Nchunk for included atoms, 0 for excluded atoms
 
   nchunk = cchunk->setup_chunks();
   cchunk->compute_ichunk();
   int *ichunk = cchunk->ichunk;
 
   if (nchunk > maxchunk) allocate();
   size_array_rows = nchunk;
 
   // zero local per-chunk values
 
   for (int i = 0; i < nchunk; i++) {
     massproc[i] = 0.0;
     com[i][0] = com[i][1] = com[i][2] = 0.0;
     for (j = 0; j < 6; j++) inertia[i][j] = 0.0;
     angmom[i][0] = angmom[i][1] = angmom[i][2] = 0.0;
     omega[i][0] = omega[i][1] = omega[i][2] = 0.0;
   }
 
   // compute COM for each chunk
 
   double **x = atom->x;
   int *mask = atom->mask;
   int *type = atom->type;
   imageint *image = atom->image;
   double *mass = atom->mass;
   double *rmass = atom->rmass;
   int nlocal = atom->nlocal;
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       index = ichunk[i]-1;
       if (index < 0) continue;
       if (rmass) massone = rmass[i];
       else massone = mass[type[i]];
       domain->unmap(x[i],image[i],unwrap);
       massproc[index] += massone;
       com[index][0] += unwrap[0] * massone;
       com[index][1] += unwrap[1] * massone;
       com[index][2] += unwrap[2] * massone;
     }
 
   MPI_Allreduce(massproc,masstotal,nchunk,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&com[0][0],&comall[0][0],3*nchunk,MPI_DOUBLE,MPI_SUM,world);
 
   for (int i = 0; i < nchunk; i++) {
     if (masstotal[i] > 0.0) {
       comall[i][0] /= masstotal[i];
       comall[i][1] /= masstotal[i];
       comall[i][2] /= masstotal[i];
     }
   }
 
   // compute inertia tensor for each chunk
 
   for (i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       index = ichunk[i]-1;
       if (index < 0) continue;
       if (rmass) massone = rmass[i];
       else massone = mass[type[i]];
       domain->unmap(x[i],image[i],unwrap);
       dx = unwrap[0] - comall[index][0];
       dy = unwrap[1] - comall[index][1];
       dz = unwrap[2] - comall[index][2];
       inertia[index][0] += massone * (dy*dy + dz*dz);
       inertia[index][1] += massone * (dx*dx + dz*dz);
       inertia[index][2] += massone * (dx*dx + dy*dy);
       inertia[index][3] -= massone * dx*dy;
       inertia[index][4] -= massone * dy*dz;
       inertia[index][5] -= massone * dx*dz;
     }
 
   MPI_Allreduce(&inertia[0][0],&inertiaall[0][0],6*nchunk,
                 MPI_DOUBLE,MPI_SUM,world);
 
   // compute angmom for each chunk
 
   double **v = atom->v;
 
   for (i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
       index = ichunk[i]-1;
       if (index < 0) continue;
       domain->unmap(x[i],image[i],unwrap);
       dx = unwrap[0] - comall[index][0];
       dy = unwrap[1] - comall[index][1];
       dz = unwrap[2] - comall[index][2];
       if (rmass) massone = rmass[i];
       else massone = mass[type[i]];
       angmom[index][0] += massone * (dy*v[i][2] - dz*v[i][1]);
       angmom[index][1] += massone * (dz*v[i][0] - dx*v[i][2]);
       angmom[index][2] += massone * (dx*v[i][1] - dy*v[i][0]);
     }
 
   MPI_Allreduce(&angmom[0][0],&angmomall[0][0],3*nchunk,
                 MPI_DOUBLE,MPI_SUM,world);
 
-  // compute omega for each chunk from L = Iw, inverting I to solve for w
+  // solve L = Iw for w via inverse(I), using the all-proc sums of I and L
 
   double ione[3][3],inverse[3][3];
 
-  for (i = 0; i < nchunk; i++) {
-    ione[0][0] = inertiaall[i][0];
-    ione[1][1] = inertiaall[i][1];
-    ione[2][2] = inertiaall[i][2];
-    ione[0][1] = inertiaall[i][3];
-    ione[1][2] = inertiaall[i][4];
-    ione[0][2] = inertiaall[i][5];
+  for (m = 0; m < nchunk; m++) {
+    ione[0][0] = inertiaall[m][0];
+    ione[1][1] = inertiaall[m][1];
+    ione[2][2] = inertiaall[m][2];
+    ione[0][1] = inertiaall[m][3];
+    ione[1][2] = inertiaall[m][4];
+    ione[0][2] = inertiaall[m][5];
     ione[1][0] = ione[0][1];
     ione[2][1] = ione[1][2];
     ione[2][0] = ione[0][2];
 
     inverse[0][0] = ione[1][1]*ione[2][2] - ione[1][2]*ione[2][1];
     inverse[0][1] = -(ione[0][1]*ione[2][2] - ione[0][2]*ione[2][1]);
     inverse[0][2] = ione[0][1]*ione[1][2] - ione[0][2]*ione[1][1];
 
     inverse[1][0] = -(ione[1][0]*ione[2][2] - ione[1][2]*ione[2][0]);
     inverse[1][1] = ione[0][0]*ione[2][2] - ione[0][2]*ione[2][0];
     inverse[1][2] = -(ione[0][0]*ione[1][2] - ione[0][2]*ione[1][0]);
 
     inverse[2][0] = ione[1][0]*ione[2][1] - ione[1][1]*ione[2][0];
     inverse[2][1] = -(ione[0][0]*ione[2][1] - ione[0][1]*ione[2][0]);
     inverse[2][2] = ione[0][0]*ione[1][1] - ione[0][1]*ione[1][0];
 
     double determinant = ione[0][0]*ione[1][1]*ione[2][2] +
       ione[0][1]*ione[1][2]*ione[2][0] + ione[0][2]*ione[1][0]*ione[2][1] -
       ione[0][0]*ione[1][2]*ione[2][1] - ione[0][1]*ione[1][0]*ione[2][2] -
       ione[2][0]*ione[1][1]*ione[0][2];
 
     if (determinant > 0.0)
-      for (int i = 0; i < 3; i++)
-        for (int j = 0; j < 3; j++)
+      for (i = 0; i < 3; i++)
+        for (j = 0; j < 3; j++)
           inverse[i][j] /= determinant;
 
-    omega[i][0] = inverse[0][0]*angmom[i][0] + inverse[0][1]*angmom[i][1] +
-      inverse[0][2]*angmom[i][2];
-    omega[i][1] = inverse[1][0]*angmom[i][0] + inverse[1][1]*angmom[i][1] +
-      inverse[1][2]*angmom[i][2];
-    omega[i][2] = inverse[2][0]*angmom[i][0] + inverse[2][1]*angmom[i][1] +
-      inverse[2][2]*angmom[i][2];
+    omega[m][0] = inverse[0][0]*angmomall[m][0] +
+      inverse[0][1]*angmomall[m][1] + inverse[0][2]*angmomall[m][2];
+    omega[m][1] = inverse[1][0]*angmomall[m][0] +
+      inverse[1][1]*angmomall[m][1] + inverse[1][2]*angmomall[m][2];
+    omega[m][2] = inverse[2][0]*angmomall[m][0] +
+      inverse[2][1]*angmomall[m][1] + inverse[2][2]*angmomall[m][2];
   }
 }
 
 /* ----------------------------------------------------------------------
    lock methods: called by fix ave/time
    these methods insure vector/array size is locked for Nfreq epoch
      by passing lock info along to compute chunk/atom
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    increment lock counter
 ------------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::lock_enable()
 {
   cchunk->lockcount++;
 }
 
 /* ----------------------------------------------------------------------
-   decrement lock counter in compute chunk/atom, it if still exists
+   decrement lock counter in compute chunk/atom, if it still exists
 ------------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::lock_disable()
 {
   int icompute = modify->find_compute(idchunk);
   if (icompute >= 0) {
     cchunk = (ComputeChunkAtom *) modify->compute[icompute];
     cchunk->lockcount--;
   }
 }
 
 /* ----------------------------------------------------------------------
    calculate and return # of chunks = length of vector/array
 ------------------------------------------------------------------------- */
 
 int ComputeOmegaChunk::lock_length()
 {
   nchunk = cchunk->setup_chunks();
   return nchunk;
 }
 
 /* ----------------------------------------------------------------------
    set the lock from startstep to stopstep
 ------------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::lock(Fix *fixptr, bigint startstep, bigint stopstep)
 {
   cchunk->lock(fixptr,startstep,stopstep);
 }
 
 /* ----------------------------------------------------------------------
    unset the lock
 ------------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::unlock(Fix *fixptr)
 {
   cchunk->unlock(fixptr);
 }
 
 /* ----------------------------------------------------------------------
    free and reallocate per-chunk arrays
 ------------------------------------------------------------------------- */
 
 void ComputeOmegaChunk::allocate()
 {
   memory->destroy(massproc);
   memory->destroy(masstotal);
   memory->destroy(com);
   memory->destroy(comall);
   memory->destroy(inertia);
   memory->destroy(inertiaall);
   memory->destroy(angmom);
   memory->destroy(angmomall);
   memory->destroy(omega);
   maxchunk = nchunk;
   memory->create(massproc,maxchunk,"omega/chunk:massproc");
   memory->create(masstotal,maxchunk,"omega/chunk:masstotal");
   memory->create(com,maxchunk,3,"omega/chunk:com");
   memory->create(comall,maxchunk,3,"omega/chunk:comall");
   memory->create(inertia,maxchunk,6,"omega/chunk:inertia");
   memory->create(inertiaall,maxchunk,6,"omega/chunk:inertiaall");
   memory->create(angmom,maxchunk,3,"omega/chunk:angmom");
   memory->create(angmomall,maxchunk,3,"omega/chunk:angmomall");
   memory->create(omega,maxchunk,3,"omega/chunk:omega");
   array = omega;
 }
 
 /* ----------------------------------------------------------------------
    memory usage of local data
 ------------------------------------------------------------------------- */
 
 double ComputeOmegaChunk::memory_usage()
 {
   double bytes = (bigint) maxchunk * 2 * sizeof(double);
   bytes += (bigint) maxchunk * 2*3 * sizeof(double);
   bytes += (bigint) maxchunk * 2*6 * sizeof(double);
   bytes += (bigint) maxchunk * 2*3 * sizeof(double);
   bytes += (bigint) maxchunk * 3 * sizeof(double);
   return bytes;
 }
diff --git a/src/fix_balance.cpp b/src/fix_balance.cpp
index f29518515..500498b46 100644
--- a/src/fix_balance.cpp
+++ b/src/fix_balance.cpp
@@ -1,324 +1,323 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 #include <string.h>
 #include <stdlib.h>
 #include "fix_balance.h"
 #include "balance.h"
 #include "update.h"
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
 #include "neighbor.h"
 #include "irregular.h"
 #include "force.h"
 #include "kspace.h"
 #include "rcb.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 enum{SHIFT,BISECTION};
 enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED};    // several files
 
 /* ---------------------------------------------------------------------- */
 
 FixBalance::FixBalance(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg),
-  fp(NULL), nimbalance(0), imbalance(NULL), imb_fix(NULL), balance(NULL), irregular(NULL)
+  Fix(lmp, narg, arg), fp(NULL)
 {
   if (narg < 6) error->all(FLERR,"Illegal fix balance command");
 
   box_change_domain = 1;
   scalar_flag = 1;
   extscalar = 0;
   vector_flag = 1;
   size_vector = 3;
   extvector = 0;
   global_freq = 1;
 
   // parse arguments
 
   int dimension = domain->dimension;
 
   nevery = force->inumeric(FLERR,arg[3]);
   if (nevery < 0) error->all(FLERR,"Illegal fix balance command");
   thresh = force->numeric(FLERR,arg[4]);
 
   if (strcmp(arg[5],"shift") == 0) lbstyle = SHIFT;
   else if (strcmp(arg[5],"rcb") == 0) lbstyle = BISECTION;
   else error->all(FLERR,"Illegal fix balance command");
 
   int iarg = 5;
   if (lbstyle == SHIFT) {
     if (iarg+4 > narg) error->all(FLERR,"Illegal fix balance command");
     if (strlen(arg[iarg+1]) > 3) error->all(FLERR,"Illegal fix balance command");
     strcpy(bstr,arg[iarg+1]);
     nitermax = force->inumeric(FLERR,arg[iarg+2]);
     if (nitermax <= 0) error->all(FLERR,"Illegal fix balance command");
     stopthresh = force->numeric(FLERR,arg[iarg+3]);
     if (stopthresh < 1.0) error->all(FLERR,"Illegal fix balance command");
     iarg += 4;
   } else if (lbstyle == BISECTION) {
     iarg++;
   }
 
   // optional args
 
   outflag = 0;
   int outarg = 0;
   fp = NULL;
 
   while (iarg < narg) {
     if (strcmp(arg[iarg],"out") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal fix balance command");
       outflag = 1;
       outarg = iarg+1;
       iarg += 2;
     } else error->all(FLERR,"Illegal fix balance command");
   }
 
   // error check
 
   if (lbstyle == SHIFT) {
     int blen = strlen(bstr);
     for (int i = 0; i < blen; i++) {
       if (bstr[i] != 'x' && bstr[i] != 'y' && bstr[i] != 'z')
         error->all(FLERR,"Fix balance shift string is invalid");
       if (bstr[i] == 'z' && dimension == 2)
         error->all(FLERR,"Fix balance shift string is invalid");
       for (int j = i+1; j < blen; j++)
         if (bstr[i] == bstr[j])
           error->all(FLERR,"Fix balance shift string is invalid");
     }
   }
 
   if (lbstyle == BISECTION && comm->style == 0)
     error->all(FLERR,"Fix balance rcb cannot be used with comm_style brick");
 
   // create instance of Balance class
   // if SHIFT, initialize it with params
 
   balance = new Balance(lmp);
   if (lbstyle == SHIFT) balance->shift_setup(bstr,nitermax,thresh);
 
   // create instance of Irregular class
 
   irregular = new Irregular(lmp);
 
   // output file
 
   if (outflag && comm->me == 0) {
     fp = fopen(arg[outarg],"w");
     if (fp == NULL) error->one(FLERR,"Cannot open fix balance output file");
   }
 
   // only force reneighboring if nevery > 0
 
   if (nevery) force_reneighbor = 1;
 
   // compute initial outputs
 
   imbfinal = imbprev = balance->imbalance_nlocal(maxperproc);
   itercount = 0;
   pending = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixBalance::~FixBalance()
 {
   if (fp) fclose(fp);
   delete balance;
   delete irregular;
 }
 
 /* ---------------------------------------------------------------------- */
 
 int FixBalance::setmask()
 {
   int mask = 0;
   mask |= PRE_EXCHANGE;
   mask |= PRE_NEIGHBOR;
   return mask;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixBalance::init()
 {
   if (force->kspace) kspace_flag = 1;
   else kspace_flag = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixBalance::setup(int vflag)
 {
   // compute final imbalance factor if setup_pre_exchange() invoked balancer
   // this is called at end of run setup, before output
 
   pre_neighbor();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixBalance::setup_pre_exchange()
 {
   // insure atoms are in current box & update box via shrink-wrap
-  // has to be be done before rebalance() invokes Irregular::migrate_atoms()
+  // has to be done before rebalance() invokes Irregular::migrate_atoms()
   //   since it requires atoms be inside simulation box
   //   even though pbc() will be done again in Verlet::run()
   // no exchange() since doesn't matter if atoms are assigned to correct procs
 
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
   domain->pbc();
   domain->reset_box();
   if (domain->triclinic) domain->lamda2x(atom->nlocal);
 
-  // perform a rebalance if threshhold exceeded
+  // perform a rebalance if threshold exceeded
 
   imbnow = balance->imbalance_nlocal(maxperproc);
   if (imbnow > thresh) rebalance();
 
   // next_reneighbor = next time to force reneighboring
 
   if (nevery) next_reneighbor = (update->ntimestep/nevery)*nevery + nevery;
 }
 
 /* ----------------------------------------------------------------------
    perform dynamic load balancing
 ------------------------------------------------------------------------- */
 
 void FixBalance::pre_exchange()
 {
   // return if not a rebalance timestep
 
   if (nevery && update->ntimestep < next_reneighbor) return;
 
   // insure atoms are in current box & update box via shrink-wrap
   // no exchange() since doesn't matter if atoms are assigned to correct procs
 
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
   domain->pbc();
   domain->reset_box();
   if (domain->triclinic) domain->lamda2x(atom->nlocal);
 
-  // return if imbalance < threshhold
+  // return if imbalance <= threshold, after scheduling the next check
 
   imbnow = balance->imbalance_nlocal(maxperproc);
   if (imbnow <= thresh) {
     if (nevery) next_reneighbor = (update->ntimestep/nevery)*nevery + nevery;
     return;
   }
 
   rebalance();
 
   // next timestep to rebalance
 
   if (nevery) next_reneighbor = (update->ntimestep/nevery)*nevery + nevery;
 }
 
 /* ----------------------------------------------------------------------
    compute final imbalance factor based on nlocal after comm->exchange()
-   only do this if rebalancing just occured
+   only do this if rebalancing just occurred
 ------------------------------------------------------------------------- */
 
 void FixBalance::pre_neighbor()
 {
   if (!pending) return;
   imbfinal = balance->imbalance_nlocal(maxperproc);
   pending = 0;
 }
 
 /* ----------------------------------------------------------------------
    perform dynamic load balancing
 ------------------------------------------------------------------------- */
 
 void FixBalance::rebalance()
 {
   imbprev = imbnow;
 
-  // invoke balancer and reset comm->uniform flag
+  // invoke balancer and reset comm->layout to match the new decomposition
 
   int *sendproc;
   if (lbstyle == SHIFT) {
     itercount = balance->shift();
     comm->layout = LAYOUT_NONUNIFORM;
   } else if (lbstyle == BISECTION) {
     sendproc = balance->bisection();
     comm->layout = LAYOUT_TILED;
   }
 
   // output of new decomposition
 
   if (outflag) balance->dumpout(update->ntimestep,fp);
 
   // reset proc sub-domains
   // check and warn if any proc's subbox is smaller than neigh skin
   //   since may lead to lost atoms in exchange()
 
   if (domain->triclinic) domain->set_lamda_box();
   domain->set_local_box();
   domain->subbox_too_small_check(neighbor->skin);
 
   // move atoms to new processors via irregular()
-  // only needed if migrate_check() says an atom moves to far
+  // for SHIFT, only needed if migrate_check() says an atom moves too far
   // else allow caller's comm->exchange() to do it
 
   if (domain->triclinic) domain->x2lamda(atom->nlocal);
   if (lbstyle == BISECTION) irregular->migrate_atoms(0,1,sendproc);
   else if (irregular->migrate_check()) irregular->migrate_atoms();
   if (domain->triclinic) domain->lamda2x(atom->nlocal);
 
   // invoke KSpace setup_grid() to adjust to new proc sub-domains
 
   if (kspace_flag) force->kspace->setup_grid();
 
   // pending triggers pre_neighbor() to compute final imbalance factor
   // can only be done after atoms migrate in caller's comm->exchange()
 
   pending = 1;
 }
 
 /* ----------------------------------------------------------------------
    return imbalance factor after last rebalance
 ------------------------------------------------------------------------- */
 
 double FixBalance::compute_scalar()
 {
   return imbfinal;
 }
 
 /* ----------------------------------------------------------------------
    return stats for last rebalance
 ------------------------------------------------------------------------- */
 
 double FixBalance::compute_vector(int i)
 {
   if (i == 0) return (double) maxperproc;
   if (i == 1) return (double) itercount;
   return imbprev;
 }
 
 /* ----------------------------------------------------------------------
    return # of bytes of allocated memory
 ------------------------------------------------------------------------- */
 
 double FixBalance::memory_usage()
 {
   double bytes = irregular->memory_usage();
   if (balance->rcb) bytes += balance->rcb->memory_usage();
   return bytes;
 }
diff --git a/src/math_extra.h b/src/math_extra.h
index 0bfc7193a..3c00f7a4f 100755
--- a/src/math_extra.h
+++ b/src/math_extra.h
@@ -1,696 +1,699 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Mike Brown (SNL)
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_MATH_EXTRA_H
 #define LMP_MATH_EXTRA_H
 
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
 #include "error.h"
 
 // declarations of all MathExtra functions
 // functions declared "inline" here are defined further down in this header
 // functions declared without "inline" have no definition in this header
 //   (presumably defined in the matching .cpp file -- confirm)
 namespace MathExtra {
 
   // 3 vector operations
   // vectors are plain double pointers, indexed 0..2
 
   inline void copy3(const double *v, double *ans);
   inline void zero3(double *v);
   inline void norm3(double *v);
   inline void normalize3(const double *v, double *ans);
   inline void snormalize3(const double, const double *v, double *ans);
   inline void negate3(double *v);
   inline void scale3(double s, double *v);
   inline void add3(const double *v1, const double *v2, double *ans);
-  inline void scaleadd3(double s, const double *v1, const double *v2, double *ans);
+  inline void scaleadd3(double s, const double *v1, const double *v2, 
+                        double *ans);
   inline void sub3(const double *v1, const double *v2, double *ans);
   inline double len3(const double *v);
   inline double lensq3(const double *v);
   inline double dot3(const double *v1, const double *v2);
   inline void cross3(const double *v1, const double *v2, double *ans);
 
   // 3x3 matrix operations
   // matrices are double[3][3], indexed [row][column]
 
   inline void col2mat(const double *ex, const double *ey, const double *ez,
                       double m[3][3]);
   inline double det3(const double mat[3][3]);
   inline void diag_times3(const double *d, const double m[3][3],
                           double ans[3][3]);
   inline void times3_diag(const double m[3][3], const double *d,
                           double ans[3][3]);
   inline void plus3(const double m[3][3], const double m2[3][3],
                     double ans[3][3]);
   inline void times3(const double m[3][3], const double m2[3][3],
                      double ans[3][3]);
   inline void transpose_times3(const double m[3][3], const double m2[3][3],
                                double ans[3][3]);
   inline void times3_transpose(const double m[3][3], const double m2[3][3],
                                double ans[3][3]);
   inline void invert3(const double mat[3][3], double ans[3][3]);
   inline void matvec(const double mat[3][3], const double *vec, double *ans);
   inline void matvec(const double *ex, const double *ey, const double *ez,
                      const double *vec, double *ans);
   inline void transpose_matvec(const double mat[3][3], const double *vec,
                                double *ans);
   inline void transpose_matvec(const double *ex, const double *ey,
                                const double *ez, const double *v,
                                double *ans);
   inline void transpose_diag3(const double m[3][3], const double *d,
                               double ans[3][3]);
   inline void vecmat(const double *v, const double m[3][3], double *ans);
   inline void scalar_times3(const double f, double m[3][3]);
 
   // the functions below are not inline: declared only, not defined here
 
   void write3(const double mat[3][3]);
   int mldivide3(const double mat[3][3], const double *vec, double *ans);
   int jacobi(double matrix[3][3], double *evalues, double evectors[3][3]);
   void rotate(double matrix[3][3], int i, int j, int k, int l,
               double s, double tau);
   void richardson(double *q, double *m, double *w, double *moments, double dtq);
   void no_squish_rotate(int k, double *p, double *q, double *inertia, 
                         double dt);
 
   // shape matrix operations
   // upper-triangular 3x3 matrix stored in Voigt notation as 6-vector
 
   inline void multiply_shape_shape(const double *one, const double *two,
                                    double *ans);
 
   // quaternion operations
   // the inline versions below store the scalar part at index 0
 
   inline void qnormalize(double *q);
   inline void qconjugate(double *q, double *qc);
   inline void vecquat(double *a, double *b, double *c);
   inline void quatvec(double *a, double *b, double *c);
   inline void quatquat(double *a, double *b, double *c);
   inline void invquatvec(double *a, double *b, double *c);
   inline void axisangle_to_quat(const double *v, const double angle,
                                 double *quat);
 
   // not inline: declared only, not defined in this header
 
   void angmom_to_omega(double *m, double *ex, double *ey, double *ez,
                        double *idiag, double *w);
   void omega_to_angmom(double *w, double *ex, double *ey, double *ez,
                        double *idiag, double *m);
   void mq_to_omega(double *m, double *q, double *moments, double *w);
   void exyz_to_q(double *ex, double *ey, double *ez, double *q);
   void q_to_exyz(double *q, double *ex, double *ey, double *ez);
   void quat_to_mat(const double *quat, double mat[3][3]);
   void quat_to_mat_trans(const double *quat, double mat[3][3]);
 
   // rotation operations
 
   inline void rotation_generator_x(const double m[3][3], double ans[3][3]);
   inline void rotation_generator_y(const double m[3][3], double ans[3][3]);
   inline void rotation_generator_z(const double m[3][3], double ans[3][3]);
   
   // not inline: declared only, not defined in this header
 
   void BuildRxMatrix(double R[3][3], const double angle);
   void BuildRyMatrix(double R[3][3], const double angle);
   void BuildRzMatrix(double R[3][3], const double angle);
 
   // moment of inertia operations
   // not inline: declared only, not defined in this header
 
   void inertia_ellipsoid(double *shape, double *quat, double mass,
                          double *inertia);
   void inertia_line(double length, double theta, double mass,
                     double *inertia);
   void inertia_triangle(double *v0, double *v1, double *v2,
                         double mass, double *inertia);
   void inertia_triangle(double *idiag, double *quat, double mass,
                         double *inertia);
 }
 
 /* ----------------------------------------------------------------------
    copy a vector, return in ans
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::copy3(const double *v, double *ans)
 {
   // element-wise copy of the 3 components, lowest index first
   for (int i = 0; i < 3; i++) ans[i] = v[i];
 }
 
 /* ----------------------------------------------------------------------
    set vector equal to zero
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::zero3(double *v)
 {
   // overwrite all 3 components with 0.0
   for (int i = 0; i < 3; i++) v[i] = 0.0;
 }
 
 /* ----------------------------------------------------------------------
    normalize a vector in place
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::norm3(double *v)
 {
   // squared length of v, summed in index order 0,1,2
   const double lensq = v[0]*v[0]+v[1]*v[1]+v[2]*v[2];
 
   // reciprocal length; no check for a zero-length vector is made
   const double invlen = 1.0/sqrt(lensq);
 
   // rescale v in place
   for (int i = 0; i < 3; i++) v[i] *= invlen;
 }
 
 /* ----------------------------------------------------------------------
    normalize a vector, return in ans
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::normalize3(const double *v, double *ans)
 {
   // reciprocal length of v; no check for a zero-length vector is made
   const double invlen = 1.0/sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2]);
 
   // v itself is left untouched, the unit vector goes to ans
   for (int i = 0; i < 3; i++) ans[i] = v[i]*invlen;
 }
 
 /* ----------------------------------------------------------------------
    scale a vector to length
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::snormalize3(const double length, const double *v, 
                                    double *ans)
 {
   // ratio of the requested length to the current length of v
   // no check for a zero-length v is made
   const double ratio = length/sqrt(v[0]*v[0]+v[1]*v[1]+v[2]*v[2]);
 
   // ans = v rescaled by that ratio
   for (int i = 0; i < 3; i++) ans[i] = v[i]*ratio;
 }
 
 /* ----------------------------------------------------------------------
    negate vector v
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::negate3(double *v)
 {
   // flip the sign of each component in place
   for (int i = 0; i < 3; i++) v[i] = -v[i];
 }
 
 /* ----------------------------------------------------------------------
    scale vector v by s
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::scale3(double s, double *v)
 {
   // multiply each component by s in place
   for (int i = 0; i < 3; i++) v[i] *= s;
 }
 
 /* ----------------------------------------------------------------------
    ans = v1 + v2
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::add3(const double *v1, const double *v2, double *ans)
 {
   // component-wise sum, lowest index first
   for (int i = 0; i < 3; i++) ans[i] = v1[i] + v2[i];
 }
 
 /* ----------------------------------------------------------------------
    ans = s*v1 + v2
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::scaleadd3(double s, const double *v1, const double *v2, double *ans)
 {
   // only v1 is scaled by s, v2 is added unscaled
   for (int i = 0; i < 3; i++) ans[i] = s*v1[i] + v2[i];
 }
 
 /* ----------------------------------------------------------------------
    ans = v1 - v2
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::sub3(const double *v1, const double *v2, double *ans)
 {
   // component-wise difference, lowest index first
   for (int i = 0; i < 3; i++) ans[i] = v1[i] - v2[i];
 }
 
 /* ----------------------------------------------------------------------
    length of vector v
 ------------------------------------------------------------------------- */
 
 inline double MathExtra::len3(const double *v)
 {
   // Euclidean length = square root of the sum of squared components
   const double lensq = v[0]*v[0] + v[1]*v[1] + v[2]*v[2];
   return sqrt(lensq);
 }
 
 /* ----------------------------------------------------------------------
    squared length of vector v, or dot product of v with itself
 ------------------------------------------------------------------------- */
 
 inline double MathExtra::lensq3(const double *v)
 {
   // squares of the 3 components, added in index order
   const double xx = v[0]*v[0];
   const double yy = v[1]*v[1];
   const double zz = v[2]*v[2];
   return xx + yy + zz;
 }
 
 /* ----------------------------------------------------------------------
    dot product of 2 vectors
 ------------------------------------------------------------------------- */
 
 inline double MathExtra::dot3(const double *v1, const double *v2)
 {
   // per-component products, added in index order
   const double xterm = v1[0]*v2[0];
   const double yterm = v1[1]*v2[1];
   const double zterm = v1[2]*v2[2];
   return xterm+yterm+zterm;
 }
 
 /* ----------------------------------------------------------------------
    cross product of 2 vectors
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::cross3(const double *v1, const double *v2, double *ans)
 {
   // component i uses the cyclic successors j = i+1 and k = i+2 (mod 3):
   //   ans[i] = v1[j]*v2[k] - v1[k]*v2[j]
   // ans is written while v1,v2 are still being read,
   //   so ans should not share storage with either input
   for (int i = 0; i < 3; i++) {
     const int j = (i+1) % 3;
     const int k = (i+2) % 3;
     ans[i] = v1[j]*v2[k] - v1[k]*v2[j];
   }
 }
 
 /* ----------------------------------------------------------------------
    construct matrix from 3 column vectors
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::col2mat(const double *ex, const double *ey,
                                const double *ez, double m[3][3])
 {
   // "inline" is spelled out so this definition matches its declaration
   //   and the other function definitions in this header
 
   // column 0 of m = ex
   for (int i = 0; i < 3; i++) m[i][0] = ex[i];
 
   // column 1 of m = ey
   for (int i = 0; i < 3; i++) m[i][1] = ey[i];
 
   // column 2 of m = ez
   for (int i = 0; i < 3; i++) m[i][2] = ez[i];
 }
 
 /* ----------------------------------------------------------------------
    determinant of a matrix
 ------------------------------------------------------------------------- */
 
 inline double MathExtra::det3(const double m[3][3])
 {
   // the six triple products of the 3x3 determinant,
   //   named p* (added) and n* (subtracted),
   //   numbered by the first-column entry m[*][0] they contain
   const double p0 = m[0][0]*m[1][1]*m[2][2];
   const double n0 = m[0][0]*m[1][2]*m[2][1];
   const double n1 = m[1][0]*m[0][1]*m[2][2];
   const double p1 = m[1][0]*m[0][2]*m[2][1];
   const double p2 = m[2][0]*m[0][1]*m[1][2];
   const double n2 = m[2][0]*m[0][2]*m[1][1];
 
   // combined left to right with signs + - - + + -
   return p0 - n0 - n1 + p1 + p2 - n2;
 }
 
 /* ----------------------------------------------------------------------
    diagonal matrix times a full matrix
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::diag_times3(const double *d, const double m[3][3],
                                    double ans[3][3])
 {
   // d holds the 3 diagonal entries; row i of m is scaled by d[i]
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
       ans[i][j] = d[i]*m[i][j];
 }
 
 /* ----------------------------------------------------------------------
    full matrix times a diagonal matrix
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::times3_diag(const double m[3][3], const double *d,
                                    double ans[3][3])
 {
   // "inline" is spelled out so this definition matches its declaration
   //   and the other function definitions in this header
 
   // d holds the 3 diagonal entries; column j of m is scaled by d[j]
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
       ans[i][j] = m[i][j]*d[j];
 }
 
 /* ----------------------------------------------------------------------
    add two matrices
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::plus3(const double m[3][3], const double m2[3][3],
                              double ans[3][3])
 {
   // element-wise sum, visited row by row
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
       ans[i][j] = m[i][j]+m2[i][j];
 }
 
 /* ----------------------------------------------------------------------
    multiply mat1 times mat2
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::times3(const double m[3][3], const double m2[3][3],
                               double ans[3][3])
 {
   // ans[i][j] = row i of m dotted with column j of m2
   // ans is filled while m,m2 are still being read,
   //   so ans should not share storage with either input
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
       ans[i][j] = m[i][0]*m2[0][j] + m[i][1]*m2[1][j] + m[i][2]*m2[2][j];
 }
 
 /* ----------------------------------------------------------------------
    multiply the transpose of mat1 times mat2
 ------------------------------------------------------------------------- */
 
-inline void MathExtra::transpose_times3(const double m[3][3], const double m2[3][3],
-                                 double ans[3][3])
+inline void MathExtra::transpose_times3(const double m[3][3], 
+                                        const double m2[3][3],double ans[3][3])
 {
   // ans = transpose(m) * m2, written out element by element:
   //   ans[i][j] = m[0][i]*m2[0][j] + m[1][i]*m2[1][j] + m[2][i]*m2[2][j]
   // i.e. column i of m dotted with column j of m2
   // ans is filled while m,m2 are still being read,
   //   so ans should not share storage with either input
   ans[0][0] = m[0][0]*m2[0][0] + m[1][0]*m2[1][0] + m[2][0]*m2[2][0];
   ans[0][1] = m[0][0]*m2[0][1] + m[1][0]*m2[1][1] + m[2][0]*m2[2][1];
   ans[0][2] = m[0][0]*m2[0][2] + m[1][0]*m2[1][2] + m[2][0]*m2[2][2];
   ans[1][0] = m[0][1]*m2[0][0] + m[1][1]*m2[1][0] + m[2][1]*m2[2][0];
   ans[1][1] = m[0][1]*m2[0][1] + m[1][1]*m2[1][1] + m[2][1]*m2[2][1];
   ans[1][2] = m[0][1]*m2[0][2] + m[1][1]*m2[1][2] + m[2][1]*m2[2][2];
   ans[2][0] = m[0][2]*m2[0][0] + m[1][2]*m2[1][0] + m[2][2]*m2[2][0];
   ans[2][1] = m[0][2]*m2[0][1] + m[1][2]*m2[1][1] + m[2][2]*m2[2][1];
   ans[2][2] = m[0][2]*m2[0][2] + m[1][2]*m2[1][2] + m[2][2]*m2[2][2];
 }
 
 /* ----------------------------------------------------------------------
    multiply mat1 times transpose of mat2
 ------------------------------------------------------------------------- */
 
-inline void MathExtra::times3_transpose(const double m[3][3], const double m2[3][3],
-                                 double ans[3][3])
+inline void MathExtra::times3_transpose(const double m[3][3], 
+                                        const double m2[3][3],double ans[3][3])
 {
   // ans = m * transpose(m2), written out element by element:
   //   ans[i][j] = m[i][0]*m2[j][0] + m[i][1]*m2[j][1] + m[i][2]*m2[j][2]
   // i.e. row i of m dotted with row j of m2
   // ans is filled while m,m2 are still being read,
   //   so ans should not share storage with either input
   ans[0][0] = m[0][0]*m2[0][0] + m[0][1]*m2[0][1] + m[0][2]*m2[0][2];
   ans[0][1] = m[0][0]*m2[1][0] + m[0][1]*m2[1][1] + m[0][2]*m2[1][2];
   ans[0][2] = m[0][0]*m2[2][0] + m[0][1]*m2[2][1] + m[0][2]*m2[2][2];
   ans[1][0] = m[1][0]*m2[0][0] + m[1][1]*m2[0][1] + m[1][2]*m2[0][2];
   ans[1][1] = m[1][0]*m2[1][0] + m[1][1]*m2[1][1] + m[1][2]*m2[1][2];
   ans[1][2] = m[1][0]*m2[2][0] + m[1][1]*m2[2][1] + m[1][2]*m2[2][2];
   ans[2][0] = m[2][0]*m2[0][0] + m[2][1]*m2[0][1] + m[2][2]*m2[0][2];
   ans[2][1] = m[2][0]*m2[1][0] + m[2][1]*m2[1][1] + m[2][2]*m2[1][2];
   ans[2][2] = m[2][0]*m2[2][0] + m[2][1]*m2[2][1] + m[2][2]*m2[2][2];
 }
 
 /* ----------------------------------------------------------------------
    invert a matrix
-   does NOT checks for singular or badly scaled matrix
+   does NOT check for singular or badly scaled matrix
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::invert3(const double m[3][3], double ans[3][3])
 {
   // determinant of m, built from three pairs of triple products
   // it is used as a divisor below without any test for zero
   const double det =
     (m[0][0]*m[1][1]*m[2][2]-m[0][0]*m[1][2]*m[2][1]) +
     (-m[1][0]*m[0][1]*m[2][2]+m[1][0]*m[0][2]*m[2][1]) +
     (m[2][0]*m[0][1]*m[1][2]-m[2][0]*m[0][2]*m[1][1]);
 
   // every entry of ans is a signed 2x2 minor of m divided by det
   // ans is filled while m is still being read,
   //   so ans should not share storage with m
 
   // row 0
   ans[0][0] = (m[1][1]*m[2][2]-m[1][2]*m[2][1]) / det;
   ans[0][1] = -(m[0][1]*m[2][2]-m[0][2]*m[2][1]) / det;
   ans[0][2] = (m[0][1]*m[1][2]-m[0][2]*m[1][1]) / det;
 
   // row 1
   ans[1][0] = -(m[1][0]*m[2][2]-m[1][2]*m[2][0]) / det;
   ans[1][1] = (m[0][0]*m[2][2]-m[0][2]*m[2][0]) / det;
   ans[1][2] = -(m[0][0]*m[1][2]-m[0][2]*m[1][0]) / det;
 
   // row 2
   ans[2][0] = (m[1][0]*m[2][1]-m[1][1]*m[2][0]) / det;
   ans[2][1] = -(m[0][0]*m[2][1]-m[0][1]*m[2][0]) / det;
   ans[2][2] = (m[0][0]*m[1][1]-m[0][1]*m[1][0]) / det;
 }
 
 /* ----------------------------------------------------------------------
    matrix times vector
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::matvec(const double m[3][3], const double *v, 
                               double *ans)
 {
   // ans[i] = row i of m dotted with v
   // ans is written while v is still being read,
   //   so ans should not share storage with v
   for (int i = 0; i < 3; i++)
     ans[i] = m[i][0]*v[0] + m[i][1]*v[1] + m[i][2]*v[2];
 }
 
 /* ----------------------------------------------------------------------
    matrix times vector, matrix is given as its 3 column vectors ex,ey,ez
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::matvec(const double *ex, const double *ey, 
                               const double *ez, const double *v, double *ans)
 {
   // ex,ey,ez act as the 3 columns of the matrix:
   //   ans = v[0]*ex + v[1]*ey + v[2]*ez, evaluated component by component
   for (int i = 0; i < 3; i++)
     ans[i] = ex[i]*v[0] + ey[i]*v[1] + ez[i]*v[2];
 }
 
 /* ----------------------------------------------------------------------
    transposed matrix times vector
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::transpose_matvec(const double m[3][3], const double *v,
                                  double *ans)
 {
   // ans[i] = column i of m dotted with v
   // ans is written while v is still being read,
   //   so ans should not share storage with v
   for (int i = 0; i < 3; i++)
     ans[i] = m[0][i]*v[0] + m[1][i]*v[1] + m[2][i]*v[2];
 }
 
 /* ----------------------------------------------------------------------
    transposed matrix times vector, matrix is given as its 3 column vectors ex,ey,ez
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::transpose_matvec(const double *ex, const double *ey,
                                  const double *ez, const double *v,
                                  double *ans)
 {
   // each of ex,ey,ez is dotted with v in turn:
   //   ans[0] = ex.v, ans[1] = ey.v, ans[2] = ez.v
   const double *axes[3] = {ex, ey, ez};
   for (int i = 0; i < 3; i++)
     ans[i] = axes[i][0]*v[0] + axes[i][1]*v[1] + axes[i][2]*v[2];
 }
 
 /* ----------------------------------------------------------------------
    transposed matrix times diagonal matrix
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::transpose_diag3(const double m[3][3], const double *d,
                                 double ans[3][3])
 {
   // d holds the 3 diagonal entries
   // ans[i][j] takes the transposed element m[j][i], scaled by d[j]
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
       ans[i][j] = m[j][i]*d[j];
 }
 
 /* ----------------------------------------------------------------------
    row vector times matrix
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::vecmat(const double *v, const double m[3][3], 
                               double *ans)
 {
   // ans[j] = v dotted with column j of m
   // ans is written while v is still being read,
   //   so ans should not share storage with v
   for (int j = 0; j < 3; j++)
     ans[j] = v[0]*m[0][j] + v[1]*m[1][j] + v[2]*m[2][j];
 }
 
 /* ----------------------------------------------------------------------
    matrix times scalar, in place
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::scalar_times3(const double f, double m[3][3])
 {
   // multiply all 9 elements of m by f, visited row by row
   for (int i = 0; i < 3; i++)
     for (int j = 0; j < 3; j++)
       m[i][j] *= f;
 }
 
 /* ----------------------------------------------------------------------
    multiply 2 shape matrices
    upper-triangular 3x3, stored as 6-vector in Voigt notation
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::multiply_shape_shape(const double *one, 
                                             const double *two, double *ans)
 {
   // entries 0,1,2: each is the product of the matching entries of one,two
   for (int i = 0; i < 3; i++) ans[i] = one[i]*two[i];
 
   // entries 3,4,5: mixed sums of products
   // ans is written while one,two are still being read,
   //   so ans should not share storage with either input
   ans[3] = one[1]*two[3] + one[3]*two[2];
   ans[4] = one[0]*two[4] + one[5]*two[3] + one[4]*two[2];
   ans[5] = one[0]*two[5] + one[5]*two[1];
 }
 
 /* ----------------------------------------------------------------------
    normalize a quaternion
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::qnormalize(double *q)
 {
   // squared length of the 4 components, summed in index order
   const double lensq = q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3];
 
   // reciprocal length; no check for an all-zero q is made
   const double invlen = 1.0 / sqrt(lensq);
 
   // rescale q in place
   for (int i = 0; i < 4; i++) q[i] *= invlen;
 }
 
 /* ----------------------------------------------------------------------
    conjugate of a quaternion: qc = conjugate of q
    assume q is of unit length
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::qconjugate(double *q, double *qc)
 {
   // component 0 is copied unchanged
   qc[0] = q[0];
 
   // components 1,2,3 change sign
   for (int i = 1; i < 4; i++) qc[i] = -q[i];
 }
 
 /* ----------------------------------------------------------------------
    vector-quaternion multiply: c = a*b, where a = (0,a)
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::vecquat(double *a, double *b, double *c)
 {
   // a is read as a 3-vector (a[0..2]), b is read as 4 values (b[0..3]),
   //   c receives 4 values (c[0..3])
   // c[0] = -(a dotted with b[1..3])
   // c[1..3] = b[0]*a + (a cross b[1..3])
   // c is written while a,b are still being read,
   //   so c should not share storage with a or b
   c[0] = -a[0]*b[1] - a[1]*b[2] - a[2]*b[3];
   c[1] = b[0]*a[0] + a[1]*b[3] - a[2]*b[2];
   c[2] = b[0]*a[1] + a[2]*b[1] - a[0]*b[3];
   c[3] = b[0]*a[2] + a[0]*b[2] - a[1]*b[1];
 }
 
 /* ----------------------------------------------------------------------
    quaternion-vector multiply: c = a*b, where b = (0,b)
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::quatvec(double *a, double *b, double *c)
 {
   // a is read as 4 values (a[0..3]), b is read as a 3-vector (b[0..2]),
   //   c receives 4 values (c[0..3])
   // c[0] = -(a[1..3] dotted with b)
   // c[1..3] = a[0]*b + (a[1..3] cross b)
   // c is written while a,b are still being read,
   //   so c should not share storage with a or b
   c[0] = -a[1]*b[0] - a[2]*b[1] - a[3]*b[2];
   c[1] = a[0]*b[0] + a[2]*b[2] - a[3]*b[1];
   c[2] = a[0]*b[1] + a[3]*b[0] - a[1]*b[2];
   c[3] = a[0]*b[2] + a[1]*b[1] - a[2]*b[0];
 }
 
 /* ----------------------------------------------------------------------
    quaternion-quaternion multiply: c = a*b
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::quatquat(double *a, double *b, double *c)
 {
   // a,b are each read as 4 values, c receives 4 values
   // c[0] = a[0]*b[0] - (a[1..3] dotted with b[1..3])
   // c[1..3] = a[0]*b[1..3] + b[0]*a[1..3] + (a[1..3] cross b[1..3])
   // c is written while a,b are still being read,
   //   so c should not share storage with a or b
   c[0] = a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3];
   c[1] = a[0]*b[1] + b[0]*a[1] + a[2]*b[3] - a[3]*b[2];
   c[2] = a[0]*b[2] + b[0]*a[2] + a[3]*b[1] - a[1]*b[3];
   c[3] = a[0]*b[3] + b[0]*a[3] + a[1]*b[2] - a[2]*b[1];
 }
 
 /* ----------------------------------------------------------------------
    quaternion multiply: c = inv(a)*b
    a is a quaternion
    b is a four component vector
    c is a three component vector
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::invquatvec(double *a, double *b, double *c)
 {
   // a,b are each read as 4 values, only 3 values are written to c
   // the 3 lines are components 1,2,3 of the product (a[0],-a[1],-a[2],-a[3])*b
   //   component 0 of that product is not computed
   // c is written while a,b are still being read,
   //   so c should not share storage with a or b
   c[0] = -a[1]*b[0] + a[0]*b[1] + a[3]*b[2] - a[2]*b[3];
   c[1] = -a[2]*b[0] - a[3]*b[1] + a[0]*b[2] + a[1]*b[3];
   c[2] = -a[3]*b[0] + a[2]*b[1] - a[1]*b[2] + a[0]*b[3];
 }
 
 /* ----------------------------------------------------------------------
    compute quaternion from axis-angle rotation
    v MUST be a unit vector
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::axisangle_to_quat(const double *v, const double angle,
-                                  double *quat)
+                                         double *quat)
 {
   // angle is handed to sin() and cos(), so it is in radians
   // only half the angle enters the quaternion
   double halfa = 0.5*angle;
   double sina = sin(halfa);
 
   // quat[0] = cos(angle/2), quat[1..3] = v * sin(angle/2)
   // v is used as given, it is not normalized here
   quat[0] = cos(halfa);
   quat[1] = v[0]*sina;
   quat[2] = v[1]*sina;
   quat[3] = v[2]*sina;
 }
 
 /* ----------------------------------------------------------------------
    Apply principal rotation generator about x to rotation matrix m
 ------------------------------------------------------------------------- */
 
 inline void MathExtra::rotation_generator_x(const double m[3][3], 
                                             double ans[3][3])
 {
   // the same pattern is applied to every row i:
   //   column 0 of ans is zero
   //   column 1 of ans is minus column 2 of m
   //   column 2 of ans is column 1 of m
   for (int i = 0; i < 3; i++) {
     ans[i][0] = 0;
     ans[i][1] = -m[i][2];
     ans[i][2] = m[i][1];
   }
 }
 
 /* ----------------------------------------------------------------------
    Apply principal rotation generator about y to rotation matrix m
 ------------------------------------------------------------------------- */
 
-inline void MathExtra::rotation_generator_y(const double m[3][3], double ans[3][3])
+inline void MathExtra::rotation_generator_y(const double m[3][3], 
+                                            double ans[3][3])
 {
   // the same pattern is applied to every row:
   //   column 0 of ans is column 2 of m
   //   column 1 of ans is zero
   //   column 2 of ans is minus column 0 of m
   ans[0][0] = m[0][2];
   ans[0][1] = 0;
   ans[0][2] = -m[0][0];
   ans[1][0] = m[1][2];
   ans[1][1] = 0;
   ans[1][2] = -m[1][0];
   ans[2][0] = m[2][2];
   ans[2][1] = 0;
   ans[2][2] = -m[2][0];
 }
 
 /* ----------------------------------------------------------------------
    Apply principal rotation generator about z to rotation matrix m
 ------------------------------------------------------------------------- */
 
-inline void MathExtra::rotation_generator_z(const double m[3][3], double ans[3][3])
+inline void MathExtra::rotation_generator_z(const double m[3][3], 
+                                            double ans[3][3])
 {
   // the same pattern is applied to every row:
   //   column 0 of ans is minus column 1 of m
   //   column 1 of ans is column 0 of m
   //   column 2 of ans is zero
   ans[0][0] = -m[0][1];
   ans[0][1] = m[0][0];
   ans[0][2] = 0;
   ans[1][0] = -m[1][1];
   ans[1][1] = m[1][0];
   ans[1][2] = 0;
   ans[2][0] = -m[2][1];
   ans[2][1] = m[2][0];
   ans[2][2] = 0;
 }
 
 #endif