diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png
index 302b50124..7ec83b320 100755
Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ
diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index 9eb295e0d..f5e25d6b6 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -1,514 +1,516 @@
 "Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
 "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
 
 :link(lws,http://lammps.sandia.gov)
 :link(ld,Manual.html)
 :link(lc,Section_commands.html#comm)
 
 :line
 
 "Return to Section accelerate overview"_Section_accelerate.html
 
 5.3.2 USER-INTEL package :h5
 
 The USER-INTEL package is maintained by Mike Brown at Intel
 Corporation.  It provides two methods for accelerating simulations,
 depending on the hardware you have.  The first is acceleration on
 Intel CPUs by running in single, mixed, or double precision with
 vectorization.  The second is acceleration on Intel Xeon Phi
 coprocessors via offloading neighbor list and non-bonded force
 calculations to the Phi.  The same C++ code is used in both cases.
 When offloading to a coprocessor from a CPU, the same routine is run
 twice, once on the CPU and once with an offload flag. This allows
 LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
 
 [Currently Available USER-INTEL Styles:]
 
 Angle Styles: charmm, harmonic :ulb,l
 Bond Styles: fene, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
 Fixes: nve, npt, nvt, nvt/sllod :l
 Improper Styles: cvff, harmonic :l
-Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
-charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
+Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
+buck, eam, eam/alloy, eam/fs, gayberne, charmm/coul/charmm, 
+charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
+sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule
 
 [Speed-ups to expect:]
 
 The speedups will depend on your simulation, the hardware, which
 styles are used, the number of atoms, and the floating-point
 precision mode. Performance improvements are shown compared to
 LAMMPS {without using other acceleration packages} as these are
 under active development (and subject to performance changes). The
 measurements were performed using the input files available in
 the src/USER-INTEL/TEST directory with the provided run script.
 These are scalable in size; the results given are with 512K
 particles (524K for Liquid Crystal). Most of the simulations are
 standard LAMMPS benchmarks (indicated by the filename extension in
 parentheses) with modifications to the run length and to add a
 warmup run (for use with offload benchmarks).
 
 :c,image(JPG/user_intel.png)
 
 Results are speedups obtained on Intel Xeon E5-2697v4 processors
 (code-named Broadwell) and Intel Xeon Phi 7250 processors
 (code-named Knights Landing) with "June 2017" LAMMPS built with
 Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
 per physical core. See {src/USER-INTEL/TEST/README} for the raw
 simulation rates and instructions to reproduce.
 
 :line
 
 [Accuracy and order of operations:]
 
 In most molecular dynamics software, parallelization parameters
 (# of MPI, OpenMP, and vectorization) can change the results due
 to changing the order of operations with finite-precision
 calculations. The USER-INTEL package is deterministic. This means
 that the results should be reproducible from run to run with the
 {same} parallel configurations and when using deterministic
 libraries or library settings (MPI, OpenMP, FFT). However, there
 are differences in the USER-INTEL package that can change the
 order of operations compared to LAMMPS without acceleration:
 
 Neighbor lists can be created in a different order :ulb,l
 Bins used for sorting atoms can be oriented differently :l
 The default stencil order for PPPM is 7. By default, LAMMPS will
 calculate other PPPM parameters to fit the desired accuracy with
 this order :l
 The {newton} setting applies to all atoms, not just atoms shared
 between MPI tasks :l
 Vectorization can change the order for adding pairwise forces :l
 :ule
 
 The precision mode (described below) used with the USER-INTEL
 package can change the {accuracy} of the calculations. For the
 default {mixed} precision option, calculations between pairs or
 triplets of atoms are performed in single precision, intended to
 be within the inherent error of MD simulations. All accumulation
 is performed in double precision to prevent the error from growing
 with the number of atoms in the simulation. {Single} precision
 mode should not be used without appropriate validation.
 
 :line
 
 [Quick Start for Experienced Users:]
 
 LAMMPS should be built with the USER-INTEL package installed.
 Simulations should be run with 1 MPI task per physical {core},
 not {hardware thread}.
 
 Edit src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi as necessary. :ulb,l
 Set the environment variable KMP_BLOCKTIME=0 :l
 "-pk intel 0 omp $t -sf intel" added to LAMMPS command-line :l
 $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
 For simulations on higher node counts, add "processors * * * grid 
 numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
 :ule
 
 For Intel Xeon Phi CPUs:
 
 Runs should be performed using MCDRAM. :ulb,l
 :ule
 
 For simulations using {kspace_style pppm} on Intel CPUs
 supporting AVX-512:
 
 Add "kspace_modify diff ad" to the input script :ulb,l
 The command-line option should be changed to
 "-pk intel 0 omp $r lrt yes -sf intel" where $r is the number of
 threads minus 1. :l
 Do not use thread affinity (set KMP_AFFINITY=none) :l
 The "newton off" setting may provide better scalability :l
 :ule
 
 For Intel Xeon Phi coprocessors (Offload):
 
 Edit src/MAKE/OPTIONS/Makefile.intel_coprocessor as necessary :ulb,l
 "-pk intel N omp 1" added to command-line where N is the number of
 coprocessors per node. :l
 :ule
 
 :line
 
 [Required hardware/software:]
 
 In order to use offload to coprocessors, an Intel Xeon Phi
 coprocessor and an Intel compiler are required. For this, the
 recommended version of the Intel compiler is 14.0.1.106 or
 versions 15.0.2.044 and higher.
 
 Although any compiler can be used with the USER-INTEL package,
 currently, vectorization directives are disabled by default when
 not using Intel compilers due to lack of standard support and
 observations of decreased performance. The OpenMP standard now
 supports directives for vectorization and we plan to transition the
 code to this standard once it is available in most compilers. We
 expect this to allow improved performance and support with other
 compilers.
 
 For Intel Xeon Phi x200 series processors (code-named Knights
 Landing), there are multiple configuration options for the hardware.
 For best performance, we recommend that the MCDRAM is configured in
 "Flat" mode and with the cluster mode set to "Quadrant" or "SNC4".
 "Cache" mode can also be used, although the performance might be
 slightly lower.
 
 [Notes about Simultaneous Multithreading:]
 
 Modern CPUs often support Simultaneous Multithreading (SMT). On
 Intel processors, this is called Hyper-Threading (HT) technology.
 SMT is hardware support for running multiple threads efficiently on
 a single core. {Hardware threads} or {logical cores} are often used
 to refer to the number of threads that are supported in hardware.
 For example, the Intel Xeon E5-2697v4 processor is described
 as having 36 cores and 72 threads. This means that 36 MPI processes
 or OpenMP threads can run simultaneously on separate cores, but that
 up to 72 MPI processes or OpenMP threads can be running on the CPU
 without costly operating system context switches.
 
 Molecular dynamics simulations will often run faster when making use
 of SMT. If a thread becomes stalled, for example because it is
 waiting on data that has not yet arrived from memory, another thread
 can start running so that the CPU pipeline is still being used
 efficiently. Although benefits can be seen by launching an MPI task
 for every hardware thread, for multinode simulations, we recommend
 that OpenMP threads are used for SMT instead, either with the
 USER-INTEL package, "USER-OMP package"_accelerate_omp.html, or
 "KOKKOS package"_accelerate_kokkos.html. In the example above, up
 to 36X speedups can be observed by using all 36 physical cores with
 LAMMPS. By using all 72 hardware threads, an additional 10-30%
 performance gain can be achieved.
 
 The BIOS on many platforms allows SMT to be disabled, however, we do
 not recommend this on modern processors as there is little to no
 benefit for any software package in most cases. The operating system
 will report every hardware thread as a separate core allowing one to
 determine the number of hardware threads available. On Linux systems,
 this information can normally be obtained with:
 
 cat /proc/cpuinfo :pre
 
 [Building LAMMPS with the USER-INTEL package:]
 
 NOTE: See the src/USER-INTEL/README file for additional flags that
 might be needed for best performance on Intel server processors
 code-named "Skylake".
 
 The USER-INTEL package must be installed into the source directory:
 
 make yes-user-intel :pre
 
 Several example Makefiles for building with the Intel compiler are
 included with LAMMPS in the src/MAKE/OPTIONS/ directory:
 
 Makefile.intel_cpu_intelmpi # Intel Compiler, Intel MPI, No Offload
 Makefile.knl                # Intel Compiler, Intel MPI, No Offload
 Makefile.intel_cpu_mpich    # Intel Compiler, MPICH, No Offload
 Makefile.intel_cpu_openmpi  # Intel Compiler, OpenMPI, No Offload
 Makefile.intel_coprocessor  # Intel Compiler, Intel MPI, Offload :pre
 
 Makefile.knl is identical to Makefile.intel_cpu_intelmpi except that
 it explicitly specifies that vectorization should be for Intel
 Xeon Phi x200 processors making it easier to cross-compile. For
 users with recent installations of Intel Parallel Studio, the
 process can be as simple as:
 
 make yes-user-intel
 source /opt/intel/parallel_studio_xe_2016.3.067/psxevars.sh
 # or psxevars.csh for C-shell
 make intel_cpu_intelmpi :pre
 
 Alternatively this can be done as a single command with
 suitable make command invocations. This is discussed in "Section
 4"_Section_packages.html of the manual.
 
 Note that if you build with support for a Phi coprocessor, the same
 binary can be used on nodes with or without coprocessors installed.
 However, if you do not have coprocessors on your system, building
 without offload support will produce a smaller binary.
 
 The general requirements for Makefiles with the USER-INTEL package
 are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
 using Intel compilers, "-restrict" is required and "-qopenmp" is
 highly recommended for CCFLAGS and LINKFLAGS. LIB should include
 "-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
 is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
 Other recommended CCFLAG options for best performance are
 "-O2 -fno-alias -ansi-alias -qoverride-limits -fp-model fast=2
 -no-prec-div".
 
 NOTE: The vectorization and math capabilities can differ depending on
 the CPU. For Intel compilers, the "-x" flag specifies the type of
 processor for which to optimize. "-xHost" specifies that the compiler
 should build for the processor used for compiling. For Intel Xeon Phi
 x200 series processors, this option is "-xMIC-AVX512". For fourth
 generation Intel Xeon (v4/Broadwell) processors, "-xCORE-AVX2" should
 be used. For older Intel Xeon processors, "-xAVX" will perform best
 in general for the different simulations in LAMMPS. The default
 in most of the example Makefiles is to use "-xHost", however this
 should not be used when cross-compiling.
 
 [Running LAMMPS with the USER-INTEL package:]
 
 Running LAMMPS with the USER-INTEL package is similar to normal use
 with the exceptions that one should 1) specify that LAMMPS should use
 the USER-INTEL package, 2) specify the number of OpenMP threads, and
 3) optionally specify the specific LAMMPS styles that should use the
 USER-INTEL package. 1) and 2) can be performed from the command-line
 or by editing the input script. 3) requires editing the input script.
 Advanced performance tuning options are also described below to get
 the best performance.
 
 When running on a single node (including runs using offload to a
 coprocessor), best performance is normally obtained by using 1 MPI
 task per physical core and additional OpenMP threads with SMT. For
 Intel Xeon processors, 2 OpenMP threads should be used for SMT.
 For Intel Xeon Phi CPUs, 2 or 4 OpenMP threads should be used
 (best choice depends on the simulation). In cases where the user
 specifies that LRT mode is used (described below), 1 or 3 OpenMP
 threads should be used. For multi-node runs, using 1 MPI task per
 physical core will often perform best, however, depending on the
 machine and scale, users might get better performance by decreasing
 the number of MPI tasks and using more OpenMP threads. For
 performance, the product of the number of MPI tasks and OpenMP
 threads should not exceed the number of available hardware threads in
 almost all cases.
 
 NOTE: Setting core affinity is often used to pin MPI tasks and OpenMP
 threads to a core or group of cores so that memory access can be
 uniform. Unless disabled at build time, affinity for MPI tasks and
 OpenMP threads on the host (CPU) will be set by default
 {when using offload to a coprocessor}. In this case, it is unnecessary
 to use other methods to control affinity (e.g. taskset, numactl,
 I_MPI_PIN_DOMAIN, etc.). This can be disabled with the {no_affinity}
 option to the "package intel"_package.html command or by disabling the
 option at build time (by adding -DINTEL_OFFLOAD_NOAFFINITY to the
 CCFLAGS line of your Makefile). Disabling this option is not
 recommended, especially when running on a machine with Intel
 Hyper-Threading technology disabled.
 
 [Run with the USER-INTEL package from the command line:]
 
 To enable USER-INTEL optimizations for all available styles used in
 the input script, the "-sf intel"
 "command-line switch"_Section_start.html#start_6 can be used without
 any requirement for editing the input script. This switch will
 automatically append "intel" to styles that support it. It also
 invokes a default command: "package intel 1"_package.html. This
 package command is used to set options for the USER-INTEL package.
 The default package command will specify that USER-INTEL calculations
 are performed in mixed precision, that the number of OpenMP threads
 is specified by the OMP_NUM_THREADS environment variable, and that
 if coprocessors are present and the binary was built with offload
 support, that 1 coprocessor per node will be used with automatic
 balancing of work between the CPU and the coprocessor.
 
 You can specify different options for the USER-INTEL package by using
 the "-pk intel Nphi" "command-line switch"_Section_start.html#start_6
 with keyword/value pairs as specified in the documentation. Here,
 Nphi = # of Xeon Phi coprocessors/node (ignored without offload
 support). Common options to the USER-INTEL package include {omp} to
 override any OMP_NUM_THREADS setting and specify the number of OpenMP
 threads, {mode} to set the floating-point precision mode, and
 {lrt} to enable Long-Range Thread mode as described below. See the
 "package intel"_package.html command for details, including the
 default values used for all its options if not specified, and how to
 set the number of OpenMP threads via the OMP_NUM_THREADS environment
 variable if desired.
 
 Examples (see documentation for your MPI/Machine for differences in
 launching MPI applications):
 
 mpirun -np 72 -ppn 36 lmp_machine -sf intel -in in.script                                 # 2 nodes, 36 MPI tasks/node, $OMP_NUM_THREADS OpenMP Threads
 mpirun -np 72 -ppn 36 lmp_machine -sf intel -in in.script -pk intel 0 omp 2 mode double   # Don't use any coprocessors that might be available, use 2 OpenMP threads for each task, use double precision :pre
 
 [Or run with the USER-INTEL package by editing an input script:]
 
 As an alternative to adding command-line arguments, the input script
 can be edited to enable the USER-INTEL package. This requires adding
 the "package intel"_package.html command to the top of the input
 script. For the second example above, this would be:
 
 package intel 0 omp 2 mode double :pre
 
 To enable the USER-INTEL package only for individual styles, you can
 add an "intel" suffix to the individual style, e.g.:
 
 pair_style lj/cut/intel 2.5 :pre
 
 Alternatively, the "suffix intel"_suffix.html command can be added to
 the input script to enable USER-INTEL styles for the commands that
 follow in the input script.
 
 [Tuning for Performance:]
 
 NOTE: The USER-INTEL package will perform better with modifications
 to the input script when "PPPM"_kspace_style.html is used:
 "kspace_modify diff ad"_kspace_modify.html should be added to the
 input script.
 
 Long-Range Thread (LRT) mode is an option to the "package
 intel"_package.html command that can improve performance when using
 "PPPM"_kspace_style.html for long-range electrostatics on processors
 with SMT. It generates an extra pthread for each MPI task. The thread
 is dedicated to performing some of the PPPM calculations and MPI
 communications. On Intel Xeon Phi x200 series CPUs, this will likely
 always improve performance, even on a single node. On Intel Xeon
 processors, using this mode might result in better performance when
 using multiple nodes, depending on the machine. To use this mode,
 specify that the number of OpenMP threads is one less than would
 normally be used for the run and add the "lrt yes" option to the "-pk"
 command-line suffix or "package intel" command. For example, if a run
 would normally perform best with "-pk intel 0 omp 4", instead use
 "-pk intel 0 omp 3 lrt yes". When using LRT, you should set the
 environment variable "KMP_AFFINITY=none". LRT mode is not supported
 when using offload.
 
 NOTE: Changing the "newton"_newton.html setting to off can improve
 performance and/or scalability for simple 2-body potentials such as
 lj/cut or when using LRT mode on processors supporting AVX-512.
 
 Not all styles are supported in the USER-INTEL package. You can mix
 the USER-INTEL package with styles from the "OPT"_accelerate_opt.html
 package or the "USER-OMP package"_accelerate_omp.html. Of course,
 this requires that these packages were installed at build time. This
 can be performed automatically by using "-sf hybrid intel opt" or
 "-sf hybrid intel omp" command-line options. Alternatively, the "opt"
 and "omp" suffixes can be appended manually in the input script. For
 the latter, the "package omp"_package.html command must be in the
 input script or the "-pk omp Nt" "command-line
 switch"_Section_start.html#start_6 must be used where Nt is the
 number of OpenMP threads. The number of OpenMP threads should not be
 set differently for the different packages. Note that the "suffix
 hybrid intel omp"_suffix.html command can also be used within the
 input script to automatically append the "omp" suffix to styles when
 USER-INTEL styles are not available.
 
 NOTE: For simulations on higher node counts, add "processors * * * 
 grid numa"_processors.html to the beginning of the input script for
 better scalability.
 
 When running on many nodes, performance might be better when using
 fewer OpenMP threads and more MPI tasks. This will depend on the
 simulation and the machine. Using the "verlet/split"_run_style.html
 run style might also give better performance for simulations with
 "PPPM"_kspace_style.html electrostatics. Note that this is an
 alternative to LRT mode and the two cannot be used together.
 
 Currently, when using Intel MPI with Intel Xeon Phi x200 series
 CPUs, better performance might be obtained by setting the
 environment variable "I_MPI_SHM_LMT=shm" for Linux kernels that do
 not yet have full support for AVX-512. Runs on Intel Xeon Phi x200
 series processors will always perform better using MCDRAM. Please
 consult your system documentation for the best approach to specify
 that MPI runs are performed in MCDRAM.
 
 [Tuning for Offload Performance:]
 
 The default settings for offload should give good performance.
 
 When using LAMMPS with offload to Intel coprocessors, best performance
 will typically be achieved with concurrent calculations performed on
 both the CPU and the coprocessor. This is achieved by offloading only
 a fraction of the neighbor and pair computations to the coprocessor or
 using "hybrid"_pair_hybrid.html pair styles where only one style uses
 the "intel" suffix. For simulations with long-range electrostatics or
 bond, angle, dihedral, improper calculations, computation and data
 transfer to the coprocessor will run concurrently with computations
 and MPI communications for these calculations on the host CPU. This
 is illustrated in the figure below for the rhodopsin protein benchmark
 running on E5-2697v2 processors with an Intel Xeon Phi 7120p
 coprocessor. In this plot, the vertical axis is time and routines
 running at the same time are running concurrently on both the host and
 the coprocessor.
 
 :c,image(JPG/offload_knc.png)
 
 The fraction of the offloaded work is controlled by the {balance}
 keyword in the "package intel"_package.html command. A balance of 0
 runs all calculations on the CPU.  A balance of 1 runs all
 supported calculations on the coprocessor.  A balance of 0.5 runs half
 of the calculations on the coprocessor.  Setting the balance to -1
 (the default) will enable dynamic load balancing that continuously
 adjusts the fraction of offloaded work throughout the simulation.
 Because data transfer cannot be timed, this option typically produces
 results within 5 to 10 percent of the optimal fixed balance.
 
 If running short benchmark runs with dynamic load balancing, adding a
 short warm-up run (10-20 steps) will allow the load-balancer to find a
 near-optimal setting that will carry over to additional runs.
 
 The default for the "package intel"_package.html command is to have
 all the MPI tasks on a given compute node use a single Xeon Phi
 coprocessor.  In general, running with a large number of MPI tasks on
 each node will perform best with offload.  Each MPI task will
 automatically get affinity to a subset of the hardware threads
 available on the coprocessor.  For example, if your card has 61 cores,
 with 60 cores available for offload and 4 hardware threads per core
 (240 total threads), running with 24 MPI tasks per node will cause
 each MPI task to use a subset of 10 threads on the coprocessor.  Fine
 tuning of the number of threads to use per MPI task or the number of
 threads to use per core can be accomplished with keyword settings of
 the "package intel"_package.html command.
 
 The USER-INTEL package has two modes for deciding which atoms will be
 handled by the coprocessor.  This choice is controlled with the {ghost}
 keyword of the "package intel"_package.html command.  When set to 0,
 ghost atoms (atoms at the borders between MPI tasks) are not offloaded
 to the card.  This allows for overlap of MPI communication of forces
 with computation on the coprocessor when the "newton"_newton.html
 setting is "on".  The default is dependent on the style being used,
 however, better performance may be achieved by setting this option
 explicitly.
 
 When using offload with CPU Hyper-Threading disabled, it may help
 performance to use fewer MPI tasks and OpenMP threads than available
 cores.  This is due to the fact that additional threads are generated
 internally to handle the asynchronous offload tasks.
 
 If pair computations are being offloaded to an Intel Xeon Phi
 coprocessor, a diagnostic line is printed to the screen (not to the
 log file), during the setup phase of a run, indicating that offload
 mode is being used and indicating the number of coprocessor threads
 per MPI task.  Additionally, an offload timing summary is printed at
 the end of each run.  When offloading, the frequency for "atom
 sorting"_atom_modify.html is changed to 1 so that the per-atom data is
 effectively sorted at every rebuild of the neighbor lists. All the
 available coprocessor threads on each Phi will be divided among MPI
 tasks, unless the {tptask} option of the "-pk intel" "command-line
 switch"_Section_start.html#start_6 is used to limit the coprocessor
 threads per MPI task.
 
 [Restrictions:]
 
 When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles
 that require skip lists for neighbor builds cannot be offloaded.
 Using "hybrid/overlay"_pair_hybrid.html is allowed.  Only one intel
 accelerated style may be used with hybrid styles.
 "Special_bonds"_special_bonds.html exclusion lists are not currently
 supported with offload, however, the same effect can often be
 accomplished by setting cutoffs for excluded atom types to 0.  None of
 the pair styles in the USER-INTEL package currently support the
 "inner", "middle", "outer" options for rRESPA integration via the
 "run_style respa"_run_style.html command; only the "pair" option is
 supported.
 
 [References:]
 
 Brown, W.M., Carrillo, J.-M.Y., Mishra, B., Gavhane, N., Thakkar, F.M., De Kraker, A.R., Yamada, M., Ang, J.A., Plimpton, S.J., "Optimizing Classical Molecular Dynamics in LAMMPS," in Intel Xeon Phi Processor High Performance Programming: Knights Landing Edition, J. Jeffers, J. Reinders, A. Sodani, Eds. Morgan Kaufmann. :ulb,l
 
 Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. "Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency."_http://dl.acm.org/citation.cfm?id=3014915 2016 High Performance Computing, Networking, Storage and Analysis, SC16: International Conference (pp. 82-95). :l
 
 Brown, W.M., Carrillo, J.-M.Y., Gavhane, N., Thakkar, F.M., Plimpton, S.J. Optimizing Legacy Molecular Dynamics Software with Directive-Based Offload. Computer Physics Communications. 2015. 195: p. 95-101. :l
 :ule
 
 
 
 
diff --git a/doc/src/pair_airebo.txt b/doc/src/pair_airebo.txt
index e66ecb637..1aa017f27 100644
--- a/doc/src/pair_airebo.txt
+++ b/doc/src/pair_airebo.txt
@@ -1,243 +1,246 @@
 "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
 
 :link(lws,http://lammps.sandia.gov)
 :link(ld,Manual.html)
 :link(lc,Section_commands.html#comm)
 
 :line
 
 pair_style airebo command :h3
+pair_style airebo/intel command :h3
 pair_style airebo/omp command :h3
 pair_style airebo/morse command :h3
+pair_style airebo/morse/intel command :h3
 pair_style airebo/morse/omp command :h3
 pair_style rebo command :h3
+pair_style rebo/intel command :h3
 pair_style rebo/omp command :h3
 
 [Syntax:]
 
 pair_style style cutoff LJ_flag TORSION_flag cutoff_min :pre
 
 style = {airebo} or {airebo/morse} or {rebo}
 cutoff = LJ or Morse cutoff (sigma scale factor) (AIREBO and AIREBO-M only)
 LJ_flag = 0/1 to turn off/on the LJ or Morse term (AIREBO and AIREBO-M only, optional)
 TORSION_flag = 0/1 to turn off/on the torsion term (AIREBO and AIREBO-M only, optional)
 cutoff_min = Start of the transition region of cutoff (sigma scale factor) (AIREBO and AIREBO-M only, optional) :ul
 
 [Examples:]
 
 pair_style airebo 3.0
 pair_style airebo 2.5 1 0
 pair_coeff * * ../potentials/CH.airebo H C :pre
 
 pair_style airebo/morse 3.0
 pair_coeff * * ../potentials/CH.airebo-m H C :pre
 
 pair_style rebo
 pair_coeff * * ../potentials/CH.airebo H C :pre
 
 [Description:]
 
 The {airebo} pair style computes the Adaptive Intermolecular Reactive
 Empirical Bond Order (AIREBO) Potential of "(Stuart)"_#Stuart for a
 system of carbon and/or hydrogen atoms.  Note that this is the initial
 formulation of AIREBO from 2000, not the later formulation.
 
 The {airebo/morse} pair style computes the AIREBO-M potential, which
 is equivalent to AIREBO, but replaces the LJ term with a Morse potential.
 The Morse potentials are parameterized by high-quality quantum chemistry
 (MP2) calculations and do not diverge as quickly as particle density
 increases. This allows AIREBO-M to retain accuracy to much higher pressures
 than AIREBO (up to 40 GPa for Polyethylene). Details for this potential
 and its parameterization are given in "(O'Connor)"_#OConnor.
 
 The {rebo} pair style computes the Reactive Empirical Bond Order (REBO)
 Potential of "(Brenner)"_#Brenner. Note that this is the so-called
 2nd generation REBO from 2002, not the original REBO from 1990.
 As discussed below, 2nd generation REBO is closely related to the
 initial AIREBO; it is just a subset of the potential energy terms.
 
 The AIREBO potential consists of three terms:
 
 :c,image(Eqs/pair_airebo.jpg)
 
 By default, all three terms are included.  For the {airebo} style, if
 the first two optional flag arguments to the pair_style command are
 included, the LJ and torsional terms can be turned off.  Note that
 both or neither of the flags must be included.  If both of the LJ and
 torsional terms are turned off, it becomes the 2nd-generation REBO
 potential, with a small caveat on the spline fitting procedure
 mentioned below.  This can be specified directly as pair_style {rebo}
 with no additional arguments.
 
 The detailed formulas for this potential are given in
 "(Stuart)"_#Stuart; here we provide only a brief description.
 
 The E_REBO term has the same functional form as the hydrocarbon REBO
 potential developed in "(Brenner)"_#Brenner.  The coefficients for
 E_REBO in AIREBO are essentially the same as Brenner's potential, but
 a few fitted spline values are slightly different.  For most cases the
 E_REBO term in AIREBO will produce the same energies, forces and
 statistical averages as the original REBO potential from which it was
 derived.  The E_REBO term in the AIREBO potential gives the model its
 reactive capabilities and only describes short-ranged C-C, C-H and H-H
 interactions (r < 2 Angstroms). These interactions have strong
 coordination-dependence through a bond order parameter, which adjusts
 the attraction between the I,J atoms based on the position of other
 nearby atoms and thus has 3- and 4-body dependence.
 
 The E_LJ term adds longer-ranged interactions (2 < r < cutoff) using a
 form similar to the standard "Lennard Jones potential"_pair_lj.html.
 The E_LJ term in AIREBO contains a series of switching functions so
 that the short-ranged LJ repulsion (1/r^12) does not interfere with
 the energetics captured by the E_REBO term.  The extent of the E_LJ
 interactions is determined by the {cutoff} argument to the pair_style
 command which is a scale factor.  For each type pair (C-C, C-H, H-H)
 the cutoff is obtained by multiplying the scale factor by the sigma
 value defined in the potential file for that type pair.  In the
 standard AIREBO potential, sigma_CC = 3.4 Angstroms, so with a scale
 factor of 3.0 (the argument in pair_style), the resulting E_LJ cutoff
 would be 10.2 Angstroms.
 
 By default, the longer-ranged interaction is smoothly switched off
 between 2.16 and 3.0 sigma. By specifying {cutoff_min} in addition
 to {cutoff}, the switching can be configured to take place between 
 {cutoff_min} and {cutoff}. {cutoff_min} can only be specified if all
 optional arguments are given.
 
 The E_TORSION term is an explicit 4-body potential that describes
 various dihedral angle preferences in hydrocarbon configurations.
 
 :line
 
 Only a single pair_coeff command is used with the {airebo}, {airebo/morse}
 or {rebo} style which specifies an AIREBO or AIREBO-M potential file
 with parameters for C and H.  Note that the {rebo} style in LAMMPS
 uses the same AIREBO-formatted potential file.  These are mapped to
 LAMMPS atom types by specifying N additional arguments after the
 filename in the pair_coeff command, where N is the number of LAMMPS
 atom types:
 
 filename
 N element names = mapping of AIREBO elements to atom types :ul
 
 See the "pair_coeff"_pair_coeff.html doc page for alternate ways
 to specify the path for the potential file.
 
 As an example, if your LAMMPS simulation has 4 atom types and you want
 the 1st 3 to be C, and the 4th to be H, you would use the following
 pair_coeff command:
 
 pair_coeff * * CH.airebo C C C H :pre
 
 The 1st 2 arguments must be * * so as to span all LAMMPS atom types.
 The first three C arguments map LAMMPS atom types 1,2,3 to the C
 element in the AIREBO file.  The final H argument maps LAMMPS atom
 type 4 to the H element in the AIREBO file.  If a mapping value is
 specified as NULL, the mapping is not performed.  This can be used
 when an {airebo} potential is used as part of the {hybrid} pair style.
 The NULL values are placeholders for atom types that will be used with
 other potentials.
 
 The parameters/coefficients for the AIREBO potentials are listed in
 the CH.airebo file to agree with the original "(Stuart)"_#Stuart
 paper.  Thus the parameters are specific to this potential and the way
 it was fit, so modifying the file should be done cautiously.
 
 Similarly the parameters/coefficients for the AIREBO-M potentials are
 listed in the CH.airebo-m file to agree with the "(O'Connor)"_#OConnor
 paper. Thus the parameters are specific to this potential and the way
 it was fit, so modifying the file should be done cautiously. The
 AIREBO-M Morse potentials were parameterized using a cutoff of
 3.0 (sigma). Modifying this cutoff may impact simulation accuracy.
 
 This pair style tallies a breakdown of the total AIREBO potential
 energy into sub-categories, which can be accessed via the "compute
 pair"_compute_pair.html command as a vector of values of length 3.
 The 3 values correspond to the following sub-categories:
 
 {E_REBO} = REBO energy
 {E_LJ} = Lennard-Jones energy
 {E_TORSION} = Torsion energy :ol
 
 To print these quantities to the log file (with descriptive column
 headings) the following commands could be included in an input script:
 
 compute 0 all pair airebo
 variable REBO     equal c_0\[1\]
 variable LJ       equal c_0\[2\]
 variable TORSION  equal c_0\[3\]
 thermo_style custom step temp epair v_REBO v_LJ v_TORSION :pre
 
 :line
 
 Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
 functionally the same as the corresponding style without the suffix.
 They have been optimized to run faster, depending on your available
 hardware, as discussed in "Section 5"_Section_accelerate.html
 of the manual.  The accelerated styles take the same arguments and
 should produce the same results, except for round-off and precision
 issues.
 
 These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
 USER-OMP and OPT packages, respectively.  They are only enabled if
 LAMMPS was built with those packages.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.
 
 You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the "-suffix command-line
 switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
 use the "suffix"_suffix.html command in your input script.
 
 See "Section 5"_Section_accelerate.html of the manual for
 more instructions on how to use the accelerated styles effectively.
 
 :line
 
 [Mixing, shift, table, tail correction, restart, rRESPA info]:
 
 These pair styles do not support the "pair_modify"_pair_modify.html
 mix, shift, table, and tail options.
 
 These pair styles do not write their information to "binary restart
 files"_restart.html, since it is stored in potential files.  Thus, you
 need to re-specify the pair_style and pair_coeff commands in an input
 script that reads a restart file.
 
 These pair styles can only be used via the {pair} keyword of the
 "run_style respa"_run_style.html command.  They do not support the
 {inner}, {middle}, {outer} keywords.
 
 [Restrictions:]
 
 These pair styles are part of the MANYBODY package.  They are only
 enabled if LAMMPS was built with that package.  See the
 "Making LAMMPS"_Section_start.html#start_3 section for more info.
 
 These pair potentials require the "newton"_newton.html setting to be
 "on" for pair interactions.
 
 The CH.airebo and CH.airebo-m potential files provided with LAMMPS
 (see the potentials directory) are parameterized for metal "units"_units.html.
 You can use the AIREBO, AIREBO-M or REBO potential with any LAMMPS units,
 but you would need to create your own AIREBO or AIREBO-M potential file
 with coefficients listed in the appropriate units, if your simulation
 doesn't use "metal" units.
 
 [Related commands:]
 
 "pair_coeff"_pair_coeff.html
 
 [Default:] none
 
 :line
 
 :link(Stuart)
 [(Stuart)] Stuart, Tutein, Harrison, J Chem Phys, 112, 6472-6486
 (2000).
 
 :link(Brenner)
 [(Brenner)] Brenner, Shenderova, Harrison, Stuart, Ni, Sinnott, J
 Physics: Condensed Matter, 14, 783-802 (2002).
 
 :link(OConnor)
 [(O'Connor)] O'Connor et al., J. Chem. Phys. 142, 024903 (2015).
diff --git a/doc/src/pair_charmm.txt b/doc/src/pair_charmm.txt
index ef4ef41c9..75a8e4bff 100644
--- a/doc/src/pair_charmm.txt
+++ b/doc/src/pair_charmm.txt
@@ -1,269 +1,270 @@
 "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
 
 :link(lws,http://lammps.sandia.gov)
 :link(ld,Manual.html)
 :link(lc,Section_commands.html#comm)
 
 :line
 
 pair_style lj/charmm/coul/charmm command :h3
+pair_style lj/charmm/coul/charmm/intel command :h3
 pair_style lj/charmm/coul/charmm/omp command :h3
 pair_style lj/charmm/coul/charmm/implicit command :h3
 pair_style lj/charmm/coul/charmm/implicit/omp command :h3
 pair_style lj/charmm/coul/long command :h3
 pair_style lj/charmm/coul/long/gpu command :h3
 pair_style lj/charmm/coul/long/intel command :h3
 pair_style lj/charmm/coul/long/opt command :h3
 pair_style lj/charmm/coul/long/omp command :h3
 pair_style lj/charmm/coul/msm command :h3
 pair_style lj/charmm/coul/msm/omp command :h3
 pair_style lj/charmmfsw/coul/charmmfsh command :h3
 pair_style lj/charmmfsw/coul/long command :h3
 
 [Syntax:]
 
 pair_style style args :pre
 
 style = {lj/charmm/coul/charmm} or {lj/charmm/coul/charmm/implicit} or {lj/charmm/coul/long} or {lj/charmm/coul/msm} or {lj/charmmfsw/coul/charmmfsh} or {lj/charmmfsw/coul/long}
 args = list of arguments for a particular style :ul
   {lj/charmm/coul/charmm} args = inner outer (inner2) (outer2)
     inner, outer = global switching cutoffs for Lennard Jones (and Coulombic if only 2 args)
     inner2, outer2 = global switching cutoffs for Coulombic (optional)
   {lj/charmm/coul/charmm/implicit} args = inner outer (inner2) (outer2)
     inner, outer = global switching cutoffs for LJ (and Coulombic if only 2 args)
     inner2, outer2 = global switching cutoffs for Coulombic (optional)
   {lj/charmm/coul/long} args = inner outer (cutoff)
     inner, outer = global switching cutoffs for LJ (and Coulombic if only 2 args)
     cutoff = global cutoff for Coulombic (optional, outer is Coulombic cutoff if only 2 args)
   {lj/charmm/coul/msm} args = inner outer (cutoff)
     inner, outer = global switching cutoffs for LJ (and Coulombic if only 2 args)
     cutoff = global cutoff for Coulombic (optional, outer is Coulombic cutoff if only 2 args)
   {lj/charmmfsw/coul/charmmfsh} args = inner outer (cutoff)
     inner, outer = global cutoffs for LJ (and Coulombic if only 2 args)
     cutoff = global cutoff for Coulombic (optional, outer is Coulombic cutoff if only 2 args)
   {lj/charmmfsw/coul/long} args = inner outer (cutoff)
     inner, outer = global cutoffs for LJ (and Coulombic if only 2 args)
     cutoff = global cutoff for Coulombic (optional, outer is Coulombic cutoff if only 2 args) :pre
 
 [Examples:]
 
 pair_style lj/charmm/coul/charmm 8.0 10.0
 pair_style lj/charmm/coul/charmm 8.0 10.0 7.0 9.0
 pair_style lj/charmmfsw/coul/charmmfsh 10.0 12.0
 pair_style lj/charmmfsw/coul/charmmfsh 10.0 12.0 9.0
 pair_coeff * * 100.0 2.0
 pair_coeff 1 1 100.0 2.0 150.0 3.5 :pre
 
 pair_style lj/charmm/coul/charmm/implicit 8.0 10.0
 pair_style lj/charmm/coul/charmm/implicit 8.0 10.0 7.0 9.0
 pair_coeff * * 100.0 2.0
 pair_coeff 1 1 100.0 2.0 150.0 3.5 :pre
 
 pair_style lj/charmm/coul/long 8.0 10.0
 pair_style lj/charmm/coul/long 8.0 10.0 9.0
 pair_style lj/charmmfsw/coul/long 8.0 10.0
 pair_style lj/charmmfsw/coul/long 8.0 10.0 9.0
 pair_coeff * * 100.0 2.0
 pair_coeff 1 1 100.0 2.0 150.0 3.5 :pre
 
 pair_style lj/charmm/coul/msm 8.0 10.0
 pair_style lj/charmm/coul/msm 8.0 10.0 9.0
 pair_coeff * * 100.0 2.0
 pair_coeff 1 1 100.0 2.0 150.0 3.5 :pre
 
 [Description:]
 
 These pair styles compute Lennard Jones (LJ) and Coulombic
 interactions with additional switching or shifting functions that ramp
 the energy and/or force smoothly to zero between an inner and outer
 cutoff.  They are implementations of the widely used CHARMM force
 field used in the "CHARMM"_http://www.scripps.edu/brooks MD code (and
 others).  See "(MacKerell)"_#pair-MacKerell for a description of the
 CHARMM force field.
 
 The styles with {charmm} (not {charmmfsw} or {charmmfsh}) in their
 name are the older, original LAMMPS implementations.  They compute the
 LJ and Coulombic interactions with an energy switching function (esw,
 shown in the formula below as S(r)), which ramps the energy smoothly
 to zero between the inner and outer cutoff.  This can cause
 irregularities in pair-wise forces (due to the discontinuous 2nd
 derivative of energy at the boundaries of the switching region), which
 in some cases can result in detectable artifacts in an MD simulation.
 
 The newer styles with {charmmfsw} or {charmmfsh} in their name replace
 the energy switching with force switching (fsw) and force shifting
 (fsh) functions, for LJ and Coulombic interactions respectively.
 These follow the formulas and description given in
 "(Steinbach)"_#Steinbach and "(Brooks)"_#Brooks1 to minimize these
 artifacts.
 
 NOTE: The newer {charmmfsw} or {charmmfsh} styles were released in
 March 2017.  We recommend they be used instead of the older {charmm}
 styles.  This includes the newer "dihedral_style
 charmmfsw"_dihedral_charmm.html command.  Eventually code from the new
 styles will propagate into the related pair styles (e.g. implicit,
 accelerator, free energy variants).
 
 NOTE: The newest CHARMM pair styles reset the Coulombic energy
 conversion factor used internally in the code, from the LAMMPS value
 to the CHARMM value, as if it were effectively a parameter of the
 force field.  This is because the CHARMM code uses a slightly
 different value for this conversion factor in "real
 units"_units.html (Kcal/mole), namely CHARMM = 332.0716, LAMMPS =
 332.06371.  This is to enable more precise agreement by LAMMPS with
 the CHARMM force field energies and forces, when using one of these
 two CHARMM pair styles.
 
 :c,image(Eqs/pair_charmm.jpg)
 
 where S(r) is the energy switching function mentioned above for the
 {charmm} styles.  See the "(Steinbach)"_#Steinbach paper for the
 functional forms of the force switching and force shifting functions
 used in the {charmmfsw} and {charmmfsh} styles.
 
 When using the {lj/charmm/coul/charmm} styles, both the LJ and
 Coulombic terms require an inner and outer cutoff. They can be the
 same for both formulas or different depending on whether 2 or 4
 arguments are used in the pair_style command.  For the
 {lj/charmmfsw/coul/charmmfsh} style, the LJ term requires both an
 inner and outer cutoff, while the Coulombic term requires only one
 cutoff.  If the Coulomb cutoff is not specified (2 instead of 3
 arguments), the LJ outer cutoff is used for the Coulombic cutoff.  In
 all cases where an inner and outer cutoff are specified, the inner
 cutoff distance must be less than the outer cutoff.  It is typical to
 make the difference between the inner and outer cutoffs about 2.0
 Angstroms.
 
 Style {lj/charmm/coul/charmm/implicit} computes the same formulas as
 style {lj/charmm/coul/charmm} except that an additional 1/r term is
 included in the Coulombic formula.  The Coulombic energy thus varies
 as 1/r^2.  This is effectively a distance-dependent dielectric term
 which is a simple model for an implicit solvent with additional
 screening.  It is designed for use in a simulation of an unsolvated
 biomolecule (no explicit water molecules).
 
 Styles {lj/charmm/coul/long} and {lj/charmm/coul/msm} compute the same
 formulas as style {lj/charmm/coul/charmm} and style
 {lj/charmmfsw/coul/long} computes the same formulas as style
 {lj/charmmfsw/coul/charmmfsh}, except that an additional damping
 factor is applied to the Coulombic term, so it can be used in
 conjunction with the "kspace_style"_kspace_style.html command and its
 {ewald} or {pppm} or {msm} option.  Only one Coulombic cutoff is
 specified for these styles; if only 2 arguments are used in the
 pair_style command, then the outer LJ cutoff is used as the single
 Coulombic cutoff.  The Coulombic cutoff specified for these styles
 means that pairwise interactions within this distance are computed
 directly; interactions outside that distance are computed in
 reciprocal space.
 
 The following coefficients must be defined for each pair of atom
 types via the "pair_coeff"_pair_coeff.html command as in the examples
 above, or in the data file or restart files read by the
 "read_data"_read_data.html or "read_restart"_read_restart.html
 commands, or by mixing as described below:
 
 epsilon (energy units)
 sigma (distance units)
 epsilon_14 (energy units)
 sigma_14 (distance units) :ul
 
 Note that sigma is defined in the LJ formula as the zero-crossing
 distance for the potential, not as the energy minimum at 2^(1/6)
 sigma.
 
 The latter 2 coefficients are optional.  If they are specified, they
 are used in the LJ formula between 2 atoms of these types which are
 also first and fourth atoms in any dihedral.  No cutoffs are specified
 because the CHARMM force field does not allow varying cutoffs for
 individual atom pairs; all pairs use the global cutoff(s) specified in
 the pair_style command.
 
 :line
 
 Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
 functionally the same as the corresponding style without the suffix.
 They have been optimized to run faster, depending on your available
 hardware, as discussed in "Section 5"_Section_accelerate.html
 of the manual.  The accelerated styles take the same arguments and
 should produce the same results, except for round-off and precision
 issues.
 
 These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
 USER-OMP and OPT packages, respectively.  They are only enabled if
 LAMMPS was built with those packages.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.
 
 You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the "-suffix command-line
 switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
 use the "suffix"_suffix.html command in your input script.
 
 See "Section 5"_Section_accelerate.html of the manual for
 more instructions on how to use the accelerated styles effectively.
 
 :line
 
 [Mixing, shift, table, tail correction, restart, rRESPA info]:
 
 For atom type pairs I,J and I != J, the epsilon, sigma, epsilon_14,
 and sigma_14 coefficients for all of the lj/charmm pair styles can be
 mixed.  The default mix value is {arithmetic} to coincide with the
 usual settings for the CHARMM force field.  See the
 "pair_modify"_pair_modify.html command for details.
 
 None of the {lj/charmm} or {lj/charmmfsw} pair styles support the
 "pair_modify"_pair_modify.html shift option, since the Lennard-Jones
 portion of the pair interaction is smoothed to 0.0 at the cutoff.
 
 The {lj/charmm/coul/long} and {lj/charmmfsw/coul/long} styles support
 the "pair_modify"_pair_modify.html table option since they can
 tabulate the short-range portion of the long-range Coulombic
 interaction.
 
 None of the {lj/charmm} or {lj/charmmfsw} pair styles support the
 "pair_modify"_pair_modify.html tail option for adding long-range tail
 corrections to energy and pressure, since the Lennard-Jones portion of
 the pair interaction is smoothed to 0.0 at the cutoff.
 
 All of the {lj/charmm} and {lj/charmmfsw} pair styles write their
 information to "binary restart files"_restart.html, so pair_style and
 pair_coeff commands do not need to be specified in an input script
 that reads a restart file.
 
 The {lj/charmm/coul/long} and {lj/charmmfsw/coul/long} pair styles
 support the use of the {inner}, {middle}, and {outer} keywords of the
 "run_style respa"_run_style.html command, meaning the pairwise forces
 can be partitioned by distance at different levels of the rRESPA
 hierarchy.  The other styles only support the {pair} keyword of
 run_style respa.  See the "run_style"_run_style.html command for
 details.
 
 :line
 
 [Restrictions:]
 
 All the styles with {coul/charmm} or {coul/charmmfsh} in their name are part
 of the MOLECULE package.  All the styles with {coul/long} in their name are
 part of the KSPACE package.  They are only enabled if LAMMPS was built
 with those packages.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.  Note that
 the MOLECULE and KSPACE packages are installed by default.
 
 [Related commands:]
 
 "pair_coeff"_pair_coeff.html
 
 [Default:] none
 
 :line
 
 :link(Brooks1)
 [(Brooks)] Brooks, et al, J Comput Chem, 30, 1545 (2009).
 
 :link(pair-MacKerell)
 [(MacKerell)] MacKerell, Bashford, Bellott, Dunbrack, Evanseck, Field,
 Fischer, Gao, Guo, Ha, et al, J Phys Chem, 102, 3586 (1998).
 
 :link(Steinbach)
 [(Steinbach)] Steinbach, Brooks, J Comput Chem, 15, 667 (1994).
 
diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt
index ce8495aff..a0026432e 100644
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@@ -1,448 +1,450 @@
 "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
 
 :link(lws,http://lammps.sandia.gov)
 :link(ld,Manual.html)
 :link(lc,Section_commands.html#comm)
 
 :line
 
 pair_style eam command :h3
 pair_style eam/gpu command :h3
 pair_style eam/intel command :h3
 pair_style eam/kk command :h3
 pair_style eam/omp command :h3
 pair_style eam/opt command :h3
 pair_style eam/alloy command :h3
 pair_style eam/alloy/gpu command :h3
+pair_style eam/alloy/intel command :h3
 pair_style eam/alloy/kk command :h3
 pair_style eam/alloy/omp command :h3
 pair_style eam/alloy/opt command :h3
 pair_style eam/cd command :h3
 pair_style eam/cd/omp command :h3
 pair_style eam/fs command :h3
 pair_style eam/fs/gpu command :h3
+pair_style eam/fs/intel command :h3
 pair_style eam/fs/kk command :h3
 pair_style eam/fs/omp command :h3
 pair_style eam/fs/opt command :h3
 
 [Syntax:]
 
 pair_style style :pre
 
 style = {eam} or {eam/alloy} or {eam/cd} or {eam/fs} :ul
 
 [Examples:]
 
 pair_style eam
 pair_coeff * * cuu3
 pair_coeff 1*3 1*3 niu3.eam :pre
 
 pair_style eam/alloy
 pair_coeff * * ../potentials/NiAlH_jea.eam.alloy Ni Al Ni Ni :pre
 
 pair_style eam/cd
 pair_coeff * * ../potentials/FeCr.cdeam Fe Cr :pre
 
 pair_style eam/fs
 pair_coeff * * NiAlH_jea.eam.fs Ni Al Ni Ni :pre
 
 [Description:]
 
 Style {eam} computes pairwise interactions for metals and metal alloys
 using embedded-atom method (EAM) potentials "(Daw)"_#Daw.  The total
 energy Ei of an atom I is given by
 
 :c,image(Eqs/pair_eam.jpg)
 
 where F is the embedding energy which is a function of the atomic
 electron density rho, phi is a pair potential interaction, and alpha
 and beta are the element types of atoms I and J.  The multi-body
 nature of the EAM potential is a result of the embedding energy term.
 Both summations in the formula are over all neighbors J of atom I
 within the cutoff distance.
 
 The cutoff distance and the tabulated values of the functionals F,
 rho, and phi are listed in one or more files which are specified by
 the "pair_coeff"_pair_coeff.html command.  These are ASCII text files
 in a DYNAMO-style format which is described below.  DYNAMO was the
 original serial EAM MD code, written by the EAM originators.  Several
 DYNAMO potential files for different metals are included in the
 "potentials" directory of the LAMMPS distribution.  All of these files
 are parameterized in terms of LAMMPS "metal units"_units.html.
 
 NOTE: The {eam} style reads single-element EAM potentials in the
 DYNAMO {funcfl} format.  Either single element or alloy systems can be
 modeled using multiple {funcfl} files and style {eam}.  For the alloy
 case LAMMPS mixes the single-element potentials to produce alloy
 potentials, the same way that DYNAMO does.  Alternatively, a single
 DYNAMO {setfl} file or Finnis/Sinclair EAM file can be used by LAMMPS
 to model alloy systems by invoking the {eam/alloy} or {eam/cd} or
 {eam/fs} styles as described below.  These files require no mixing
 since they specify alloy interactions explicitly.
 
 NOTE: Note that unlike for other potentials, cutoffs for EAM
 potentials are not set in the pair_style or pair_coeff command; they
 are specified in the EAM potential files themselves.  Likewise, the
 EAM potential files list atomic masses; thus you do not need to use
 the "mass"_mass.html command to specify them.
 
 There are several WWW sites that distribute and document EAM
 potentials stored in DYNAMO or other formats:
 
 http://www.ctcms.nist.gov/potentials
 http://cst-www.nrl.navy.mil/ccm6/ap
 http://enpub.fulton.asu.edu/cms/potentials/main/main.htm :pre
 
 These potentials should be usable with LAMMPS, though the alternate
 formats would need to be converted to the DYNAMO format used by LAMMPS
 and described on this page.  The NIST site is maintained by Chandler
 Becker (cbecker at nist.gov) who is a good resource for info on
 interatomic potentials and file formats.
 
 :line
 
 For style {eam}, potential values are read from a file that is in the
 DYNAMO single-element {funcfl} format.  If the DYNAMO file was created
 by a Fortran program, it cannot have "D" values in it for exponents.
 C only recognizes "e" or "E" for scientific notation.
 
 Note that unlike for other potentials, cutoffs for EAM potentials are
 not set in the pair_style or pair_coeff command; they are specified in
 the EAM potential files themselves.
 
 For style {eam} a potential file must be assigned to each I,I pair of
 atom types by using one or more pair_coeff commands, each with a
 single argument:
 
 filename :ul
 
 Thus the following command
 
 pair_coeff *2 1*2 cuu3.eam :pre
 
 will read the cuu3 potential file and use the tabulated Cu values for
 F, phi, rho that it contains for type pairs 1,1 and 2,2 (type pairs
 1,2 and 2,1 are ignored).  See the "pair_coeff"_pair_coeff.html doc
 page for alternate ways to specify the path for the potential file.
 In effect, this makes atom types 1 and 2 in LAMMPS be Cu atoms.
 Different single-element files can be assigned to different atom types
 to model an alloy system.  The mixing to create alloy potentials for
 type pairs with I != J is done automatically the same way that the
 serial DYNAMO code originally did it; you do not need to specify
 coefficients for these type pairs.
 
 {Funcfl} files in the {potentials} directory of the LAMMPS
 distribution have an ".eam" suffix.  A DYNAMO single-element {funcfl}
 file is formatted as follows:
 
 line 1: comment (ignored)
 line 2: atomic number, mass, lattice constant, lattice type (e.g. FCC)
 line 3: Nrho, drho, Nr, dr, cutoff :ul
 
 On line 2, all values but the mass are ignored by LAMMPS.  The mass is
 in mass "units"_units.html, e.g. mass number or grams/mole for metal
 units.  The cubic lattice constant is in Angstroms.  On line 3, Nrho
 and Nr are the number of tabulated values in the subsequent arrays,
 drho and dr are the spacing in density and distance space for the
 values in those arrays, and the specified cutoff becomes the pairwise
 cutoff used by LAMMPS for the potential.  The units of dr are
 Angstroms; I'm not sure of the units for drho - some measure of
 electron density.
 
 Following the three header lines are three arrays of tabulated values:
 
 embedding function F(rho) (Nrho values)
 effective charge function Z(r) (Nr values)
 density function rho(r) (Nr values) :ul
 
 The values for each array can be listed as multiple values per line,
 so long as each array starts on a new line.  For example, the
 individual Z(r) values are for r = 0,dr,2*dr, ... (Nr-1)*dr.
 
 The units for the embedding function F are eV.  The units for the
 density function rho are the same as for drho (see above, electron
 density).  The units for the effective charge Z are "atomic charge" or
 sqrt(Hartree * Bohr-radii).  For two interacting atoms i,j this is used
 by LAMMPS to compute the pair potential term in the EAM energy
 expression as r*phi, in units of eV-Angstroms, via the formula
 
 r*phi = 27.2 * 0.529 * Zi * Zj :pre
 
 where 1 Hartree = 27.2 eV and 1 Bohr = 0.529 Angstroms.
 
 :line
 
 Style {eam/alloy} computes pairwise interactions using the same
 formula as style {eam}.  However the associated
 "pair_coeff"_pair_coeff.html command reads a DYNAMO {setfl} file
 instead of a {funcfl} file.  {Setfl} files can be used to model a
 single-element or alloy system.  In the alloy case, as explained
 above, {setfl} files contain explicit tabulated values for alloy
 interactions.  Thus they allow more generality than {funcfl} files for
 modeling alloys.
 
 For style {eam/alloy}, potential values are read from a file that is
 in the DYNAMO multi-element {setfl} format, except that element names
 (Ni, Cu, etc) are added to one of the lines in the file.  If the
 DYNAMO file was created by a Fortran program, it cannot have "D"
 values in it for exponents.  C only recognizes "e" or "E" for
 scientific notation.
 
 Only a single pair_coeff command is used with the {eam/alloy} style
 which specifies a DYNAMO {setfl} file, which contains information for
 M elements.  These are mapped to LAMMPS atom types by specifying N
 additional arguments after the filename in the pair_coeff command,
 where N is the number of LAMMPS atom types:
 
 filename
 N element names = mapping of {setfl} elements to atom types :ul
 
 As an example, the potentials/NiAlH_jea.eam.alloy file is a {setfl}
 file which has tabulated EAM values for 3 elements and their alloy
 interactions: Ni, Al, and H.  See the "pair_coeff"_pair_coeff.html doc
 page for alternate ways to specify the path for the potential file.
 If your LAMMPS simulation has 4 atom types and you want the 1st 3 to
 be Ni, and the 4th to be Al, you would use the following pair_coeff
 command:
 
 pair_coeff * * NiAlH_jea.eam.alloy Ni Ni Ni Al :pre
 
 The 1st 2 arguments must be * * so as to span all LAMMPS atom types.
 The first three Ni arguments map LAMMPS atom types 1,2,3 to the Ni
 element in the {setfl} file.  The final Al argument maps LAMMPS atom
 type 4 to the Al element in the {setfl} file.  Note that there is no
 requirement that your simulation use all the elements specified by the
 {setfl} file.
 
 If a mapping value is specified as NULL, the mapping is not performed.
 This can be used when an {eam/alloy} potential is used as part of the
 {hybrid} pair style.  The NULL values are placeholders for atom types
 that will be used with other potentials.
 
 {Setfl} files in the {potentials} directory of the LAMMPS distribution
 have an ".eam.alloy" suffix.  A DYNAMO multi-element {setfl} file is
 formatted as follows:
 
 lines 1,2,3 = comments (ignored)
 line 4: Nelements Element1 Element2 ... ElementN
 line 5: Nrho, drho, Nr, dr, cutoff :ul
 
 In a DYNAMO {setfl} file, line 4 only lists Nelements = the # of
 elements in the {setfl} file.  For LAMMPS, the element name (Ni, Cu,
 etc) of each element must be added to the line, in the order the
 elements appear in the file.
 
 The meaning and units of the values in line 5 are the same as for the
 {funcfl} file described above.  Note that the cutoff (in Angstroms) is
 a global value, valid for all pairwise interactions for all element
 pairings.
 
 Following the 5 header lines are Nelements sections, one for each
 element, each with the following format:
 
 line 1 = atomic number, mass, lattice constant, lattice type (e.g. FCC)
 embedding function F(rho) (Nrho values)
 density function rho(r) (Nr values) :ul
 
 As with the {funcfl} files, only the mass (in mass "units"_units.html,
 e.g. mass number or grams/mole for metal units) is used by LAMMPS from
 the 1st line.  The cubic lattice constant is in Angstroms.  The F and
 rho arrays are unique to a single element and have the same format and
 units as in a {funcfl} file.
 
 Following the Nelements sections, Nr values for each pair potential
 phi(r) array are listed for all i,j element pairs in the same format
 as other arrays.  Since these interactions are symmetric (i,j = j,i)
 only phi arrays with i >= j are listed, in the following order: i,j =
 (1,1), (2,1), (2,2), (3,1), (3,2), (3,3), (4,1), ..., (Nelements,
 Nelements).  Unlike the effective charge array Z(r) in {funcfl} files,
 the tabulated values for each phi function are listed in {setfl} files
 directly as r*phi (in units of eV-Angstroms), since they are for atom
 pairs.
 
 :line
 
 Style {eam/cd} is similar to the {eam/alloy} style, except that it
 computes alloy pairwise interactions using the concentration-dependent
 embedded-atom method (CD-EAM).  This model can reproduce the enthalpy
 of mixing of alloys over the full composition range, as described in
 "(Stukowski)"_#Stukowski.
 
 The pair_coeff command is specified the same as for the {eam/alloy}
 style.  However the DYNAMO {setfl} file must have two
 lines added to it, at the end of the file:
 
 line 1: Comment line (ignored)
 line 2: N Coefficient0 Coefficient1 ... CoefficientN :ul
 
 The last line begins with the degree {N} of the polynomial function
 {h(x)} that modifies the cross interaction between A and B elements.
 The {N+1} coefficients for the terms of the polynomial are then
 listed.
 
 Modified EAM {setfl} files used with the {eam/cd} style must contain
 exactly two elements, i.e. in the current implementation the {eam/cd}
 style only supports binary alloys.  The first and second elements in
 the input EAM file are always taken as the {A} and {B} species.
 
 {CD-EAM} files in the {potentials} directory of the LAMMPS
 distribution have a ".cdeam" suffix.
 
 :line
 
 Style {eam/fs} computes pairwise interactions for metals and metal
 alloys using a generalized form of EAM potentials due to Finnis and
 Sinclair "(Finnis)"_#Finnis.  The total energy Ei of an atom I is
 given by
 
 :c,image(Eqs/pair_eam_fs.jpg)
 
 This has the same form as the EAM formula above, except that rho is
 now a functional specific to the atomic types of both atoms I and J,
 so that different elements can contribute differently to the total
 electron density at an atomic site depending on the identity of the
 element at that atomic site.
 
 The associated "pair_coeff"_pair_coeff.html command for style {eam/fs}
 reads a DYNAMO {setfl} file that has been extended to include
 additional rho_alpha_beta arrays of tabulated values.  A discussion of
 how FS EAM differs from conventional EAM alloy potentials is given in
 "(Ackland1)"_#Ackland1.  An example of such a potential is the same
 author's Fe-P FS potential "(Ackland2)"_#Ackland2.  Note that while FS
 potentials always specify the embedding energy with a square root
 dependence on the total density, the implementation in LAMMPS does not
 require that; the user can tabulate any functional form desired in the
 FS potential files.
 
 For style {eam/fs}, the form of the pair_coeff command is exactly the
 same as for style {eam/alloy}, e.g.
 
 pair_coeff * * NiAlH_jea.eam.fs Ni Ni Ni Al :pre
 
 where there are N additional arguments after the filename, where N is
 the number of LAMMPS atom types.  See the "pair_coeff"_pair_coeff.html
 doc page for alternate ways to specify the path for the potential
 file.  The N values determine the mapping of LAMMPS atom types to EAM
 elements in the file, as described above for style {eam/alloy}.  As
 with {eam/alloy}, if a mapping value is NULL, the mapping is not
 performed.  This can be used when an {eam/fs} potential is used as
 part of the {hybrid} pair style.  The NULL values are used as
 placeholders for atom types that will be used with other potentials.
 
 FS EAM files include more information than the DYNAMO {setfl} format
 files read by {eam/alloy}, in that i,j density functionals for all
 pairs of elements are included as needed by the Finnis/Sinclair
 formulation of the EAM.
 
 FS EAM files in the {potentials} directory of the LAMMPS distribution
 have an ".eam.fs" suffix.  They are formatted as follows:
 
 lines 1,2,3 = comments (ignored)
 line 4: Nelements Element1 Element2 ... ElementN
 line 5: Nrho, drho, Nr, dr, cutoff :ul
 
 The 5-line header section is identical to an EAM {setfl} file.
 
 Following the header are Nelements sections, one for each element I,
 each with the following format:
 
 line 1 = atomic number, mass, lattice constant, lattice type (e.g. FCC)
 embedding function F(rho) (Nrho values)
 density function rho(r) for element I at element 1 (Nr values)
 density function rho(r) for element I at element 2
 ...
 density function rho(r) for element I at element Nelement :ul
 
 The units of these quantities in line 1 are the same as for {setfl}
 files.  Note that the rho(r) arrays in Finnis/Sinclair can be
 asymmetric (i,j != j,i) so there are Nelements^2 of them listed in the
 file.
 
 Following the Nelements sections, Nr values for each pair potential
 phi(r) array are listed in the same manner (r*phi, units of
 eV-Angstroms) as in EAM {setfl} files.  Note that in Finnis/Sinclair,
 the phi(r) arrays are still symmetric, so only phi arrays for i >= j
 are listed.
 
 :line
 
 Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
 functionally the same as the corresponding style without the suffix.
 They have been optimized to run faster, depending on your available
 hardware, as discussed in "Section 5"_Section_accelerate.html
 of the manual.  The accelerated styles take the same arguments and
 should produce the same results, except for round-off and precision
 issues.
 
 These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
 USER-OMP and OPT packages, respectively.  They are only enabled if
 LAMMPS was built with those packages.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.
 
 You can specify the accelerated styles explicitly in your input script
 by including their suffix, or you can use the "-suffix command-line
 switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
 use the "suffix"_suffix.html command in your input script.
 
 See "Section 5"_Section_accelerate.html of the manual for more
 instructions on how to use the accelerated styles effectively.
 
 :line
 
 [Mixing, shift, table, tail correction, restart, rRESPA info]:
 
 For atom type pairs I,J and I != J, where types I and J correspond to
 two different element types, mixing is performed by LAMMPS as
 described above with the individual styles.  You never need to specify
 a pair_coeff command with I != J arguments for the eam styles.
 
 This pair style does not support the "pair_modify"_pair_modify.html
 shift, table, and tail options.
 
 The eam pair styles do not write their information to "binary restart
 files"_restart.html, since it is stored in tabulated potential files.
 Thus, you need to re-specify the pair_style and pair_coeff commands in
 an input script that reads a restart file.
 
 The eam pair styles can only be used via the {pair} keyword of the
 "run_style respa"_run_style.html command.  They do not support the
 {inner}, {middle}, {outer} keywords.
 
 :line
 
 [Restrictions:]
 
 All of these styles except the {eam/cd} style are part of the MANYBODY
 package.  They are only enabled if LAMMPS was built with that package.
 See the "Making LAMMPS"_Section_start.html#start_3 section for more info.
 
 The {eam/cd} style is part of the USER-MISC package and also requires
 the MANYBODY package.  It is only enabled if LAMMPS was built with
 those packages.  See the "Making LAMMPS"_Section_start.html#start_3
 section for more info.
 
 [Related commands:]
 
 "pair_coeff"_pair_coeff.html
 
 [Default:] none
 
 :line
 
 :link(Ackland1)
 [(Ackland1)] Ackland, Condensed Matter (2005).
 
 :link(Ackland2)
 [(Ackland2)] Ackland, Mendelev, Srolovitz, Han and Barashev, Journal
 of Physics: Condensed Matter, 16, S2629 (2004).
 
 :link(Daw)
 [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983).
 Daw, Baskes, Phys Rev B, 29, 6443 (1984).
 
 :link(Finnis)
 [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).
 
 :link(Stukowski)
 [(Stukowski)] Stukowski, Sadigh, Erhart, Caro; Modeling Simulation
 Materials Science & Engineering, 7, 075005 (2009).
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
index ac8279949..6a4c4c14b 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@@ -1,123 +1,123 @@
 # intel_cpu_intelmpi = USER-INTEL package, Intel MPI, MKL FFT
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings, all OPTIONAL
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings
 # see possible settings in Section 2.2 (step 4) of manual
 
 LMP_INC =	-DLAMMPS_GZIP
 
 # MPI library
 # see discussion in Section 2.2 (step 5) of manual
 # MPI wrapper compiler/linker can provide this info
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
 MPI_PATH = 
 MPI_LIB =
 
 # FFT library
 # see discussion in Section 2.2 (step 6) of manual
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
 FFT_INC =       -DFFT_MKL -DFFT_SINGLE
 FFT_PATH = 
 FFT_LIB =       -L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 \
                 -lmkl_sequential -lmkl_core	
 
 # JPEG and/or PNG library
 # see discussion in Section 2.2 (step 7) of manual
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # do not edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
 EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 %.o:%.cu $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 # Individual dependencies
 
 depend : fastdep.exe $(SRC)
 	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
 
 fastdep.exe: ../DEPEND/fastdep.c
 	cc -O -o $@ $<
 
 sinclude .depend
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
index 389a578f7..d4cbdbdb0 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@@ -1,122 +1,122 @@
 # intel_cpu_mpich = USER-INTEL package, MPICH with compiler set to Intel icc
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx -cxx=icc
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx -cxx=icc
 LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
 LIB =           
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings, all OPTIONAL
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings
 # see possible settings in Section 2.2 (step 4) of manual
 
 LMP_INC =	-DLAMMPS_GZIP
 
 # MPI library
 # see discussion in Section 2.2 (step 5) of manual
 # MPI wrapper compiler/linker can provide this info
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/local/include
 MPI_PATH =      -L/usr/local/lib
 MPI_LIB =	-lmpich -lmpl -lpthread
 
 # FFT library
 # see discussion in Section 2.2 (step 6) of manual
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
 FFT_INC =       
 FFT_PATH = 
 FFT_LIB =       
 
 # JPEG and/or PNG library
 # see discussion in Section 2.2 (step 7) of manual
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # do not edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
 EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 %.o:%.cu $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 # Individual dependencies
 
 depend : fastdep.exe $(SRC)
 	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
 
 fastdep.exe: ../DEPEND/fastdep.c
 	cc -O -o $@ $<
 
 sinclude .depend
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
index b65905440..50433ce4c 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@@ -1,123 +1,123 @@
 # intel_cpu_openmpi = USER-INTEL package, OpenMPI with compiler set to Intel icc
 
 SHELL = /bin/sh
 
 # ---------------------------------------------------------------------
 # compiler/linker settings
 # specify flags and libraries needed for your compiler
 
 export OMPI_CXX = icc
 CC =		mpicxx
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
 LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc -ltbbmalloc_proxy
 SIZE =		size
 
 ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings, all OPTIONAL
 # specify settings for LAMMPS features you will use
 # if you change any -D setting, do full re-compile after "make clean"
 
 # LAMMPS ifdef settings
 # see possible settings in Section 2.2 (step 4) of manual
 
 LMP_INC =	-DLAMMPS_GZIP
 
 # MPI library
 # see discussion in Section 2.2 (step 5) of manual
 # MPI wrapper compiler/linker can provide this info
 # can point to dummy MPI library in src/STUBS as in Makefile.serial
 # use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
 # INC = path for mpi.h, MPI compiler settings
 # PATH = path for MPI library
 # LIB = name of MPI library
 
 MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/local/include
 MPI_PATH =      
 MPI_LIB =	
 
 # FFT library
 # see discussion in Section 2.2 (step 6) of manual
 # can be left blank to use provided KISS FFT library
 # INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
 # PATH = path for FFT library
 # LIB = name of FFT library
 
 FFT_INC =       
 FFT_PATH = 
 FFT_LIB =       
 
 # JPEG and/or PNG library
 # see discussion in Section 2.2 (step 7) of manual
 # only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
 # INC = path(s) for jpeglib.h and/or png.h
 # PATH = path(s) for JPEG library and/or PNG library
 # LIB = name(s) of JPEG library and/or PNG library
 
 JPG_INC =       
 JPG_PATH = 	
 JPG_LIB =	
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
 # do not edit this section
 
 include	Makefile.package.settings
 include	Makefile.package
 
 EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
 EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
 
 # Path to src files
 
 vpath %.cpp ..
 vpath %.h ..
 
 # Link target
 
 $(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
 	$(SIZE) $(EXE)
 
 # Library targets
 
 lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
 
 shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
         $(OBJ) $(EXTRA_LIB) $(LIB)
 
 # Compilation rules
 
 %.o:%.cpp $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 %.d:%.cpp $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
 
 %.o:%.cu $(EXTRA_CPP_DEPENDS)
 	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
 
 # Individual dependencies
 
 depend : fastdep.exe $(SRC)
 	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
 
 fastdep.exe: ../DEPEND/fastdep.c
 	cc -O -o $@ $<
 
 sinclude .depend
diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
index 736059aa0..275b4839f 100644
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@@ -1,67 +1,68 @@
 # Install/unInstall package files in LAMMPS
 # mode = 0/1/2 for uninstall/install/update
 
 mode=$1
 
 # arg1 = file, arg2 = file it depends on
 
 action () {
   if (test $mode = 0) then
     rm -f ../$1
   elif (! cmp -s $1 ../$1) then
     if (test -z "$2" || test -e ../$2) then
       cp $1 ..
       if (test $mode = 2) then
         echo "  updating src/$1"
       fi
     fi
   elif (test -n "$2") then
     if (test ! -e ../$2) then
       rm -f ../$1
     fi
   fi
 }
 
 # step 1: process all *_intel.cpp and *_intel.h files.
 # do not install child files if parent does not exist
 
 for file in *_intel.cpp; do
   dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
       sed 's/_offload_intel//g' | sed 's/_intel//g'`
   action $file $dep
 done
 
 for file in *_intel.h; do
   dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
   action $file $dep
 done
 
 action intel_preprocess.h
 action intel_buffers.h
 action intel_buffers.cpp
 action math_extra_intel.h
 action nbin_intel.h
 action nbin_intel.cpp
 action npair_intel.h
 action npair_intel.cpp
 action intel_simd.h pair_sw_intel.cpp
 action intel_intrinsics.h pair_tersoff_intel.cpp
+action intel_intrinsics_airebo.h pair_airebo_intel.cpp
 action verlet_lrt_intel.h pppm.cpp
 action verlet_lrt_intel.cpp pppm.cpp
 
 # step 2: handle cases and tasks not handled in step 1.
 
 if (test $mode = 1) then
 
   if (test -e ../Makefile.package) then
     sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package
     sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_USER_INTEL |' ../Makefile.package
   fi
 
 elif (test $mode = 0) then
 
   if (test -e ../Makefile.package) then
     sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package
   fi
 
 fi
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index c02014d0c..3b8444605 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -1,80 +1,80 @@
 
                      --------------------------------
                           LAMMPS Intel(R) Package
                      --------------------------------
                      
              W. Michael Brown (Intel) michael.w.brown at intel.com
+                  Markus Hohnerbach (RWTH Aachen University)
                    William McDoniel (RWTH Aachen University)
                    Rodrigo Canales (RWTH Aachen University)
-                  Markus H�hnerbach (RWTH Aachen University)
                            Stan Moore (Sandia)
 		   Ahmed E. Ismail (RWTH Aachen University)
                    Paolo Bientinesi (RWTH Aachen University)
                           Anupama Kurpad (Intel)
                           Biswajit Mishra (Shell)
 
 -----------------------------------------------------------------------------
 
 This package provides LAMMPS styles that:
 
    1. include support for single and mixed precision in addition to double.
    2. include modifications to support vectorization for key routines
    3. include modifications for data layouts to improve cache efficiency
    4. include modifications to support offload to Intel(R) Xeon Phi(TM) 
       coprocessors
 
 -----------------------------------------------------------------------------
 
 For Intel server processors codenamed "Skylake", the following flags should
 be added or changed in the Makefile depending on the version:
 
 2017 update 2         - No changes needed
 2017 updates 3 or 4   - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
 2018 or newer         - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
 
 -----------------------------------------------------------------------------
 
 When using the suffix command with "intel", intel styles will be used if they
 exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP 
 package is installed, USER-OMP styles will be used whenever USER-INTEL styles
 are not available. This allows for running most styles in LAMMPS with threading.
 
 -----------------------------------------------------------------------------
 
 The Long-Range Thread mode (LRT) in the Intel package currently uses
 pthreads by default. If pthreads are not supported in the build environment,
 the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for 
 builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
 build with compilers that support threads using the C++11 standard. When using
 LRT mode, you might need to disable OpenMP affinity settings (e.g.
 export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
 need to be changed.
 
 -----------------------------------------------------------------------------
 
 In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag 
 -DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of 
 Intel compilers.
 
 -----------------------------------------------------------------------------
 
 For portability reasons, vectorization directives are currently only enabled 
 for Intel compilers. Using other compilers may result in significantly
 lower performance. This behavior can be changed by defining 
 LMP_SIMD_COMPILER for the preprocessor (see intel_preprocess.h).
 
 -----------------------------------------------------------------------------
 
 By default, when running with offload to Intel(R) coprocessors, affinity
 for host MPI tasks and OpenMP threads is set automatically within the code.
 This currently requires the use of system calls. To disable at build time,
 compile with -DINTEL_OFFLOAD_NOAFFINITY.
 
 -----------------------------------------------------------------------------
 
 Vector intrinsics are temporarily being used for the Stillinger-Weber 
 potential to allow for advanced features in the AVX512 instruction set to
 be exploited on early hardware. We hope to see compiler improvements for
 AVX512 that will eliminate this requirement, so it is not recommended to
 develop code based on the intrinsics implementation. Please e-mail the 
 authors for more details.
diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README
index 758c37bf5..434189dd2 100644
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@@ -1,125 +1,127 @@
 #############################################################################
 # Benchmarks
 #
 # in.intel.lj -	        Atomic fluid (LJ Benchmark)
 # in.intel.rhodo -      Protein (Rhodopsin Benchmark)
 # in.intel.lc -	        Liquid Crystal w/ Gay-Berne potential
 # in.intel.eam -	Copper benchmark with Embedded Atom Method
 # in.intel.sw -	        Silicon benchmark with Stillinger-Weber
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
+# in.intel.airebo -     Polyethylene benchmark with AIREBO
 #
 #############################################################################
 
 #############################################################################
 # Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
 #  - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
 #
 #                     Xeon E5-2697v4     Xeon Phi 7250
 #                    
 # in.intel.lj -            199.5               282.3
 # in.intel.rhodo -          12.4                17.5
 # in.intel.lc -	            19.0                25.7
 # in.intel.eam -            59.4                92.8
 # in.intel.sw -	           132.4               161.9
 # in.intel.tersoff -        83.3               101.1
 # in.intel.water -          53.4                90.3
+# in.intel.airebo -          7.3                11.8
 #
 #############################################################################
 
 #############################################################################
 # For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README
 # for build flags that should be used. 
 #############################################################################
 
 #############################################################################
 # For Haswell (Xeon v3) architectures, depending on the compiler version, 
 # it may give better performance to compile for an AVX target (with -xAVX 
 # compiler option) instead of -xHost or -xCORE-AVX2 for some of the 
 # workloads. In most cases, FMA sensitive routines will still use AVX2 
 # (MKL and SVML detect the processor at runtime). For Broadwell (Xeon v4)
 # architectures, -xCORE-AVX2 or -xHost will work best for all.
 #############################################################################
 
 #############################################################################
 # The default benchmark timesteps will run between 30s and 1 minute with
 # the Intel package. You can specify a multiplier for all of the benchmarks
 # to increase or decrease the runtime. Example commandline arguments:
 #
 # -v m 2		# Run for twice as long
 # -v m 0.5		# Run for half as long
 #############################################################################
 
 #############################################################################
 # The LAMMPS newton setting can be controlled from the commandline for the
 # benchmarks with the N variable:
 #
 # -v N on		# newton on
 # -v N off		# newton off
 #
 # The default is on for all of the benchmarks except for LJ where the off
 # setting performs best with the USER-INTEL package
 #############################################################################
 
 #	Example for running benchmarks (see run_benchmarks.sh for script):
 
 # 	Number of physical cores per node not including hyperthreads
 export LMP_CORES=28
 
 #      If hyperthreading is enabled, number of hyperthreads to use per core
 #      (2 for Xeon; 2 or 4 for Xeon Phi)
 export OMP_NUM_THREADS=2
                         
 #      Name of the LAMMPS binary
 export LMP_BIN=../../lmp_intel_cpu
 
 #      LAMMPS root directory
 export LMP_ROOT=../../../
                
 source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
 export KMP_BLOCKTIME=0
 export I_MPI_PIN_DOMAIN=core
 export I_MPI_FABRICS=shm		# For single node
 
 # ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS
 export I_MPI_SHM_LMT=shm
 
 #      Generate the restart file for use with liquid crystal benchmark
 mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none
 
 #      Benchmark to run
 export bench=in.intel.lj
 
 #############################################################################
 # For Intel Xeon Phi x200 series processors best performance is achieved by
 # using MCDRAM. In flat mode, this can be achieved with numactl,
 # MPI environment variables, or other options provided by batch schedulers
 #############################################################################
 
 #############################################################################
 # To run without an optimization package
 #############################################################################
 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -v N on
 
 #############################################################################
 # To run with USER-OMP package
 #############################################################################
 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp -v N on
 
 #############################################################################
 # To run with USER-INTEL package and no coprocessor
 #############################################################################
 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel
 
 #############################################################################
 # To run with USER-INTEL and automatic load balancing to 1 coprocessor
 #############################################################################
 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel
 
 #############################################################################
 # If using PPPM (e.g. in.intel.rhodo) on Intel Xeon Phi x200 series 
 #   or Skylake processors
 #############################################################################
 export KMP_AFFINITY=none
 rthreads=$((OMP_NUM_THREADS-1))
 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp $rthreads lrt yes -sf intel
diff --git a/src/USER-INTEL/TEST/in.intel.airebo b/src/USER-INTEL/TEST/in.intel.airebo
new file mode 100644
index 000000000..fcd8af470
--- /dev/null
+++ b/src/USER-INTEL/TEST/in.intel.airebo
@@ -0,0 +1,47 @@
+# AIREBO polyethylene benchmark
+
+variable        N index on      # Newton Setting
+variable	w index 10	# Warmup Timesteps
+variable	t index 550	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+variable	p index 0	# Use Power Measurement
+variable	x index 4	# Replication multiplier in x (see xx)
+variable	y index 2	# Replication multiplier in y (see yy)
+variable	z index 2	# Replication multiplier in z (see zz)
+
+variable	xx equal 17*$x
+variable	yy equal 16*$y
+variable	zz equal 2*$z
+variable	rr equal floor($t*$m)
+variable        root getenv LMP_ROOT
+
+newton          $N
+if "$n > 0"	then "processors * * * grid numa"
+
+# root (from LMP_ROOT, read above) locates the data and potential files below
+
+units		    metal
+atom_style	    atomic
+
+read_data	    ${root}/examples/airebo/data.airebo
+
+replicate	    ${xx} ${yy} ${zz}
+
+neighbor	    0.5 bin
+neigh_modify	    delay 5 every 1
+
+pair_style	    airebo 3.0 1 1
+pair_coeff	    * * ${root}/potentials/CH.airebo C H
+
+velocity	    all create 300.0 761341
+
+fix		    1 all nve
+timestep	    0.0005
+
+thermo		    50
+
+if "$p > 0"	then "run_style verlet/power"
+
+if "$w > 0"	then "run $w"
+run		${rr}
diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam
index 5a3b3064a..6486b22ee 100644
--- a/src/USER-INTEL/TEST/in.intel.eam
+++ b/src/USER-INTEL/TEST/in.intel.eam
@@ -1,49 +1,48 @@
 # bulk Cu lattice
 
 variable        N index on      # Newton Setting
 variable	w index 10      # Warmup Timesteps
 variable	t index 3100    # Main Run Timesteps
 variable	m index 1       # Main Run Timestep Multiplier
 variable	n index 0       # Use NUMA Mapping for Multi-Node
-variable	b index 3       # Neighbor binsize
 variable	p index 0       # Use Power Measurement
 
 variable	x index 4
 variable	y index 2
 variable	z index 2
 
 variable	rr equal floor($t*$m)
 variable	root getenv LMP_ROOT
 
 newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 
 variable	xx equal 20*$x
 variable	yy equal 20*$y
 variable	zz equal 20*$z
 
 units		metal
 atom_style	atomic
 
 lattice		fcc 3.615
 region		box block 0 ${xx} 0 ${yy} 0 ${zz}
 create_box	1 box
 create_atoms	1 box
 
 pair_style	eam
 pair_coeff	1 1 ${root}/bench/Cu_u3.eam
 
 velocity	all create 1600.0 376847 loop geom
 
 neighbor	1.0 bin
 neigh_modify    every 1 delay 5 check yes
 
 fix		1 all nve
 
 timestep	0.005
 thermo		50
 
 if "$p > 0"	then "run_style verlet/power"
 
 if "$w > 0"	then "run $w"
 run		${rr}
diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo
index 05145d79c..7ce7eb445 100644
--- a/src/USER-INTEL/TEST/in.intel.rhodo
+++ b/src/USER-INTEL/TEST/in.intel.rhodo
@@ -1,55 +1,54 @@
 # Rhodopsin model
 
 variable        N index on      # Newton Setting
 variable	w index 10	# Warmup Timesteps
 variable	t index 520	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
 variable	n index 0	# Use NUMA Mapping for Multi-Node
-variable        b index 3       # Neighbor binsize
 variable	p index 0	# Use Power Measurement
 variable	c index 0	# 1 to use collectives for PPPM
 variable        d index 1       # 1 to use 'diff ad' for PPPM
 
 variable	x index 4
 variable	y index 2
 variable	z index 2
 
 variable	rr equal floor($t*$m)
 variable        root getenv LMP_ROOT
 
 newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 
 units           real  
 neigh_modify    delay 5 every 1
 
 atom_style      full  
 bond_style      harmonic 
 angle_style     charmm 
 dihedral_style  charmm 
 improper_style  harmonic 
 pair_style      lj/charmm/coul/long 8.0 10.0 
 pair_modify     mix arithmetic 
 kspace_style    pppm 1e-4
 
 if "$c > 0"	then "kspace_modify collective yes"
 if "$d > 0"	then "kspace_modify diff ad"
 
 read_data       ${root}/bench/data.rhodo
 
 replicate	$x $y $z
 
 fix             1 all shake 0.0001 5 0 m 1.0 a 232
 fix             2 all npt temp 300.0 300.0 100.0 &
 		z 0.0 0.0 1000.0 mtk no pchain 0 tchain 1
 
 special_bonds   charmm
  
 thermo          100
 thermo_style    multi 
 timestep        2.0
 
 if "$p > 0"	then "run_style verlet/power"
 
 if "$w > 0"	then "run $w"
 run		${rr}
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index 3664bc248..b4b664cb9 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -1,607 +1,646 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "intel_buffers.h"
 #include "force.h"
 #include "memory.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
     lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
     _buf_size(0), _buf_local_size(0) {
   _list_alloc_atoms = 0;
   _ntypes = 0;
   _off_map_listlocal = 0;
   _ccachex = 0;
   _ncache_alloc = 0;
+  _ncachetag = 0;        // stays 0 until grow_ncache() allocates it
+  _cutneighsq = 0;       // allocated by set_ntypes() when ntypes > 0
+  _cutneighghostsq = 0;  // allocated by set_ntypes() only if use_ghost_cut
   #ifdef _LMP_INTEL_OFFLOAD
   _separate_buffers = 0;
   _off_f = 0;
   _off_map_ilist = 0;
   _off_map_nmax = 0;
   _off_list_alloc = false;
   _off_threads = 0;
   _off_ccache = 0;
   _off_ncache = 0;
   _host_nmax = 0;
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 IntelBuffers<flt_t, acc_t>::~IntelBuffers()
 {
   free_buffers();
   free_all_nbor_buffers();
   free_ccache();
   set_ntypes(0);
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_buffers()
 {
   if (_buf_size > 0) {
     atom_t * x = get_x();
     flt_t * q = get_q();
     quat_t * quat = get_quat();
 
     #ifdef _LMP_INTEL_OFFLOAD
     vec3_acc_t * f_start = get_off_f();
     if (f_start != 0) {
       acc_t * ev_global = get_ev_global();
       if (ev_global != 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x:alloc_if(0) free_if(1)) \
           nocopy(f_start:alloc_if(0) free_if(1)) \
           nocopy(ev_global:alloc_if(0) free_if(1))
       }
 
       if (q != 0) {
         #pragma offload_transfer target (mic:_cop) \
           nocopy(q:alloc_if(0) free_if(1))
       }
       if (quat != 0) {
         #pragma offload_transfer target (mic:_cop) \
           nocopy(quat:alloc_if(0) free_if(1))
       }
       lmp->memory->destroy(f_start);
     }
 
     if (_separate_buffers) {
       lmp->memory->destroy(_host_x);
       if (q != 0) lmp->memory->destroy(_host_q);
       if (quat != 0) lmp->memory->destroy(_host_quat);
     }
     #endif
 
     lmp->memory->destroy(x);
     if (q != 0) lmp->memory->destroy(q);
     if (quat != 0) lmp->memory->destroy(quat);
     lmp->memory->destroy(_f);
     _buf_size = _buf_local_size = 0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
                                        const int nthreads,
                                        const int offload_end)
 {
   free_buffers();
   _buf_size = static_cast<double>(nall) * 1.1 + 1;
   if (lmp->force->newton_pair)
     _buf_local_size = _buf_size;
   else
     _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
   const int f_stride = get_stride(_buf_local_size);
   lmp->memory->create(_x, _buf_size,"intel_x");
   if (lmp->atom->q != NULL)
     lmp->memory->create(_q, _buf_size, "intel_q");
   if (lmp->atom->ellipsoid != NULL)
     lmp->memory->create(_quat, _buf_size, "intel_quat");
   #ifdef _LMP_INTEL_OFFLOAD
   if (lmp->force->newton_pair)
   #else
   if (lmp->force->newton_pair || lmp->atom->molecular)
   #endif
     lmp->memory->create(_f, f_stride * nthreads, "intel_f");
   else
     lmp->memory->create(_f, f_stride, "intel_f");
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
     lmp->memory->create(_host_x, _buf_size,"intel_host_x");
     if (lmp->atom->q != NULL)
       lmp->memory->create(_host_q, _buf_size, "intel_host_q");
     if (lmp->atom->ellipsoid != NULL)
       lmp->memory->create(_host_quat, _buf_size, "intel_host_quat");
   }
 
   if (offload_end > 0) {
     int fm;
     if (lmp->force->newton_pair) fm = _off_threads;
     else fm = 1;
     lmp->memory->create(_off_f, f_stride * fm, "intel_off_f");
     const atom_t * const x = get_x();
     const flt_t * const q = get_q();
     const vec3_acc_t * f_start = get_off_f();
     acc_t * ev_global = get_ev_global();
     if (lmp->atom->q != NULL) {
       if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
           nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
           nocopy(ev_global:length(8) alloc_if(1) free_if(0))
       }
     } else {
       if (x != NULL && f_start != NULL && ev_global != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
           nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
           nocopy(ev_global:length(8) alloc_if(1) free_if(0))
       }
     }
     if (lmp->atom->ellipsoid != NULL) {
       const quat_t * const quat = get_quat();
       if (quat != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(quat:length(_buf_size) alloc_if(1) free_if(0))
       }
     }
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_nmax()
 {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_off_map_nmax > 0) {
     const int * tag = _off_map_tag;
     const int * special = _off_map_special;
     const int * nspecial = _off_map_nspecial;
     if (tag != 0 && special != 0 && nspecial !=0) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(tag:alloc_if(0) free_if(1)) \
         nocopy(special,nspecial:alloc_if(0) free_if(1))
     }
     _off_map_nmax = 0;
     _host_nmax = 0;
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int offload_end)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   free_nmax();
   int size = lmp->atom->nmax;
   _host_nmax = size;
 
   if (!offload_end) return;
   int *special, *nspecial;
   int tag_length, special_length, nspecial_length;
   if (lmp->atom->molecular) {
     special = lmp->atom->special[0];
     nspecial = lmp->atom->nspecial[0];
     special_length = size * lmp->atom->maxspecial;
     nspecial_length = size * 3;
   } else {
     special = &_special_holder;
     nspecial = &_nspecial_holder;
     special_length = 1;
     nspecial_length = 1;
   }
   if (_need_tag)
     tag_length = size;
   else
     tag_length = 1;
   int *tag = lmp->atom->tag;
   #pragma offload_transfer target(mic:_cop) \
     nocopy(tag:length(tag_length) alloc_if(1) free_if(0)) \
     nocopy(special:length(special_length) alloc_if(1) free_if(0)) \
     nocopy(nspecial:length(nspecial_length) alloc_if(1) free_if(0))
   _off_map_tag = tag;
   _off_map_special = special;
   _off_map_nspecial = nspecial;
   _off_map_nmax = size;
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_list_local()
 {
   if (_off_map_listlocal > 0) {
     int * cnumneigh = _cnumneigh;
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_map_ilist != NULL) {
       const int * ilist = _off_map_ilist;
       const int * numneigh = _off_map_numneigh;
       _off_map_ilist = NULL;
       if (numneigh != 0 && ilist != 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
       }
     }
     #endif
     lmp->memory->destroy(cnumneigh);
     _off_map_listlocal = 0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
                                                   const int offload_end)
 {
   free_list_local();
   int size = list->get_maxlocal();
   lmp->memory->create(_cnumneigh, size, "_cnumneigh");
   _off_map_listlocal = size;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload_end > 0) {
     int * numneigh = list->numneigh;
     int * ilist = list->ilist;
     int * cnumneigh = _cnumneigh;
     if (cnumneigh != 0) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
         nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
         nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
     }
     _off_map_ilist = ilist;
     _off_map_numneigh = numneigh;
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_nbor_list()
 {
   if (_list_alloc_atoms > 0) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_list_alloc) {
       int * list_alloc = _list_alloc;
       #pragma offload_transfer target(mic:_cop) \
         nocopy(list_alloc:alloc_if(0) free_if(1))
       _off_list_alloc = false;
     }
     #endif
     lmp->memory->destroy(_list_alloc);
     _list_alloc_atoms = 0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
                                                  const int nlocal,
                                                  const int nthreads,
                                                  const int offload_end,
                                                  const int pack_width)
 {
   free_nbor_list();
   _list_alloc_atoms = 1.10 * nlocal;
   int nt = MAX(nthreads, _off_threads);
   int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) *
     get_max_nbors();
   lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload_end > 0) {
     int * list_alloc =_list_alloc;
 
     if (list_alloc != NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
       _off_list_alloc = true;
     }
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_ccache()
 {
   if (_ccachex) {
     flt_t *ccachex = _ccachex;
     flt_t *ccachey = _ccachey;
     flt_t *ccachez = _ccachez;
     flt_t *ccachew = _ccachew;
     int *ccachei = _ccachei;
     int *ccachej = _ccachej;
     #ifdef LMP_USE_AVXCD
     acc_t *ccachef = _ccachef;
     #endif
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_ccache) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachex,ccachey,ccachez,ccachew:alloc_if(0) free_if(1)) \
         nocopy(ccachei,ccachej:alloc_if(0) free_if(1))
 
       #ifdef LMP_USE_AVXCD
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachef:alloc_if(0) free_if(1))
       #endif
     }
     _off_ccache = 0;
     #endif
 
     lmp->memory->destroy(ccachex);
     lmp->memory->destroy(ccachey);
     lmp->memory->destroy(ccachez);
     lmp->memory->destroy(ccachew);
     lmp->memory->destroy(ccachei);
     lmp->memory->destroy(ccachej);
     #ifdef LMP_USE_AVXCD
     lmp->memory->destroy(ccachef);
     #endif
 
     _ccachex = 0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
         const int nthreads,
         const int width)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_ccachex && off_flag && _off_ccache == 0)
     free_ccache();
   #endif
   if (_ccachex)
     return;
 
   const int nsize = get_max_nbors() * width;
   int esize = MIN(sizeof(int), sizeof(flt_t));
   IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
   int nt = MAX(nthreads, _off_threads);
   const int vsize = _ccache_stride * nt;
 
   lmp->memory->create(_ccachex, vsize , "_ccachex");
   lmp->memory->create(_ccachey, vsize, "_ccachey");
   lmp->memory->create(_ccachez, vsize, "_ccachez");
   lmp->memory->create(_ccachew, vsize, "_ccachew");
   lmp->memory->create(_ccachei, vsize, "_ccachei");
   lmp->memory->create(_ccachej, vsize, "_ccachej");
   #ifdef LMP_USE_AVXCD
   IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
   lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
   #endif
   memset(_ccachej, 0, vsize * sizeof(int));
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (off_flag) {
     flt_t *ccachex = _ccachex;
     flt_t *ccachey = _ccachey;
     flt_t *ccachez = _ccachez;
     flt_t *ccachew = _ccachew;
     int *ccachei = _ccachei;
     int *ccachej = _ccachej;
 
     if (ccachex != NULL && ccachey !=NULL && ccachez != NULL &&
         ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
         in(ccachej:length(vsize) alloc_if(1) free_if(0))
     }
     #ifdef LMP_USE_AVXCD
     if (ccachef != NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachef:length(_ccache_stride3 * nt) alloc_if(1) free_if(0))
     }
     #endif
     _off_ccache = 1;
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_ncache()
 {
   if (_ncache_alloc) {
     flt_t *ncachex = _ncachex;
     flt_t *ncachey = _ncachey;
     flt_t *ncachez = _ncachez;
     int *ncachej = _ncachej;
     int *ncachejtype = _ncachejtype;
+    int *ncachetag = _ncachetag;  // 0 when no tag cache was allocated
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_ncache) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
         nocopy(ncachejtype:alloc_if(0) free_if(1))
+      if (ncachetag) {  // skip the coprocessor free if no tag cache exists
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ncachetag:alloc_if(0) free_if(1))
+      }
     }
     _off_ncache = 0;
     #endif
 
     lmp->memory->destroy(ncachex);
     lmp->memory->destroy(ncachey);
     lmp->memory->destroy(ncachez);
     lmp->memory->destroy(ncachej);
     lmp->memory->destroy(ncachejtype);
-
+    if (ncachetag)  // the tag cache is optional, so free it only if present
+      lmp->memory->destroy(ncachetag);
     _ncache_alloc = 0;
+    _ncachetag = 0;  // grow_ncache() tests this against 0 to detect a missing tag cache
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
                                              const int nthreads)
 {
   const int nsize = get_max_nbors() * 3;
   int esize = MIN(sizeof(int), sizeof(flt_t));
   IP_PRE_get_stride(_ncache_stride, nsize, esize, 0);
   int nt = MAX(nthreads, _off_threads);
   const int vsize = _ncache_stride * nt;
 
   if (_ncache_alloc) {
-    if (vsize > _ncache_alloc)
+    if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0))  // too small, or tag cache missing
       free_ncache();
     #ifdef _LMP_INTEL_OFFLOAD
     else if (off_flag && _off_ncache == 0)
       free_ncache();
     #endif
     else
       return;
   }
 
   lmp->memory->create(_ncachex, vsize, "_ncachex");
   lmp->memory->create(_ncachey, vsize, "_ncachey");
   lmp->memory->create(_ncachez, vsize, "_ncachez");
   lmp->memory->create(_ncachej, vsize, "_ncachej");
   lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
+  if (need_tag())  // full-size tag cache only when tags are requested
+    lmp->memory->create(_ncachetag, vsize, "_ncachetag");
 
   _ncache_alloc = vsize;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (off_flag) {
     flt_t *ncachex = _ncachex;
     flt_t *ncachey = _ncachey;
     flt_t *ncachez = _ncachez;
     int *ncachej = _ncachej;
     int *ncachejtype = _ncachejtype;
 
     if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
         ncachej != NULL && ncachejtype != NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
     }
+    int tsize = vsize;  // length of the tag buffer mapped to the coprocessor
+    if (!need_tag()) {
+      tsize = 16;  // NOTE(review): looks like a small placeholder so the transfer below has a real buffer - confirm
+      lmp->memory->create(_ncachetag, tsize, "_ncachetag");
+    }
+    int *ncachetag = _ncachetag;  // local alias for use in the offload pragma
+    #pragma offload_transfer target(mic:_cop)			\
+      nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0))
     _off_ncache = 1;
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 #ifndef _LMP_INTEL_OFFLOAD
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
     const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
     acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
 {
   IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
                             ov1, ov2, ov3, ov4, ov5);
 }
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 #ifndef _LMP_INTEL_OFFLOAD
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
     const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
     acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
 {
   int iifrom, iito, tid;
   IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
                          ov0, ov1, ov2, ov3, ov4, ov5);
 }
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes, 
+					    const int use_ghost_cut)  // nonzero: also build _cutneighghostsq
 {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * cutneighsqo = _cutneighsq[0];
       if (_off_threads > 0 && cutneighsqo != 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(cutneighsqo:alloc_if(0) free_if(1))
       }
+      // Load the ghost table's base pointer before testing it (0 if no table)
+      flt_t * cutneighghostsqo = _cutneighghostsq ? _cutneighghostsq[0] : 0;
+      if (_off_threads > 0 && cutneighghostsqo != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
+      }
       #endif
       lmp->memory->destroy(_cutneighsq);
+      if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq);
     }
     if (ntypes > 0) {
       lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
+      if (use_ghost_cut)
+	lmp->memory->create(_cutneighghostsq, ntypes, ntypes, 
+			    "_cutneighghostsq");
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * cutneighsqo = _cutneighsq[0];
+      const int ntypes2 = ntypes * ntypes;  // element count of each ntypes x ntypes table
       if (_off_threads > 0 && cutneighsqo != NULL) {
         #pragma offload_transfer target(mic:_cop) \
-          nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+          nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0))
+      }
+      if (use_ghost_cut) {
+        flt_t * cutneighghostsqo = _cutneighghostsq[0];
+        if (_off_threads > 0 && cutneighghostsqo != NULL) {
+          #pragma offload_transfer target(mic:_cop) \
+            nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0))
+        }
       }
       #endif
     }
     _ntypes = ntypes;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
 {
   double tmem = sizeof(atom_t);
   if (lmp->atom->q) tmem += sizeof(flt_t);
   if (lmp->atom->torque) tmem += sizeof(quat_t);
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) tmem *= 2;
   #endif
   tmem *= _buf_size;
 
   const int fstride = get_stride(_buf_local_size);
   tmem += fstride * nthreads * sizeof(vec3_acc_t);
   #ifdef _LMP_INTEL_OFFLOAD
   if (_off_f) tmem += fstride*_off_threads * sizeof(vec3_acc_t);
   #endif
 
   tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
   tmem += _ntypes * _ntypes * sizeof(int);
 
   return tmem;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template class IntelBuffers<float,float>;
 template class IntelBuffers<float,double>;
 template class IntelBuffers<double,double>;
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index 7a7640a20..8040715b2 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -1,344 +1,346 @@
 /* -*- c++ -*- -------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #ifndef LMP_INTEL_BUFFERS_H
 #define LMP_INTEL_BUFFERS_H
 
 #if defined(_OPENMP)
 #include <omp.h>
 #endif
 #include "atom.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "intel_preprocess.h"
 #include <cstring>
 
 namespace LAMMPS_NS {
 
 #define ATOM_T typename IntelBuffers<flt_t,acc_t>::atom_t
 #define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t
 #define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
 
 // May not need a separate force array for mixed/double
 template <class flt_t, class acc_t>
 class IntelBuffers {
  public:
   typedef struct { flt_t x,y,z; int w; } atom_t;
   typedef struct { flt_t w,i,j,k; } quat_t;
   typedef struct { flt_t x,y,z,w; } vec3_t;
   typedef struct { flt_t x,y,z,w; } vec4_t;
   typedef struct { acc_t x,y,z,w; } vec3_acc_t;
 
   IntelBuffers(class LAMMPS *lmp_in);
   ~IntelBuffers();
 
   inline int get_stride(int nall) {
     int stride;
     IP_PRE_get_stride(stride, nall, sizeof(vec3_acc_t),
                          lmp->atom->torque);
     return stride;
   }
 
   template <class stype>
   inline int get_scalar_stride(const int n) {
     int stride;
     IP_PRE_get_stride(stride, n, sizeof(stype), 0);
     return stride;
   }
 
   void free_buffers();
   void free_nmax();
   inline void set_bininfo(int *atombin, int *binpacked)
     { _atombin = atombin; _binpacked = binpacked; }
   inline void grow(const int nall, const int nlocal, const int nthreads,
                    const int offload_end) {
     if (nall >= _buf_size || nlocal >= _buf_local_size)
       _grow(nall, nlocal, nthreads, offload_end);
     #ifdef _LMP_INTEL_OFFLOAD
     if (lmp->atom->nmax > _host_nmax)
       _grow_nmax(offload_end);
     #endif
   }
 
   inline void free_all_nbor_buffers() {
     free_nbor_list();
     free_nmax();
     free_list_local();
     free_ncache();
   }
 
   inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
                         const int offload_end, const int pack_width=1) {
     grow_list_local(list, offload_end);
     grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
 
   void free_list_local();
   inline void grow_list_local(NeighList *list, const int offload_end) {
     if (list->get_maxlocal() > _off_map_listlocal)
       _grow_list_local(list, offload_end);
   }
 
   void free_ccache();
   void grow_ccache(const int off_flag, const int nthreads, const int width=1);
   inline int ccache_stride() { return _ccache_stride; }
   inline flt_t * get_ccachex() { return _ccachex; }
   inline flt_t * get_ccachey() { return _ccachey; }
   inline flt_t * get_ccachez() { return _ccachez; }
   inline flt_t * get_ccachew() { return _ccachew; }
   inline int * get_ccachei() { return _ccachei; }
   inline int * get_ccachej() { return _ccachej; }
   #ifdef LMP_USE_AVXCD
   inline int ccache_stride3() { return _ccache_stride3; }
   inline acc_t * get_ccachef() { return _ccachef; }
   #endif
 
   void free_ncache();
   void grow_ncache(const int off_flag, const int nthreads);
+  void grow_ncachetag(const int off_flag, const int nthreads);
   inline int ncache_stride() { return _ncache_stride; }
   inline flt_t * get_ncachex() { return _ncachex; }
   inline flt_t * get_ncachey() { return _ncachey; }
   inline flt_t * get_ncachez() { return _ncachez; }
   inline int * get_ncachej() { return _ncachej; }
   inline int * get_ncachejtype() { return _ncachejtype; }
+  inline int * get_ncachetag() { return _ncachetag; }
 
   inline int get_max_nbors() {
     int mn = lmp->neighbor->oneatom * sizeof(int) /
         (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
     return mn * INTEL_DATA_ALIGN / sizeof(int);
   }
 
   void free_nbor_list();
 
   inline void grow_nbor_list(NeighList *list, const int nlocal,
                              const int nthreads, const int offload_end,
                              const int pack_width) {
     if (nlocal > _list_alloc_atoms)
       _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
 
-  void set_ntypes(const int ntypes);
+  void set_ntypes(const int ntypes, const int use_ghost_cut = 0);
 
   inline int * firstneigh(const NeighList *list) { return _list_alloc; }
   inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }
   inline int * get_atombin() { return _atombin; }
   inline int * get_binpacked() { return _binpacked; }
 
   inline atom_t * get_x(const int offload = 1) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_separate_buffers && offload == 0) return _host_x;
     #endif
     return _x;
   }
   inline flt_t * get_q(const int offload = 1) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_separate_buffers && offload == 0) return _host_q;
     #endif
     return _q;
   }
   inline quat_t * get_quat(const int offload = 1) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_separate_buffers && offload == 0) return _host_quat;
     #endif
     return _quat;
   }
   inline vec3_acc_t * get_f() { return _f; }
   inline acc_t * get_ev_global() { return _ev_global; }
   inline acc_t * get_ev_global_host() { return _ev_global_host; }
   inline void zero_ev()
     { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
   inline flt_t ** get_cutneighsq() { return _cutneighsq; }
+  inline flt_t ** get_cutneighghostsq() { return _cutneighghostsq; }
   inline int get_off_threads() { return _off_threads; }
   #ifdef _LMP_INTEL_OFFLOAD
   inline void set_off_params(const int n, const int cop,
                              const int separate_buffers)
     { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; }
   inline vec3_acc_t * get_off_f() { return _off_f; }
   #endif
 
   inline void thr_pack(const int ifrom, const int ito, const int ago) {
     if (ago == 0) {
       #if defined(LMP_SIMD_COMPILER)
       #pragma vector aligned
       #pragma ivdep
       #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = lmp->atom->x[i][0];
         _x[i].y = lmp->atom->x[i][1];
         _x[i].z = lmp->atom->x[i][2];
         _x[i].w = lmp->atom->type[i];
       }
       if (lmp->atom->q != NULL)
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int i = ifrom; i < ito; i++)
           _q[i] = lmp->atom->q[i];
     } else {
       #if defined(LMP_SIMD_COMPILER)
       #pragma vector aligned
       #pragma ivdep
       #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = lmp->atom->x[i][0];
         _x[i].y = lmp->atom->x[i][1];
         _x[i].z = lmp->atom->x[i][2];
       }
     }
   }
 
   #ifndef _LMP_INTEL_OFFLOAD
   void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
                        const int f_stride, acc_t &ov0, acc_t &ov1,
                        acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
   void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
                     acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
                     acc_t &ov4, acc_t &ov5);
   #endif
 
   #ifdef _LMP_INTEL_OFFLOAD
   inline void thr_pack_cop(const int ifrom, const int ito,
                            const int offset, const bool dotype = false) {
     double ** x = lmp->atom->x + offset;
     if (dotype == false) {
       #if defined(LMP_SIMD_COMPILER)
       #pragma vector aligned
       #pragma ivdep
       #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = x[i][0];
         _x[i].y = x[i][1];
         _x[i].z = x[i][2];
       }
     } else {
       int *type = lmp->atom->type + offset;
       #if defined(LMP_SIMD_COMPILER)
       #pragma vector aligned
       #pragma ivdep
       #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = x[i][0];
         _x[i].y = x[i][1];
         _x[i].z = x[i][2];
         _x[i].w = type[i];
       }
     }
   }
 
   inline void thr_pack_host(const int ifrom, const int ito,
                             const int offset) {
     double ** x = lmp->atom->x + offset;
     #if defined(LMP_SIMD_COMPILER)
     #pragma vector aligned
     #pragma ivdep
     #endif
     for (int i = ifrom; i < ito; i++) {
       _host_x[i].x = x[i][0];
       _host_x[i].y = x[i][1];
       _host_x[i].z = x[i][2];
     }
   }
 
   inline void pack_sep_from_single(const int host_min_local,
                                    const int used_local,
                                    const int host_min_ghost,
                                    const int used_ghost) {
     memcpy(_host_x + host_min_local, _x + host_min_local,
            used_local * sizeof(atom_t));
     memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
            used_ghost * sizeof(atom_t));
     int nall = used_local + used_ghost + host_min_local;
     _host_x[nall].x = INTEL_BIGP;
     _host_x[nall].y = INTEL_BIGP;
     _host_x[nall].z = INTEL_BIGP;
     _host_x[nall].w = 1;
     if (lmp->atom->q != NULL) {
       memcpy(_host_q + host_min_local, _q + host_min_local,
              used_local * sizeof(flt_t));
       memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
              used_ghost * sizeof(flt_t));
     }
   }
+  #endif
 
   inline int need_tag() { return _need_tag; }
   inline void need_tag(const int nt) { _need_tag = nt; }
-  #else
-  inline int need_tag() { return 0; }
-  inline void need_tag(const int nt) { }
-  #endif
 
   double memory_usage(const int nthreads);
 
   tagint _special_holder;
   int _nspecial_holder;
 
  protected:
   LAMMPS *lmp;
   atom_t *_x;
   flt_t *_q;
   quat_t *_quat;
   vec3_acc_t * _f;
   int _off_threads, _off_map_listlocal;
 
   int _list_alloc_atoms;
   int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
 
-  flt_t **_cutneighsq;
+  flt_t **_cutneighsq, **_cutneighghostsq;
   int _ntypes;
 
   int _ccache_stride;
   flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
   int *_ccachei, *_ccachej;
 
   int _ncache_stride, _ncache_alloc;
   flt_t *_ncachex, *_ncachey, *_ncachez;
-  int *_ncachej, *_ncachejtype;
+  int *_ncachej, *_ncachejtype, *_ncachetag;
+
+  int _need_tag, _host_nmax;
+
   #ifdef LMP_USE_AVXCD
   int _ccache_stride3;
   acc_t * _ccachef;
   #endif
 
   #ifdef _LMP_INTEL_OFFLOAD
   int _separate_buffers;
   atom_t *_host_x;
   flt_t *_host_q;
   quat_t *_host_quat;
   vec3_acc_t *_off_f;
   int _off_map_nmax, _cop, _off_ccache, _off_ncache;
   int *_off_map_ilist;
   int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
   int *_off_map_numneigh;
   bool _off_list_alloc;
-  int _need_tag, _host_nmax;
   #endif
 
   int _buf_size, _buf_local_size;
   _alignvar(acc_t _ev_global[8],64);
   _alignvar(acc_t _ev_global_host[8],64);
 
   void _grow(const int nall, const int nlocal, const int nthreads,
              const int offload_end);
   void _grow_nmax(const int offload_end);
   void _grow_list_local(NeighList *list, const int offload_end);
   void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
                        const int offload_end, const int pack_width);
 };
 
 }
 
 #endif
diff --git a/src/USER-INTEL/intel_intrinsics_airebo.h b/src/USER-INTEL/intel_intrinsics_airebo.h
new file mode 100644
index 000000000..7b091a4ba
--- /dev/null
+++ b/src/USER-INTEL/intel_intrinsics_airebo.h
@@ -0,0 +1,2279 @@
+#ifndef LMP_INTEL_AIREBO_SCALAR
+# ifdef __INTEL_COMPILER
+#  if defined(__MIC__) || defined(__AVX512F__)
+#   define LMP_INTEL_AIREBO_512
+#  elif defined(__AVX__)
+#   define LMP_INTEL_AIREBO_256
+#  else
+#   define LMP_INTEL_AIREBO_SCALAR
+#  endif
+# else
+#  define LMP_INTEL_AIREBO_SCALAR
+# endif
+#endif
+
+#ifdef LMP_INTEL_AIREBO_512
+
+#include <cassert>
+#include <immintrin.h>
+
+#define VEC_INLINE __attribute__((always_inline))
+
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_LEN 8
+#  define FVEC_SUFFIX(a) a##pd
+#  define FVEC_SUFFIX_MASK(a) a##pd_mask
+#  define FVEC_MASK_T __mmask8
+#  define FVEC_VEC_T __m512d
+#  define FVEC_SCAL_T double
+#  define IVEC_NAME ivec8
+#  define FVEC_NAME fvec8pd
+#  define BVEC_NAME bvec8
+#  define AVEC_NAME avec8pd
+#else
+#  undef FVEC_LEN
+#  undef FVEC_SUFFIX
+#  undef FVEC_SUFFIX_MASK
+#  undef FVEC_MASK_T
+#  undef FVEC_VEC_T
+#  undef FVEC_SCAL_T
+#  undef IVEC_NAME
+#  undef FVEC_NAME
+#  undef BVEC_NAME
+#  undef AVEC_NAME
+
+#  define FVEC_LEN 16
+#  define FVEC_SUFFIX(a) a##ps
+#  define FVEC_SUFFIX_MASK(a) a##ps_mask
+#  define FVEC_MASK_T __mmask16
+#  define FVEC_VEC_T __m512
+#  define FVEC_SCAL_T float
+#  define IVEC_NAME ivec16
+#  define FVEC_NAME fvec16ps
+#  define BVEC_NAME bvec16
+#  define AVEC_NAME avec16ps
+#endif
+
+namespace mm512 {
+
+#ifndef __AVX512F__
+
+#ifndef FVEC_FIRST_PASS
+VEC_INLINE static inline __m512i _mm512_mask_expand_epi32(__m512i src, // fallback compiled only when __AVX512F__ is undefined
+							  __mmask16 k, 
+							  __m512i a) {
+  int buf[16] __attribute__((aligned(64))); // 64-byte aligned scratch holding all 16 lanes of a
+  _mm512_store_epi32(buf, a); // spill a so it can be re-read through the masked unpack-load
+  return _mm512_mask_loadunpacklo_epi32(src, k, buf); // src and k are forwarded unchanged
+}
+VEC_INLINE static inline __m512i _mm512_maskz_expand_epi32(__mmask16 k, 
+							   __m512i a) {
+  int buf[16] __attribute__((aligned(64)));
+  _mm512_store_epi32(buf, a);
+  return _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, buf);
+}
+VEC_INLINE static inline __m512i _mm512_mask_compress_epi32(__m512i src, // fallback compiled only when __AVX512F__ is undefined
+							    __mmask16 k, 
+							    __m512i a) {
+  int buf[16] __attribute__((aligned(64))); // 64-byte aligned scratch for one full vector
+  _mm512_store_epi32(buf, src); // seed with src: slots the pack-store leaves alone keep src's values
+  _mm512_mask_packstorelo_epi32(buf, k, a); // masked pack-store of a into the same buffer
+  return _mm512_load_epi32(buf); // result is whatever the buffer now holds
+}
+VEC_INLINE static inline __m512i _mm512_maskz_compress_epi32(__mmask16 k, 
+							     __m512i a) {
+  int buf[16] __attribute__((aligned(64))) = {0};
+  _mm512_mask_packstorelo_epi32(buf, k, a);
+  return _mm512_load_epi32(buf);
+}
+
+VEC_INLINE static inline void _mm512_mask_compressstoreu_epi32(int * dest, // fallback compiled only when __AVX512F__ is undefined
+							       __mmask16 mask, 
+							       __m512i src) {
+  _mm512_mask_packstorelo_epi32(dest, mask, src); // lo half of the pack-store pair, at dest
+  _mm512_mask_packstorehi_epi32(dest + 16, mask, src); // hi half, given the address 16 ints (64 bytes) past dest
+}
+
+VEC_INLINE static inline __m512i _mm512_mask_loadu_epi32(__m512i src, // fallback compiled only when __AVX512F__ is undefined
+							 __mmask16 k, 
+							 const int * mem_addr) {
+  assert((k & (k + 1)) == 0); // k must be 2^m - 1: a gap-free run of low bits (presumably so the unpack-load matches a plain masked load - confirm)
+  __m512i ret = _mm512_mask_loadunpacklo_epi32(src, k, mem_addr); // lo half of the unpack-load pair, merged into src
+  ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16); // hi half, given the address 16 ints past mem_addr
+  return ret;
+}
+VEC_INLINE static inline __m512i _mm512_maskz_loadu_epi32(__mmask16 k, 
+							const int * mem_addr) {
+  assert((k & (k + 1)) == 0);
+  __m512i ret = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, 
+					       mem_addr);
+  ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16);
+  return ret;
+}
+VEC_INLINE static inline void _mm512_mask_storeu_epi32(int * dest, 
+						       __mmask16 mask, 
+						       __m512i src) {
+  assert((mask & (mask + 1)) == 0);
+  _mm512_mask_packstorelo_epi32(dest, mask, src);
+  _mm512_mask_packstorehi_epi32(dest + 16, mask, src);
+}
+#endif
+
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_expand_) // fallback compiled only when __AVX512F__ is undefined
+  (FVEC_VEC_T src, FVEC_MASK_T k, FVEC_VEC_T a) { // mask type follows the lane count, as in the storeu fallback
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); // 64-byte aligned scratch for one full vector
+  FVEC_SUFFIX(_mm512_store_)(buf, a); // spill a so it can be re-read through the masked unpack-load
+  return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(src, k, buf); // src and k are forwarded unchanged
+}
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_expand_) // fallback compiled only when __AVX512F__ is undefined
+  (FVEC_MASK_T k, FVEC_VEC_T a) { // mask type follows the lane count, as in the storeu fallback
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); // 64-byte aligned scratch for one full vector
+  FVEC_SUFFIX(_mm512_store_)(buf, a); // spill a so it can be re-read through the masked unpack-load
+  return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(FVEC_SUFFIX(_mm512_setzero_)(),
+						k, buf); // zero vector is the merge source
+}
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_compress_) // fallback compiled only when __AVX512F__ is undefined
+  (FVEC_VEC_T src, FVEC_MASK_T k, FVEC_VEC_T a) { // mask type follows the lane count, as in the storeu fallback
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))); // 64-byte aligned scratch for one full vector
+  FVEC_SUFFIX(_mm512_store_)(buf, src); // seed with src: slots the pack-store leaves alone keep src's values
+  FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a); // masked pack-store of a into the same buffer
+  return FVEC_SUFFIX(_mm512_load_)(buf); // result is whatever the buffer now holds
+}
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_compress_) // fallback compiled only when __AVX512F__ is undefined
+  (FVEC_MASK_T k, FVEC_VEC_T a) { // mask type follows the lane count, as in the storeu fallback
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))) = {0}; // zero-filled so unwritten slots read back as 0
+  FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a); // masked pack-store of a into the zeroed buffer
+  return FVEC_SUFFIX(_mm512_load_)(buf); // result is whatever the buffer now holds
+}
+VEC_INLINE static inline void FVEC_SUFFIX(_mm512_mask_storeu_)
+  (FVEC_SCAL_T * dest, FVEC_MASK_T mask, FVEC_VEC_T src) {
+  assert((mask & (mask + 1)) == 0);
+  FVEC_SUFFIX(_mm512_mask_packstorelo_)(dest, mask, src);
+  FVEC_SUFFIX(_mm512_mask_packstorehi_)(dest + FVEC_LEN, mask, src);
+}
+#endif
+
+
+class FVEC_NAME;
+class IVEC_NAME;
+class AVEC_NAME;
+class BVEC_NAME {
+  friend class FVEC_NAME;
+  friend class IVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==16
+  friend class avec16pd;
+# endif
+  FVEC_MASK_T val_;
+  VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {}
+public:
+  VEC_INLINE BVEC_NAME() {}
+  VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return _mm512_kand(a.val_, b.val_);
+  }
+  VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return _mm512_kandn(a.val_, b.val_);
+  }
+  VEC_INLINE static BVEC_NAME knot(const BVEC_NAME &a) {
+    return _mm512_knot(a.val_);
+  }
+  VEC_INLINE static int kortestz(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return _mm512_kortestz(a.val_, b.val_);
+  }
+  VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask, 
+					     const BVEC_NAME &a) {
+    const __m512i c_i1 = _mm512_set1_epi32(1);
+    __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(),
+						c_i1);
+    __m512i compressed = _mm512_mask_compress_epi32(_mm512_undefined_epi32(),
+						    mask.val_, a_int_vec);
+    return _mm512_cmpeq_epi32_mask(compressed, c_i1);
+  }
+  VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src, 
+					  const BVEC_NAME &mask,
+					  const BVEC_NAME &a) {
+    const __m512i c_i1 = _mm512_set1_epi32(1);
+    __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(),
+						c_i1);
+    __m512i src_int_vec = _mm512_mask_blend_epi32(src.val_, 
+						  _mm512_setzero_epi32(), c_i1);
+    __m512i compressed = _mm512_mask_expand_epi32(src_int_vec, mask.val_,
+						  a_int_vec);
+    return _mm512_cmpeq_epi32_mask(compressed, c_i1);
+  }
+  VEC_INLINE static BVEC_NAME full() {
+    return static_cast<FVEC_MASK_T>(0xFFFF);
+  }
+  VEC_INLINE static BVEC_NAME empty() {
+    return 0;
+  }
+  VEC_INLINE static BVEC_NAME only(int n) {
+    return full().val_ >> (FVEC_LEN - n);
+  }
+  VEC_INLINE static BVEC_NAME after(int n) {
+    return full().val_ << n;
+  }
+  VEC_INLINE static BVEC_NAME onlyafter(int only, int after) {
+    return (full().val_ >> (FVEC_LEN - only)) << after;
+  }
+  VEC_INLINE static int popcnt(const BVEC_NAME &a) {
+    return _popcnt32(a.val_);
+  }
+  VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) {
+    return _mm512_kortestz(a.val_, a.val_);
+  }
+  VEC_INLINE static bool test_any_set(const BVEC_NAME &a) {
+    return ! test_all_unset(a);
+  }
+  VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    return a.val_ & (1 << i);
+  }
+  VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const {
+    return _mm512_kand(val_, b.val_);
+  }
+  VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const {
+    return _mm512_kor(val_, b.val_);
+  }
+  VEC_INLINE BVEC_NAME operator ~() const {
+    return _mm512_knot(val_);
+  }
+};
+
+class IVEC_NAME {
+  friend class FVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==16
+  friend class avec16pd;
+# endif
+  __m512i val_;
+  VEC_INLINE IVEC_NAME(const __m512i &v) : val_(v) {}
+public:
+  static const int VL = 16;
+  VEC_INLINE IVEC_NAME() {}
+
+  #define IVEC_MASK_BINFN_B(the_name)                                \
+    VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a,	     \
+      const IVEC_NAME &b) {					     \
+      return _mm512_##the_name##_epi32_mask(a.val_, b.val_);         \
+    }								     \
+    VEC_INLINE static BVEC_NAME mask_##the_name(			\
+						const BVEC_NAME &mask,	\
+						  const IVEC_NAME &a,	\
+						  const IVEC_NAME &b    \
+						  ) {			\
+      return _mm512_mask_##the_name##_epi32_mask(			\
+      mask.val_, a.val_, b.val_);					\
+    }
+  IVEC_MASK_BINFN_B(cmpeq)
+  IVEC_MASK_BINFN_B(cmplt)
+  IVEC_MASK_BINFN_B(cmpneq)
+  IVEC_MASK_BINFN_B(cmpgt)
+
+  #define IVEC_MASK_BINFN_I(the_name)					\
+    VEC_INLINE static IVEC_NAME mask_##the_name(			\
+        const IVEC_NAME &src, const BVEC_NAME &mask,                    \
+        const IVEC_NAME &a, const IVEC_NAME &b                          \
+    ) {                                                                 \
+       return _mm512_mask_##the_name##_epi32(				\
+        src.val_, mask.val_, a.val_, b.val_);                           \
+    }
+  IVEC_MASK_BINFN_I(add)
+  VEC_INLINE static IVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return _mm512_mask_blend_epi32(mask.val_, a.val_, b.val_);
+  }
+
+  #define IVEC_BINFN_I(the_name)                                     \
+    VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a,	     \
+					 const IVEC_NAME &b) {	     \
+      return _mm512_##the_name##_epi32(a.val_, b.val_);              \
+    }
+  IVEC_BINFN_I(mullo)
+  IVEC_BINFN_I(srlv)
+  VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) {
+    return _mm512_and_epi32(a.val_, b.val_);
+  }
+
+  VEC_INLINE static IVEC_NAME mask_expand(
+      const IVEC_NAME &src, const BVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return _mm512_mask_expand_epi32(src.val_,
+      a.val_, b.val_);
+  }
+  VEC_INLINE static IVEC_NAME masku_compress(
+      const BVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return _mm512_mask_compress_epi32(_mm512_undefined_epi32(), a.val_, b.val_);
+  }
+
+  VEC_INLINE static int at(const IVEC_NAME &a, int b) {
+    int data[16] __attribute__((aligned(64)));
+    _mm512_store_epi32(data, a.val_);
+    return data[b];
+  }
+
+  VEC_INLINE static IVEC_NAME load(const int * src) {
+    return _mm512_load_epi32(src);
+  }
+  VEC_INLINE static IVEC_NAME mask_loadu(const BVEC_NAME &mask, 
+                                         const int * src) {
+    assert((mask.val_ & (mask.val_ + 1)) == 0);
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    return _mm512_mask_loadu_epi32(_mm512_undefined_epi32(), mask.val_, src);
+  }
+  VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, 
+                                          const int * src) {
+    assert((mask.val_ & (mask.val_ + 1)) == 0);
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    return _mm512_maskz_loadu_epi32(mask.val_, src);
+  }
+  VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, int * dest, 
+    const IVEC_NAME &src) {
+    assert((mask.val_ & (mask.val_ + 1)) == 0);
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    _mm512_mask_storeu_epi32(dest, mask.val_, src.val_);
+  }
+  VEC_INLINE static void store(int * dest, const IVEC_NAME &src) {
+    _mm512_store_epi32(dest, src.val_);
+  }
+
+  VEC_INLINE static IVEC_NAME mask_gather(
+      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, 
+      const int * mem, const int scale
+  ) {
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    assert(scale == sizeof(int));
+    return _mm512_mask_i32gather_epi32(src.val_, mask.val_, idx.val_, mem, 
+      sizeof(int));
+  }
+  VEC_INLINE static void mask_i32scatter(
+      int * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, 
+      const IVEC_NAME &a, const int scale
+  ) {
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    assert(scale == sizeof(int));
+    _mm512_mask_i32scatter_epi32(mem, mask.val_, idx.val_, a.val_, sizeof(int));
+  }
+
+  VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest,
+    const IVEC_NAME &src) {
+    _mm512_mask_compressstoreu_epi32(dest, mask.val_, src.val_);
+  }
+
+  VEC_INLINE static IVEC_NAME set1(int i) {
+    return _mm512_set1_epi32(i);
+  }
+  VEC_INLINE static IVEC_NAME setzero() {
+    return _mm512_setzero_epi32();
+  }
+  VEC_INLINE static IVEC_NAME undefined() {
+    return _mm512_undefined_epi32();
+  }
+
+  VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const {
+    return _mm512_add_epi32(this->val_, b.val_);
+  }
+  VEC_INLINE static void print(const char * str, const IVEC_NAME &a) { // debug dump: "<str>: v0 v1 ...\n"
+    int data[16] __attribute__((aligned(64))); // store() is a 64-byte aligned write of all 16 lanes
+    store(data, a);
+    printf("%s:", str);
+    for (int i = 0; i < FVEC_LEN; i++) { // only the first FVEC_LEN lanes are printed
+      printf(" %d", data[i]);
+    }
+    printf("\n");
+  }
+};
+
+class FVEC_NAME {
+  friend class AVEC_NAME;
+#if FVEC_LEN==16
+  friend class avec16pd;
+#endif
+  FVEC_VEC_T val_;
+  VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {}
+public:
+  static const int VL = FVEC_LEN;
+  VEC_INLINE FVEC_NAME() {}
+  VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64)));
+    FVEC_SUFFIX(_mm512_store_)(data, a.val_);
+    return data[i];
+  }
+  VEC_INLINE static bool fast_compress() { return true; }
+
+  #define FVEC_MASK_BINFN_B(the_name)                                \
+    VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a,         \
+                                         const FVEC_NAME &b) {	     \
+      return FVEC_SUFFIX_MASK(_mm512_##the_name##_)(a.val_, b.val_); \
+    }                                                                \
+    VEC_INLINE static BVEC_NAME mask_##the_name(                     \
+        const BVEC_NAME &mask,                                       \
+        const FVEC_NAME &a, const FVEC_NAME &b                       \
+    ) {                                                              \
+      return FVEC_SUFFIX_MASK(_mm512_mask_##the_name##_)(            \
+        mask.val_, a.val_, b.val_);                                  \
+    }
+  FVEC_MASK_BINFN_B(cmple)
+  FVEC_MASK_BINFN_B(cmplt)
+  FVEC_MASK_BINFN_B(cmpneq)
+  FVEC_MASK_BINFN_B(cmpnle)
+  FVEC_MASK_BINFN_B(cmpnlt)
+
+  #define FVEC_UNFN_F(the_name)                                      \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) {       \
+      return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_);              \
+    }
+  FVEC_UNFN_F(abs)
+  FVEC_UNFN_F(exp)
+  FVEC_UNFN_F(invsqrt)
+  FVEC_UNFN_F(recip)
+  FVEC_UNFN_F(sqrt)
+
+  #define FVEC_MASK_UNFN_F(the_name)                                 \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a                                           \
+    ) {                                                              \
+      return FVEC_SUFFIX(_mm512_mask_##the_name##_)(                 \
+        src.val_, mask.val_, a.val_);                                \
+    }
+  FVEC_MASK_UNFN_F(cos)
+  FVEC_MASK_UNFN_F(recip)
+  FVEC_MASK_UNFN_F(sqrt)
+
+  #define FVEC_BINFN_F(the_name)                                     \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a,         \
+                                         const FVEC_NAME &b) {       \
+      return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_, b.val_);      \
+    }
+  FVEC_BINFN_F(max)
+  FVEC_BINFN_F(min)
+
+  #define FVEC_MASK_BINFN_F(the_name)                                \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a, const FVEC_NAME &b                       \
+    ) {                                                              \
+      return FVEC_SUFFIX(_mm512_mask_##the_name##_)(                 \
+        src.val_, mask.val_, a.val_, b.val_);                        \
+    }
+  FVEC_MASK_BINFN_F(add)
+  FVEC_MASK_BINFN_F(div)
+  FVEC_MASK_BINFN_F(mul)
+  FVEC_MASK_BINFN_F(sub)
+  VEC_INLINE static FVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_blend_)(mask.val_, a.val_, b.val_);
+  }
+
+  VEC_INLINE static FVEC_NAME mask_expand(
+      const FVEC_NAME &src, const BVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_expand_)(src.val_,
+      a.val_, b.val_);
+  }
+  VEC_INLINE static FVEC_NAME masku_compress(
+      const BVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_compress_)(FVEC_SUFFIX(_mm512_undefined_)(),
+						a.val_, b.val_);
+  }
+
+  VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) {
+    return FVEC_SUFFIX(_mm512_set1_)(a);
+  }
+  VEC_INLINE static FVEC_NAME setzero() {
+    return FVEC_SUFFIX(_mm512_setzero_)();
+  }
+  VEC_INLINE static FVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm512_undefined_)();
+  }
+
+  VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) {
+    return FVEC_SUFFIX(_mm512_load_)(mem);
+  }
+  VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, FVEC_SCAL_T * dest,
+				       const FVEC_NAME &a) {
+    FVEC_SUFFIX(_mm512_mask_storeu_)(dest, mask.val_, a.val_);
+  }
+  VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) {
+    FVEC_SUFFIX(_mm512_store_)(dest, a.val_);
+  }
+
+  VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, 
+				     const FVEC_SCAL_T * mem, 
+				     const int scale) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
+#   else
+    return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+  VEC_INLINE static FVEC_NAME mask_gather(
+      const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
+                       mem, sizeof(FVEC_SCAL_T));
+#   else
+    return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
+                       mem, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+
+  VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx, 
+					   const FVEC_SCAL_T * mem, 
+					   const int scale, 
+					   FVEC_NAME * out_0, 
+					   FVEC_NAME * out_1, 
+					   FVEC_NAME * out_2) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    *out_0 = FVEC_NAME::gather(idx, mem + 0, scale);
+    *out_1 = FVEC_NAME::gather(idx, mem + 1, scale);
+    *out_2 = FVEC_NAME::gather(idx, mem + 2, scale);
+  }
+  VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, 
+					   const FVEC_SCAL_T * mem, 
+					   const int scale, FVEC_NAME * out_0,
+					   FVEC_NAME * out_1, 
+					   FVEC_NAME * out_2, 
+					   FVEC_NAME * out_3) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    *out_0 = FVEC_NAME::gather(idx, mem + 0, scale);
+    *out_1 = FVEC_NAME::gather(idx, mem + 1, scale);
+    *out_2 = FVEC_NAME::gather(idx, mem + 2, scale);
+    *out_3 = FVEC_NAME::gather(idx, mem + 3, scale);
+  }
+
+  VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask, 
+						const FVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm512_mask_reduce_add_)(mask.val_, a.val_);
+  }
+  // Horizontal sum over all lanes of `a`.
+  VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) {
+    const FVEC_SCAL_T total = FVEC_SUFFIX(_mm512_reduce_add_)(a.val_);
+    return total;
+  }
+
+  // Reinterprets the bits of `a` as an integer vector (no numeric conversion).
+  // With 8 double lanes, the 0x5555 mask selects the even 32-bit elements
+  // (the low half of every 64-bit lane) and compresses them into the low
+  // elements, zeroing the rest. With 16 float lanes it is a plain bit cast.
+  VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) {
+#   if FVEC_LEN==8
+    return _mm512_maskz_compress_epi32(0x5555, _mm512_castpd_si512(a.val_));
+#   else
+    return _mm512_castps_si512(a.val_);
+#   endif
+  }
+
+  // Masked sine/cosine of `arg`. The sine vector is returned and the cosine
+  // vector is written through `cos`; `src_a` and `src_b` are forwarded as the
+  // intrinsic's two pass-through sources.
+  // NOTE(review): which source backs which output is set by the intrinsic's
+  // argument order — confirm against its documentation.
+  VEC_INLINE static FVEC_NAME mask_sincos(
+      FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b,
+      const BVEC_NAME &mask, const FVEC_NAME &arg
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_sincos_)(&cos->val_, src_a.val_, src_b.val_,
+      mask.val_, arg.val_);
+  }
+
+  // Generates a lane-wise binary operator `the_sym` that forwards to the
+  // _mm512_<the_name>_{pd,ps} intrinsic. Instantiated below for + - * /.
+  #define FVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \
+    return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_);            \
+  }
+  FVEC_BINOP(+, add)
+  FVEC_BINOP(-, sub)
+  FVEC_BINOP(*, mul)
+  FVEC_BINOP(/, div)
+
+  // Issues a T0-hint gather prefetch for the addresses mem + a[lane] *
+  // sizeof(FVEC_SCAL_T), using a full mask. Compiles to nothing unless the
+  // target defines __AVX512PF__.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    #ifdef __AVX512PF__
+    _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, 
+      sizeof(FVEC_SCAL_T), _MM_HINT_T0);
+    #endif
+  }
+};
+
+// Vector with the same register type as FVEC_NAME, exposed as the `avec`
+// typedef of intr_types when the accumulation type equals the compute type.
+// It provides only the operations defined here: masked gather, masked
+// scatter, subtraction and gather prefetch.
+class AVEC_NAME {
+  FVEC_VEC_T val_;
+  VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {}
+public:
+  // Wraps the register of a compute vector without conversion.
+  VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {}
+  // Returns a vector with unspecified contents.
+  VEC_INLINE static AVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm512_undefined_)();
+  }
+  // Masked gather from mem[idx[lane]]; `src` and `mask` are forwarded to the
+  // masked gather intrinsic. `scale` must equal sizeof(FVEC_SCAL_T).
+  VEC_INLINE static AVEC_NAME mask_gather(
+      const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
+                                                 mem, sizeof(FVEC_SCAL_T));
+#   else
+    return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
+                                               mem, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+  // Masked scatter of `a` to mem[idx[lane]].
+  // `scale` must equal sizeof(FVEC_SCAL_T).
+  VEC_INLINE static void mask_i32loscatter(
+      FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const AVEC_NAME &a, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
+                                           sizeof(FVEC_SCAL_T));
+#   else
+    FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
+                                         sizeof(FVEC_SCAL_T));
+#   endif
+  }
+
+  // Generates a lane-wise binary operator forwarding to _mm512_<the_name>_*.
+  #define AVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \
+    return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_);            \
+  }
+  AVEC_BINOP(-, sub)
+
+  // T0-hint gather prefetch with a full mask. The intrinsic belongs to
+  // AVX512PF, so it is guarded the same way as FVEC_NAME::gather_prefetch0
+  // and becomes a no-op on targets without that extension.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    #ifdef __AVX512PF__
+    _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem,
+                                      sizeof(FVEC_SCAL_T), _MM_HINT_T0);
+    #endif
+  }
+};
+
+#if FVEC_LEN==16
+// Sixteen-lane double-precision vector held as two __m512d halves (lanes
+// 0..7 in lo_, lanes 8..15 in hi_). Exposed as the `avec` typedef of
+// intr_types<float,double>.
+class avec16pd {
+  __m512d lo_, hi_;
+  VEC_INLINE avec16pd(const __m512d &lo, const __m512d &hi) : lo_(lo), hi_(hi)
+    {}
+  // Upper eight bits of a 16-lane mask, i.e. the mask for hi_.
+  VEC_INLINE static __mmask8 get_bvec_hi(__mmask16 a) {
+    return a >> 8;
+  }
+  // Permutes 128-bit blocks so the indices of lanes 8..15 land in the low
+  // half, where the "lo" gather/scatter intrinsics read them.
+  VEC_INLINE static __m512i get_ivec_hi(__m512i a) {
+    return _mm512_permute4f128_epi32(a, _MM_PERM_BADC);
+  }
+public:
+  // Widens the sixteen floats of `a` to doubles.
+  VEC_INLINE avec16pd(const FVEC_NAME &a) {
+    lo_ = _mm512_cvtpslo_pd(a.val_);
+    hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC));
+  }
+  // Returns a vector with unspecified contents.
+  VEC_INLINE static avec16pd undefined() {
+    return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd());
+  }
+  // Masked gather of doubles from mem[idx[lane]], performed as two 8-lane
+  // gathers. `scale` must equal sizeof(double).
+  VEC_INLINE static avec16pd mask_gather(
+      const avec16pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const double * mem, const int scale
+  ) {
+    assert(scale == sizeof(double));
+    __m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
+                                            sizeof(double));
+    __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
+                                            get_ivec_hi(idx.val_), mem,
+                                            sizeof(double));
+    return avec16pd(lo, hi);
+  }
+  // Masked scatter of doubles to mem[idx[lane]], performed as two 8-lane
+  // scatters. `scale` must equal sizeof(double).
+  VEC_INLINE static void mask_i32loscatter(
+      double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const avec16pd &a, const int scale
+  ) {
+    assert(scale == sizeof(double));
+    _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
+                                sizeof(double));
+    _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
+                                get_ivec_hi(idx.val_), a.hi_, sizeof(double));
+  }
+
+  // Generates a lane-wise binary operator applied to both halves.
+  #define AVEC2_BINOP(the_sym, the_name)                                    \
+    VEC_INLINE inline avec16pd operator the_sym(const avec16pd &b) const {  \
+    __m512d lo = _mm512_##the_name##_pd(this->lo_, b.lo_);                  \
+    __m512d hi = _mm512_##the_name##_pd(this->hi_, b.hi_);                  \
+    return avec16pd(lo, hi);                                                \
+  }
+  AVEC2_BINOP(-, sub)
+
+  // T0-hint gather prefetch with a full mask. The intrinsic belongs to
+  // AVX512PF, so it is guarded the same way as FVEC_NAME::gather_prefetch0
+  // and becomes a no-op on targets without that extension.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    #ifdef __AVX512PF__
+    _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem,
+                                      sizeof(double), _MM_HINT_T0);
+    #endif
+  }
+};
+#endif
+
+}
+
+
+#ifdef FVEC_FIRST_PASS
+
+// Maps a (compute type, accumulation type) pair to the matching mm512
+// vector classes. Only the three specializations below are defined.
+template<typename flt_t, typename acc_t>
+struct intr_types;
+
+// Double compute, double accumulate: 8-lane classes.
+template<>
+struct intr_types<double,double> {
+  typedef mm512::fvec8pd fvec;
+  typedef mm512::ivec8 ivec;
+  typedef mm512::bvec8 bvec;
+  typedef mm512::avec8pd avec;
+};
+
+// Float compute, float accumulate: 16-lane classes.
+template<>
+struct intr_types<float,float> {
+  typedef mm512::fvec16ps fvec;
+  typedef mm512::ivec16 ivec;
+  typedef mm512::bvec16 bvec;
+  typedef mm512::avec16ps avec;
+};
+
+// Float compute, double accumulate: 16-lane classes with the two-register
+// avec16pd accumulator.
+template<>
+struct intr_types<float,double> {
+  typedef mm512::fvec16ps fvec;
+  typedef mm512::ivec16 ivec;
+  typedef mm512::bvec16 bvec;
+  typedef mm512::avec16pd avec;
+};
+
+#endif
+
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_FIRST_PASS
+#  include "intel_intrinsics_airebo.h"
+#endif
+
+#endif
+
+#ifdef LMP_INTEL_AIREBO_256
+
+#include <cassert>
+#include <immintrin.h>
+#include <stdint.h>
+
+#define VEC_INLINE __attribute__((always_inline))
+
+
+// Per-pass configuration of the class-generating macros.
+// Without FVEC_FIRST_PASS: 4 double lanes (ivec4 / fvec4pd / bvec4 / avec4pd).
+// With FVEC_FIRST_PASS: the previous definitions are dropped and replaced by
+// 8 float lanes (ivec8 / fvec8ps / bvec8 / avec8ps).
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_LEN 4
+#  define FVEC_SUFFIX(a) a##pd
+#  define FVEC_MASK_T __m256d
+#  define FVEC_VEC_T __m256d
+#  define FVEC_SCAL_T double
+#  define IVEC_NAME ivec4
+#  define FVEC_NAME fvec4pd
+#  define BVEC_NAME bvec4
+#  define AVEC_NAME avec4pd
+#else
+#  undef FVEC_LEN
+#  undef FVEC_SUFFIX
+#  undef FVEC_SUFFIX_MASK
+#  undef FVEC_MASK_T
+#  undef FVEC_VEC_T
+#  undef FVEC_SCAL_T
+#  undef IVEC_NAME
+#  undef FVEC_NAME
+#  undef BVEC_NAME
+#  undef AVEC_NAME
+
+#  define FVEC_LEN 8
+#  define FVEC_SUFFIX(a) a##ps
+#  define FVEC_MASK_T __m256
+#  define FVEC_VEC_T __m256
+#  define FVEC_SCAL_T float
+#  define IVEC_NAME ivec8
+#  define FVEC_NAME fvec8ps
+#  define BVEC_NAME bvec8
+#  define AVEC_NAME avec8ps
+#endif
+
+
+
+namespace mm256 {
+
+//#define __AVX2__ __AVX2__
+
+#if !defined(__AVX2__) && !defined(FVEC_FIRST_PASS)
+
+// Function-body macro emulating a 256-bit integer binary operation on a
+// target without AVX2: splits parameters `a` and `b` into 128-bit halves,
+// applies the 128-bit intrinsic `op` to each pair, and returns the rejoined
+// result. Requires the enclosing function to name its parameters a and b.
+#define IVEC_EM_BIN(op) \
+  __m128i a_lo = _mm256_castsi256_si128(a);  \
+  __m128i b_lo = _mm256_castsi256_si128(b);  \
+  __m128i a_hi = _mm256_extractf128_si256(a, 1);  \
+  __m128i b_hi = _mm256_extractf128_si256(b, 1);  \
+  __m128i c_lo = op(a_lo, b_lo); \
+  __m128i c_hi = op(a_hi, b_hi); \
+  __m256i ret = _mm256_setr_m128i(c_lo, c_hi); \
+  return ret;
+
+// Emulated _mm256_add_epi32: lane-wise 32-bit addition.
+VEC_INLINE inline __m256i _cm256_add_epi32(const __m256i &a, const __m256i &b) {
+  IVEC_EM_BIN(_mm_add_epi32)
+}
+
+// Emulated _mm256_and_si256: bitwise AND.
+VEC_INLINE inline __m256i _cm256_and_si256(const __m256i &a, const __m256i &b) {
+  IVEC_EM_BIN(_mm_and_si128)
+}
+
+// Emulated _mm256_andnot_si256: (~a) & b.
+VEC_INLINE inline __m256i _cm256_andnot_si256(const __m256i &a, 
+					      const __m256i &b) {
+  IVEC_EM_BIN(_mm_andnot_si128)
+}
+
+// Emulated _mm256_cmpeq_epi32: lane-wise 32-bit equality mask.
+VEC_INLINE inline __m256i _cm256_cmpeq_epi32(const __m256i &a, 
+					     const __m256i &b) {
+  IVEC_EM_BIN(_mm_cmpeq_epi32)
+}
+
+// Emulated _mm256_cmpgt_epi32: lane-wise signed 32-bit a > b mask.
+VEC_INLINE inline __m256i _cm256_cmpgt_epi32(const __m256i &a, 
+					     const __m256i &b) {
+  IVEC_EM_BIN(_mm_cmpgt_epi32)
+}
+
+// Emulated _mm256_cvtepu8_epi32: zero-extends the low eight bytes of `a` to
+// eight 32-bit integers using two 128-bit conversions.
+VEC_INLINE inline __m256i _cm256_cvtepu8_epi32(const __m128i &a) {
+  // Bring 32-bit element 1 (bytes 4..7) down to element 0 for the upper half.
+  const __m128 as_ps = _mm_castsi128_ps(a);
+  const __m128i upper_bytes = _mm_castps_si128(_mm_permute_ps(as_ps, 1));
+  const __m128i widened_lo = _mm_cvtepu8_epi32(a);
+  const __m128i widened_hi = _mm_cvtepu8_epi32(upper_bytes);
+  return _mm256_setr_m128i(widened_lo, widened_hi);
+}
+
+// Function-body macro emulating a 256-bit integer operation lane by lane:
+// spills parameters `a` and `b` to aligned int buffers, evaluates `op` for
+// each lane index i (op may use buf_a, buf_b and i), and reloads the result.
+// Requires the enclosing function to name its parameters a and b.
+#define IVEC_EM_SCAL(op)                       \
+  int buf_a[8] __attribute__((aligned(32)));   \
+  int buf_b[8] __attribute__((aligned(32)));   \
+  int dest[8] __attribute__((aligned(32)));    \
+  _mm256_store_si256((__m256i*)buf_a, a);      \
+  _mm256_store_si256((__m256i*)buf_b, b);      \
+  for (int i = 0; i < 8; i++) {		       \
+    dest[i] = op;			       \
+  }					       \
+  return _mm256_load_si256((__m256i*) dest);
+
+// Emulated _mm256_permutevar8x32_epi32: dest[i] = a[b[i] mod 8].
+VEC_INLINE inline __m256i _cm256_permutevar8x32_epi32(const __m256i &a,
+                                                      const __m256i &b) {
+  // The instruction only looks at the low three bits of each index; masking
+  // reproduces that and keeps the lookup inside buf_a for any index value.
+  IVEC_EM_SCAL(buf_a[buf_b[i] & 7])
+}
+
+// Emulated _mm256_mullo_epi32: lane-wise 32-bit multiply keeping the low 32
+// bits, computed on the two 128-bit halves via IVEC_EM_BIN.
+VEC_INLINE inline __m256i _cm256_mullo_epi32(__m256i a, __m256i b) {
+  IVEC_EM_BIN(_mm_mullo_epi32)
+}
+
+// Emulated _mm256_srlv_epi32: per-lane logical right shift of `a` by the
+// count in the matching lane of `b`.
+VEC_INLINE inline __m256i _cm256_srlv_epi32(__m256i a, __m256i b) {
+  // The shift is done on the unsigned bit pattern so zeros are shifted in,
+  // and counts outside 0..31 yield 0 as the instruction does (a plain C
+  // shift by such a count would be undefined).
+  IVEC_EM_SCAL(((unsigned int) buf_b[i] < 32u)
+               ? (int) ((unsigned int) buf_a[i] >> buf_b[i]) : 0)
+}
+
+
+// Emulated _mm256_permutevar8x32_ps: reuses the integer permute emulation on
+// the bit pattern of `a`.
+VEC_INLINE inline __m256 _cm256_permutevar8x32_ps(const __m256 &a,
+                                                  const __m256i &b) {
+  const __m256i a_bits = _mm256_castps_si256(a);
+  const __m256i shuffled = _cm256_permutevar8x32_epi32(a_bits, b);
+  return _mm256_castsi256_ps(shuffled);
+}
+
+// Emulated _mm_maskload_epi32: performs the masked load through the float
+// variant and reinterprets the result as integers.
+VEC_INLINE inline __m128i _cm_maskload_epi32(int const * mem, __m128i mask) {
+  const __m128 loaded = _mm_maskload_ps((float const *) mem, mask);
+  return _mm_castps_si128(loaded);
+}
+
+// Emulated _mm256_maskload_epi32: two 128-bit masked loads, one for
+// mem[0..3] and one for mem[4..7], rejoined into a 256-bit vector.
+VEC_INLINE inline __m256i _cm256_maskload_epi32(int const * mem, __m256i mask) {
+  const __m128i mask_lo = _mm256_castsi256_si128(mask);
+  const __m128i mask_hi = _mm256_extractf128_si256(mask, 1);
+  const __m128i loaded_lo = _cm_maskload_epi32(mem, mask_lo);
+  const __m128i loaded_hi = _cm_maskload_epi32(mem + 4, mask_hi);
+  return _mm256_setr_m128i(loaded_lo, loaded_hi);
+}
+
+
+// Emulated _mm256_mask_i32gather_epi32: lanes with a non-zero mask element
+// are loaded from base_addr[index[lane]]; the others keep the value from
+// `src`. `scale` must equal sizeof(int).
+VEC_INLINE inline __m256i _cm256_mask_i32gather_epi32(__m256i src,
+                                                      int const * base_addr,
+                                                      __m256i index,
+                                                      __m256i mask,
+                                                      const int scale) {
+  assert(scale == sizeof(int));
+  int lane_index[8] __attribute__((aligned(32)));
+  int lane_mask[8] __attribute__((aligned(32)));
+  int result[8] __attribute__((aligned(32)));
+  // Start from src so unselected lanes pass through unchanged.
+  _mm256_store_si256((__m256i*) result, src);
+  _mm256_store_si256((__m256i*) lane_index, index);
+  _mm256_store_si256((__m256i*) lane_mask, mask);
+  for (int lane = 0; lane < 8; lane++) {
+    if (lane_mask[lane] == 0) continue;
+    result[lane] = base_addr[lane_index[lane]];
+  }
+  return _mm256_load_si256((__m256i*) result);
+}
+
+// Emulated _mm256_mask_i32gather_ps: delegates to the 32-bit integer gather
+// emulation on the bit patterns of `src` and `mask`.
+VEC_INLINE inline __m256 _cm256_mask_i32gather_ps(__m256 src,
+                                                  float const * base_addr,
+                                                  __m256i index, __m256 mask,
+                                                  const int scale) {
+  const __m256i src_bits = _mm256_castps_si256(src);
+  const __m256i mask_bits = _mm256_castps_si256(mask);
+  const __m256i gathered = _cm256_mask_i32gather_epi32(
+    src_bits, (const int *) base_addr, index, mask_bits, scale);
+  return _mm256_castsi256_ps(gathered);
+}
+
+// Emulated _mm256_mask_i32gather_pd: lanes whose mask has a non-zero low
+// 32-bit word are loaded from base_addr[index[lane]]; the others keep the
+// value from `src`. `scale` must equal sizeof(double).
+VEC_INLINE inline __m256d _cm256_mask_i32gather_pd(__m256d src,
+                                                   double const * base_addr,
+                                                   __m128i index, __m256d mask,
+                                                   const int scale) {
+  assert(scale == sizeof(double));
+  int lane_index[4] __attribute__((aligned(32)));
+  int mask_words[8] __attribute__((aligned(32)));
+  double result[4] __attribute__((aligned(32)));
+  // Start from src so unselected lanes pass through unchanged.
+  _mm256_store_pd(result, src);
+  _mm_store_si128((__m128i*) lane_index, index);
+  _mm256_store_si256((__m256i*) mask_words, _mm256_castpd_si256(mask));
+  for (int lane = 0; lane < 4; lane++) {
+    // Each 64-bit mask lane spans two ints; the low one is inspected.
+    if (mask_words[2*lane] == 0) continue;
+    result[lane] = base_addr[lane_index[lane]];
+  }
+  return _mm256_load_pd(result);
+}
+
+// Emulated _mm256_i32gather_epi32: result[lane] = base_addr[index[lane]]
+// for all eight lanes. `scale` must equal sizeof(int).
+VEC_INLINE inline __m256i _cm256_i32gather_epi32(int const * base_addr,
+                                                 __m256i index,
+                                                 const int scale) {
+  assert(scale == sizeof(int));
+  int lane_index[8] __attribute__((aligned(32)));
+  int result[8] __attribute__((aligned(32)));
+  _mm256_store_si256((__m256i*) lane_index, index);
+  for (int lane = 0; lane < 8; lane++) {
+    result[lane] = base_addr[lane_index[lane]];
+  }
+  return _mm256_load_si256((__m256i*) result);
+}
+
+// Emulated _mm256_i32gather_ps: delegates to the 32-bit integer gather
+// emulation and reinterprets the result as floats.
+VEC_INLINE inline __m256 _cm256_i32gather_ps(float const * base_addr,
+                                             __m256i index, const int scale) {
+  const __m256i gathered =
+    _cm256_i32gather_epi32((const int *) base_addr, index, scale);
+  return _mm256_castsi256_ps(gathered);
+}
+
+// Emulated _mm256_i32gather_pd: result[lane] = base_addr[index[lane]] for
+// all four lanes. `scale` must equal sizeof(double).
+VEC_INLINE inline __m256d _cm256_i32gather_pd(double const * base_addr,
+                                              __m128i index, const int scale) {
+  assert(scale == sizeof(double));
+  int lane_index[4] __attribute__((aligned(32)));
+  double result[4] __attribute__((aligned(32)));
+  _mm_store_si128((__m128i*) lane_index, index);
+  for (int lane = 0; lane < 4; lane++) {
+    result[lane] = base_addr[lane_index[lane]];
+  }
+  return _mm256_load_pd(result);
+}
+
+// Software parallel-bit-deposit (stands in for _pdep_u64): the low bits of
+// `tmp` are placed, in order, at the positions of the set bits of `mask`;
+// all other result bits are zero.
+VEC_INLINE inline uint64_t _cdep_u64(uint64_t tmp, uint64_t mask) {
+  uint64_t deposited = 0;
+  unsigned int next_src_bit = 0;
+  for (unsigned int pos = 0; pos < 64; pos++) {
+    const uint64_t pos_bit = static_cast<uint64_t>(1) << pos;
+    if ((mask & pos_bit) == 0) continue;
+    if ((tmp >> next_src_bit) & 1) deposited |= pos_bit;
+    next_src_bit++;
+  }
+  return deposited;
+}
+
+// Software parallel-bit-extract (stands in for _pext_u64): the bits of `tmp`
+// at the positions of the set bits of `mask` are packed, in order, into the
+// low bits of the result.
+VEC_INLINE inline uint64_t _cext_u64(uint64_t tmp, uint64_t mask) {
+  uint64_t extracted = 0;
+  unsigned int next_dst_bit = 0;
+  for (unsigned int pos = 0; pos < 64; pos++) {
+    const uint64_t pos_bit = static_cast<uint64_t>(1) << pos;
+    if ((mask & pos_bit) == 0) continue;
+    if (tmp & pos_bit) extracted |= static_cast<uint64_t>(1) << next_dst_bit;
+    next_dst_bit++;
+  }
+  return extracted;
+}
+
+// Redirect the AVX2/BMI2 intrinsic names used by the classes below to the
+// software emulations defined above (this block is compiled only when
+// __AVX2__ is not defined).
+#define _mm256_add_epi32 _cm256_add_epi32
+#define _mm256_and_si256 _cm256_and_si256
+#define _mm256_andnot_si256 _cm256_andnot_si256
+#define _mm256_cmpeq_epi32 _cm256_cmpeq_epi32
+#define _mm256_cmpgt_epi32 _cm256_cmpgt_epi32
+#define _mm256_permutevar8x32_epi32 _cm256_permutevar8x32_epi32
+#define _mm256_permutevar8x32_ps _cm256_permutevar8x32_ps
+#define _mm_maskload_epi32 _cm_maskload_epi32
+#define _mm256_maskload_epi32 _cm256_maskload_epi32
+#define _mm256_mullo_epi32 _cm256_mullo_epi32
+#define _mm256_srlv_epi32 _cm256_srlv_epi32
+#define _mm256_mask_i32gather_epi32 _cm256_mask_i32gather_epi32
+#define _mm256_mask_i32gather_pd _cm256_mask_i32gather_pd
+#define _mm256_mask_i32gather_ps _cm256_mask_i32gather_ps
+#define _mm256_i32gather_epi32 _cm256_i32gather_epi32
+#define _mm256_i32gather_pd _cm256_i32gather_pd
+#define _mm256_i32gather_ps _cm256_i32gather_ps
+#define _pdep_u64 _cdep_u64
+#define _pext_u64 _cext_u64
+#define _mm256_cvtepu8_epi32 _cm256_cvtepu8_epi32
+
+#endif
+
+#ifndef FVEC_FIRST_PASS
+
+// Packs the 32-bit lanes of `a` selected by `mask` into the low lanes of the
+// result, preserving their order. A lane is selected when the sign bit of
+// its mask element is set. Lanes beyond the number of selected elements
+// carry no meaningful data.
+VEC_INLINE inline __m256 _mm256_compress_ps(__m256 mask, __m256 a) {
+# ifdef __AVX2__
+  uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask),
+                                     0x0101010101010101);
+  // unpack each bit to a byte
+  expanded_mask *= 0xFF;   // mask |= mask<<1 | mask<<2 | ... | mask<<7;
+  // the identity shuffle for vpermps, packed to one index per byte
+  const uint64_t identity_indices = 0x0706050403020100;
+  uint64_t wanted_indices = _pext_u64(identity_indices, expanded_mask);
+
+  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
+  __m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
+
+  return _mm256_permutevar8x32_ps(a, shufmask);
+# else
+  int mask_buf[8] __attribute__((aligned(32)));
+  float a_buf[8] __attribute__((aligned(32)));
+  // Zero-filled so the unused tail is never read uninitialized.
+  float dst_buf[8] __attribute__((aligned(32))) = {0};
+  _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask));
+  _mm256_store_ps(a_buf, a);
+  int k = 0;
+  for (int i = 0; i < 8; i++) {
+    // Test the spilled mask word (sign bit, matching _mm256_movemask_ps in
+    // the AVX2 branch) rather than subscripting the vector register.
+    if (mask_buf[i] < 0) {
+      dst_buf[k++] = a_buf[i];
+    }
+  }
+  return _mm256_load_ps(dst_buf);
+# endif
+}
+// Inverse of _mm256_compress_ps: consecutive low lanes of `a` are placed, in
+// order, into the lanes selected by `mask`. A lane is selected when the sign
+// bit of its mask element is set. Callers in this file overwrite the
+// unselected lanes afterwards.
+VEC_INLINE inline __m256 _mm256_expand_ps(__m256 mask, __m256 a) {
+# ifdef __AVX2__
+  uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask),
+                                     0x0101010101010101);
+  // widen each mask bit to a full byte
+  expanded_mask *= 0xFF;
+  const uint64_t identity_indices = 0x0706050403020100;
+  // deposit source indices 0,1,2,... at the selected byte positions
+  uint64_t wanted_indices = _pdep_u64(identity_indices, expanded_mask);
+  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
+  __m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
+  return _mm256_permutevar8x32_ps(a, shufmask);
+# else
+  int mask_buf[8] __attribute__((aligned(32)));
+  float a_buf[8] __attribute__((aligned(32)));
+  float dst_buf[8] __attribute__((aligned(32))) = {0};
+  _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask));
+  _mm256_store_ps(a_buf, a);
+  int k = 0;
+  for (int i = 0; i < 8; i++) {
+    // Test the spilled mask word (sign bit, matching _mm256_movemask_ps in
+    // the AVX2 branch) rather than subscripting the vector register.
+    if (mask_buf[i] < 0) {
+      dst_buf[i] = a_buf[k++];
+    }
+  }
+  return _mm256_load_ps(dst_buf);
+# endif
+}
+
+// Double-precision compress, implemented by running the float compress on
+// the bit patterns of `mask` and `a`.
+VEC_INLINE inline __m256d _mm256_compress_pd(__m256d mask, __m256d a) {
+  const __m256 mask_ps = _mm256_castpd_ps(mask);
+  const __m256 a_ps = _mm256_castpd_ps(a);
+  return _mm256_castps_pd(_mm256_compress_ps(mask_ps, a_ps));
+}
+// Double-precision expand, implemented by running the float expand on the
+// bit patterns of `mask` and `a`.
+VEC_INLINE inline __m256d _mm256_expand_pd(__m256d mask, __m256d a) {
+  const __m256 mask_ps = _mm256_castpd_ps(mask);
+  const __m256 a_ps = _mm256_castpd_ps(a);
+  return _mm256_castps_pd(_mm256_expand_ps(mask_ps, a_ps));
+}
+#endif
+
+
+class FVEC_NAME;
+class IVEC_NAME;
+class AVEC_NAME;
+// Lane mask for the 256-bit classes. Without mask registers, the mask is a
+// full vector (FVEC_MASK_T); the tables and helpers in this class produce
+// lanes that are either all ones or all zeros.
+class BVEC_NAME {
+  friend class FVEC_NAME;
+  friend class IVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==8
+  friend class avec8pd;
+# endif
+  FVEC_MASK_T val_;
+  VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {}
+  VEC_INLINE BVEC_NAME(const __m256i &v)
+    : val_(FVEC_SUFFIX(_mm256_castsi256_)(v)) {}
+public:
+  // Leaves the mask uninitialized.
+  VEC_INLINE BVEC_NAME() {}
+  // a & b
+  VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_and_)(a.val_, b.val_);
+  }
+  // (~a) & b
+  VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_andnot_)(a.val_, b.val_);
+  }
+  // Packs the lanes of `a` selected by `mask` into the low lanes.
+  VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask,
+                                             const BVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm256_compress_)(mask.val_, a.val_);
+  }
+  // Spreads the low lanes of `a` over the lanes selected by `mask`; the
+  // unselected lanes are taken from `src`.
+  VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src,
+                                          const BVEC_NAME &mask,
+                                          const BVEC_NAME &a) {
+    FVEC_MASK_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, a.val_);
+    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
+    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
+                                  (mask.val_, src.val_));
+    return ret;
+  }
+  // All lanes set (any value compared equal to itself gives all ones).
+  VEC_INLINE static BVEC_NAME full() {
+    __m256i a = _mm256_undefined_si256();
+    return FVEC_SUFFIX(_mm256_castsi256_)(_mm256_cmpeq_epi32(a, a));
+  }
+  // No lane set.
+  VEC_INLINE static BVEC_NAME empty() {
+    return FVEC_SUFFIX(_mm256_setzero_)();
+  }
+  // Mask with the first n lanes set. n indexes the table directly and must
+  // lie in 0..FVEC_LEN (not checked).
+  VEC_INLINE static BVEC_NAME only(int n) {
+    // Rows are read with an aligned 32-byte load; each row is exactly 32
+    // bytes, so aligning the table aligns every row.
+    static const unsigned int FULL_ps = (unsigned int) -1;
+    static const unsigned int LUT_ps[9][8] __attribute__((aligned(32))) = {
+      {0, 0, 0, 0, 0, 0, 0, 0},
+      {FULL_ps, 0, 0, 0, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, 0, 0, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+    };
+    static const unsigned long long FULL_pd = (unsigned long long) -1;
+    static const unsigned long long LUT_pd[5][4]
+        __attribute__((aligned(32))) = {
+      {0, 0, 0, 0},
+      {FULL_pd, 0, 0, 0},
+      {FULL_pd, FULL_pd, 0, 0},
+      {FULL_pd, FULL_pd, FULL_pd, 0},
+      {FULL_pd, FULL_pd, FULL_pd, FULL_pd},
+    };
+    return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]);
+  }
+  // Mask with every lane from index n upward set. n indexes the table
+  // directly and must lie in 0..FVEC_LEN (not checked).
+  VEC_INLINE static BVEC_NAME after(int n) {
+    // Rows are read with an aligned 32-byte load; each row is exactly 32
+    // bytes, so aligning the table aligns every row.
+    static const unsigned int FULL_ps = (unsigned int) -1;
+    static const unsigned int LUT_ps[9][8] __attribute__((aligned(32))) = {
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, 0, 0, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, 0, 0, 0, FULL_ps},
+      {0, 0, 0, 0, 0, 0, 0, 0},
+    };
+    static const unsigned long long FULL_pd = (unsigned long long) -1;
+    static const unsigned long long LUT_pd[5][4]
+        __attribute__((aligned(32))) = {
+      {FULL_pd, FULL_pd, FULL_pd, FULL_pd},
+      {0, FULL_pd, FULL_pd, FULL_pd},
+      {0, 0, FULL_pd, FULL_pd},
+      {0, 0, 0, FULL_pd},
+      {0, 0, 0, 0},
+    };
+    return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]);
+  }
+  // Mask with `only_` lanes set starting at lane `after_`.
+  VEC_INLINE static BVEC_NAME onlyafter(int only_, int after_) {
+    return kand(after(after_), only(after_ + only_));
+  }
+  // Number of set lanes (counts the per-lane sign bits).
+  VEC_INLINE static int popcnt(const BVEC_NAME &a) {
+    return _popcnt32(FVEC_SUFFIX(_mm256_movemask_)(a.val_));
+  }
+  // True when no lane is set.
+  VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm256_testz_)(a.val_, a.val_);
+  }
+  // True when at least one lane is set.
+  VEC_INLINE static bool test_any_set(const BVEC_NAME &a) {
+    return ! test_all_unset(a);
+  }
+  // True when lane i is set; i must be below FVEC_LEN.
+  VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    return FVEC_SUFFIX(_mm256_movemask_)(a.val_) & (1 << i);
+  }
+  VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const {
+    return FVEC_SUFFIX(_mm256_and_)(val_, b.val_);
+  }
+  VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const {
+    return FVEC_SUFFIX(_mm256_or_)(val_, b.val_);
+  }
+  // Complement, computed as (~val_) & full().
+  VEC_INLINE BVEC_NAME operator ~() const {
+    return FVEC_SUFFIX(_mm256_andnot_)(val_, full().val_);
+  }
+};
+
+// 32-bit integer vector paired with FVEC_NAME. The data lives in a __m256i.
+// With FVEC_LEN==4 the logical element i is read from 32-bit slot 2*i (see
+// store() and mask_compressstore()), and maskz_loadu() writes each loaded
+// value into both slots of its pair.
+class IVEC_NAME {
+  friend class FVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==8
+  friend class avec8pd;
+# endif
+  __m256i val_;
+  VEC_INLINE IVEC_NAME(const __m256i &v) : val_(v) {}
+  // Bit cast from the floating-point register type to __m256i.
+  VEC_INLINE static __m256i to(const FVEC_VEC_T &a) {
+#   if FVEC_LEN==4
+    return _mm256_castpd_si256(a);
+#   else
+    return _mm256_castps_si256(a);
+#   endif
+  }
+  // Bit cast from __m256i to the floating-point register type.
+  VEC_INLINE static FVEC_VEC_T from(const __m256i &a) {
+    return FVEC_SUFFIX(_mm256_castsi256_)(a);
+  }
+public:
+  // NOTE(review): fixed at 8 even when FVEC_LEN is 4, whereas FVEC_NAME::VL
+  // is FVEC_LEN — confirm this is intended.
+  static const int VL = 8;
+  // Leaves the vector uninitialized.
+  VEC_INLINE IVEC_NAME() {}
+
+  // Generates a comparison <the_name>(a, b) returning a lane mask, plus a
+  // mask_<the_name>(mask, a, b) variant that ANDs the result with `mask`.
+  #define IVEC_MASK_BINFN_B(the_name)                                \
+    VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a,         \
+                                         const IVEC_NAME &b) {	     \
+      return _mm256_##the_name##_epi32(a.val_, b.val_);              \
+    }                                                                \
+    VEC_INLINE static BVEC_NAME mask_##the_name(                     \
+        const BVEC_NAME &mask,                                       \
+        const IVEC_NAME &a, const IVEC_NAME &b                       \
+    ) {                                                              \
+      BVEC_NAME ret = _mm256_##the_name##_epi32(                     \
+        a.val_, b.val_);                                             \
+      return mask & ret;                                             \
+    }
+  IVEC_MASK_BINFN_B(cmpeq)
+  IVEC_MASK_BINFN_B(cmpgt)
+
+  // a < b, built from cmpgt(b, a) with the equal lanes cleared.
+  VEC_INLINE static __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
+    __m256i le = _mm256_cmpgt_epi32(b, a);
+    __m256i eq = _mm256_cmpeq_epi32(a, b);
+    return _mm256_andnot_si256(eq, le);
+  }
+
+  // a != b, computed as the complement of cmpeq against an all-ones vector.
+  VEC_INLINE static __m256i _mm256_cmpneq_epi32(__m256i a, __m256i b) {
+    __m256i eq = _mm256_cmpeq_epi32(a, b);
+    __m256i t = _mm256_undefined_si256();
+    __m256i f = _mm256_cmpeq_epi32(t, t);
+    return _mm256_andnot_si256(eq, f);
+  }
+
+  IVEC_MASK_BINFN_B(cmplt)
+  IVEC_MASK_BINFN_B(cmpneq)
+  #undef IVEC_MASK_BINFN_B
+
+  // Per lane: b where the mask is set, a otherwise.
+  VEC_INLINE static IVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return to(FVEC_SUFFIX(_mm256_blendv_)(from(a.val_), from(b.val_), 
+              mask.val_));
+  }
+  // Generates mask_<the_name>(src, mask, a, b): the operation result where
+  // the mask is set, src elsewhere.
+  #define IVEC_MASK_BINFN_I(the_name)                                \
+    VEC_INLINE static IVEC_NAME mask_##the_name(                     \
+        const IVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const IVEC_NAME &a, const IVEC_NAME &b                       \
+    ) {                                                              \
+      IVEC_NAME ret = _mm256_##the_name##_epi32(                     \
+						a.val_, b.val_);     \
+	return mask_blend(mask, src, ret);			     \
+    }
+  IVEC_MASK_BINFN_I(add)
+  #undef IVEC_MASK_BINFN_I
+
+  // Generates an unmasked lane-wise binary function <the_name>(a, b).
+  #define IVEC_BINFN_I(the_name)                                     \
+    VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a,         \
+					 const IVEC_NAME &b) {	     \
+      return _mm256_##the_name##_epi32(a.val_, b.val_);              \
+    }
+  IVEC_BINFN_I(mullo)
+  IVEC_BINFN_I(srlv)
+  #undef IVEC_BINFN_I
+  // Bitwise AND.
+  VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) {
+    return _mm256_and_si256(a.val_, b.val_);
+  }
+
+  // Packs the lanes of `b` selected by `mask` into the low lanes.
+  VEC_INLINE static IVEC_NAME masku_compress(const BVEC_NAME &mask, 
+					     const IVEC_NAME &b) {
+    return to(FVEC_SUFFIX(_mm256_compress_)(mask.val_, from(b.val_)));
+  }
+  // Spreads the low lanes of `b` over the lanes selected by `mask`; the
+  // unselected lanes are taken from `src`.
+  VEC_INLINE static IVEC_NAME mask_expand(
+      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &b
+  ) {
+    FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, from(b.val_));
+    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
+    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
+				    (mask.val_, from(src.val_)));
+    return to(ret);
+  }
+
+  // Writes the vector to `dest` with an aligned 256-bit store, so `dest`
+  // must be 32-byte aligned and hold eight ints. With FVEC_LEN==4 the four
+  // logical elements (slots 0, 2, 4, 6) are then moved to dest[0..3].
+  VEC_INLINE static void store(int * dest, const IVEC_NAME &src) {
+    _mm256_store_si256((__m256i*)dest, src.val_);
+#   if FVEC_LEN==4
+    dest[1] = dest[2];
+    dest[2] = dest[4];
+    dest[3] = dest[6];
+#   endif
+  }
+
+  // Returns logical element b of `a` (no bounds check).
+  VEC_INLINE static int at(const IVEC_NAME &a, int b) {
+    int data[8] __attribute__((aligned(32)));
+    store(data, a);
+    return data[b];
+  }
+
+  // Debug helper: prints "<str>: e0 e1 ..." for the FVEC_LEN logical elements.
+  VEC_INLINE static void print(const char * str, const IVEC_NAME &a) {
+    int data[8] __attribute__((aligned(32)));
+    store(data, a);
+    printf("%s:", str);
+    for (int i = 0; i < FVEC_LEN; i++) {
+      printf(" %d", data[i]);
+    }
+    printf("\n");
+  }
+
+  // Masked load of FVEC_LEN consecutive ints from `src`. With FVEC_LEN==4
+  // the mask is reduced to one 32-bit word per lane and each loaded value is
+  // written to both slots of its pair; the scalar fallback leaves masked-out
+  // lanes at zero.
+  VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, 
+					  const int * src) {
+    FVEC_VEC_T mask_val = mask.val_;
+#   if FVEC_LEN==4
+#    ifdef __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 2, 4, 6, 0, 0, 0, 0};
+    __m256 m = _mm256_castpd_ps(mask_val);
+    m = _mm256_permutevar8x32_ps(m, _mm256_load_si256((__m256i*)mask_shuffle));
+    __m128i ret = _mm_maskload_epi32(src, 
+       _mm256_castsi256_si128(_mm256_castps_si256(m)));
+    static const unsigned int load_shuffle[8] __attribute__((aligned(32))) =
+      {0, 0, 1, 1, 2, 2, 3, 3};
+    return _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ret), 
+      _mm256_load_si256((__m256i*)load_shuffle));
+#    else
+    int dest[8] __attribute__((aligned(32))) = {0};
+    int mask_buf[8] __attribute__((aligned(32)));
+    _mm256_store_pd((double*) mask_buf, mask.val_);
+    for (int i = 0; i < 4; i++) {
+      if (mask_buf[2*i]) {
+        int val = src[i];
+        dest[2*i+0] = val;
+        dest[2*i+1] = val;
+      }
+    }
+    return _mm256_load_si256((__m256i*) dest);
+#    endif
+#   else
+    return _mm256_maskload_epi32(src, to(mask_val));
+#   endif
+  }
+
+  // Masked gather of ints from mem[idx[lane]], with `src` and `mask`
+  // forwarded to the gather. `scale` must equal sizeof(int).
+  VEC_INLINE static IVEC_NAME mask_gather(
+      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, 
+      const int * mem, const int scale
+  ) {
+    assert(scale == sizeof(int));
+    return _mm256_mask_i32gather_epi32(src.val_, mem, idx.val_, to(mask.val_), 
+				       sizeof(int));
+  }
+
+  // Writes the logical elements of `src` whose mask lane is set to
+  // consecutive positions of `dest`, in lane order.
+  VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest,
+					    const IVEC_NAME &src) {
+    int buf[8] __attribute__((aligned(64)));
+    const int stride = FVEC_LEN==4 ? 2 : 1;
+    _mm256_store_si256((__m256i*)buf, src.val_);
+    int mask_val = FVEC_SUFFIX(_mm256_movemask_)(mask.val_);
+    int k = 0;
+    #pragma unroll
+    for (int i = 0; i < FVEC_LEN; i++) {
+      if (mask_val & (1 << i))
+        dest[k++] = buf[stride*i];
+    }
+  }
+
+  // Broadcasts i to every 32-bit slot.
+  VEC_INLINE static IVEC_NAME set1(int i) {
+    return _mm256_set1_epi32(i);
+  }
+  // All slots zero.
+  VEC_INLINE static IVEC_NAME setzero() {
+    return _mm256_setzero_si256();
+  }
+  // Unspecified contents.
+  VEC_INLINE static IVEC_NAME undefined() {
+    return _mm256_undefined_si256();
+  }
+
+  // Lane-wise 32-bit addition.
+  VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const {
+    return _mm256_add_epi32(this->val_, b.val_);
+  }
+};
+
+class FVEC_NAME {
+  friend class AVEC_NAME;
+#if FVEC_LEN==8
+  friend class avec8pd;
+#endif
+  FVEC_VEC_T val_;
+  VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {}
+public:
+  static const int VL = FVEC_LEN;
+  // Compile-time flag: true when the target defines __AVX2__, __MIC__ or
+  // __AVX512F__, false otherwise.
+# if defined(__AVX2__) || defined(__MIC__) || defined(__AVX512F__)
+  VEC_INLINE static bool fast_compress() { return true; }
+# else
+  VEC_INLINE static bool fast_compress() { return false; }
+# endif
+  VEC_INLINE FVEC_NAME() {}
+  // Returns lane i of `a` by spilling the vector to an aligned buffer.
+  // i must be below FVEC_LEN.
+  VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    FVEC_SCAL_T lanes[FVEC_LEN] __attribute__((aligned(64)));
+    FVEC_SUFFIX(_mm256_store_)(lanes, a.val_);
+    const FVEC_SCAL_T picked = lanes[i];
+    return picked;
+  }
+
+  // Generates a comparison <the_name>(a, b) using _mm256_cmp_{pd,ps} with
+  // predicate `the_imm`, plus a mask_<the_name>(mask, a, b) variant that ANDs
+  // the result with `mask`.
+  #define FVEC_MASK_BINFN_B(the_name, the_imm)				\
+    VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a,		\
+					 const FVEC_NAME &b) {		\
+      return FVEC_SUFFIX(_mm256_cmp_)(a.val_, b.val_, the_imm);         \
+    }									\
+    VEC_INLINE static BVEC_NAME mask_##the_name(                        \
+        const BVEC_NAME &mask,                                          \
+        const FVEC_NAME &a, const FVEC_NAME &b                          \
+    ) {                                                                 \
+      BVEC_NAME ret = FVEC_SUFFIX(_mm256_cmp_)(                         \
+        a.val_, b.val_, the_imm);                                       \
+      return mask & ret;						\
+    }
+  FVEC_MASK_BINFN_B(cmple, _CMP_LE_OS)
+  FVEC_MASK_BINFN_B(cmplt, _CMP_LT_OS)
+  FVEC_MASK_BINFN_B(cmpneq, _CMP_NEQ_UQ)
+  FVEC_MASK_BINFN_B(cmpnle, _CMP_NLE_US)
+  FVEC_MASK_BINFN_B(cmpnlt, _CMP_NLT_US)
+  #undef FVEC_MASK_BINFN_B
+
+  // Lane-wise reciprocal of doubles, computed as 1 / a with a full division.
+  VEC_INLINE static __m256d _mm256_recip_pd(__m256d a) {
+    return _mm256_div_pd(_mm256_set1_pd(1), a);
+  }
+  // Lane-wise reciprocal of floats via _mm256_rcp_ps, which returns an
+  // approximation rather than an exact division (unlike _mm256_recip_pd).
+  VEC_INLINE static __m256 _mm256_recip_ps(__m256 a) {
+    return _mm256_rcp_ps(a);
+  }
+  // Lane-wise absolute value of doubles: clears the sign bit of every lane.
+  VEC_INLINE static __m256d _mm256_abs_pd(__m256d a) {
+    // Build the mask in a register; the previous version did an aligned
+    // 32-byte load from a local array with no alignment guarantee.
+    const __m256d abs_mask =
+      _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL));
+    return _mm256_and_pd(abs_mask, a);
+  }
+  // Lane-wise absolute value of floats: clears the sign bit of every lane.
+  VEC_INLINE static __m256 _mm256_abs_ps(__m256 a) {
+    // One 0x7FFFFFFF word per 32-bit lane. The previous table stored the
+    // constant in 64-bit words, so every odd float lane was ANDed with zero,
+    // and it was read with an aligned load from an unaligned local array.
+    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    return _mm256_and_ps(abs_mask, a);
+  }
+
+  // Generates an unmasked lane-wise unary function <the_name>(a) forwarding
+  // to _mm256_<the_name>_{pd,ps}. abs and recip resolve to the static helpers
+  // defined in this class.
+  // NOTE(review): exp and invsqrt are not defined in the visible part of the
+  // file; presumably supplied by the compiler's math-intrinsic library —
+  // confirm.
+  #define FVEC_UNFN_F(the_name)                                      \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) {       \
+      return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_);              \
+    }
+  FVEC_UNFN_F(abs)
+  FVEC_UNFN_F(exp)
+  FVEC_UNFN_F(invsqrt)
+  FVEC_UNFN_F(recip)
+  FVEC_UNFN_F(sqrt)
+  #undef FVEC_UNFN_F
+
+  // Per lane: b where the mask lane is set, a otherwise.
+  VEC_INLINE static FVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm256_blendv_)(a.val_, b.val_, mask.val_);
+  }
+  // Generates mask_<the_name>(src, mask, a): the unary function is evaluated
+  // on every lane of `a`, then blended so that lanes with the mask set take
+  // the result and the others take `src`.
+  #define FVEC_MASK_UNFN_F(the_name)                                 \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a                                           \
+    ) {                                                              \
+      FVEC_NAME ret = FVEC_SUFFIX(_mm256_##the_name##_)(             \
+							a.val_);     \
+      return mask_blend(mask, src, ret);			     \
+    }
+  FVEC_MASK_UNFN_F(cos)
+  FVEC_MASK_UNFN_F(recip)
+  FVEC_MASK_UNFN_F(sqrt)
+  #undef FVEC_MASK_UNFN_F
+
+  // Generates an unmasked lane-wise binary function <the_name>(a, b)
+  // forwarding to _mm256_<the_name>_{pd,ps}.
+  #define FVEC_BINFN_F(the_name)                                     \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a,         \
+                                         const FVEC_NAME &b) {	     \
+      return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_, b.val_);	     \
+    }
+  FVEC_BINFN_F(max)
+  FVEC_BINFN_F(min)
+  #undef FVEC_BINFN_F
+
+  // Masked arithmetic: mask_NAME(src, mask, a, b) computes a NAME b on all
+  // lanes, then mask_blend keeps src in the lanes where mask is unset.
+  #define FVEC_MASK_BINFN_F(op)                                      \
+    VEC_INLINE static FVEC_NAME mask_##op(                           \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a, const FVEC_NAME &b) {                    \
+      const FVEC_NAME computed =                                     \
+        FVEC_SUFFIX(_mm256_##op##_)(a.val_, b.val_);                 \
+      return mask_blend(mask, src, computed); }
+  FVEC_MASK_BINFN_F(add)
+  FVEC_MASK_BINFN_F(div)
+  FVEC_MASK_BINFN_F(mul)
+  FVEC_MASK_BINFN_F(sub)
+  #undef FVEC_MASK_BINFN_F
+
+  // Expands b under mask (via the _mm256_expand_ helper), then merges:
+  // bits selected by mask come from the expansion, all others from src.
+  VEC_INLINE static FVEC_NAME mask_expand(const FVEC_NAME &src,
+      const BVEC_NAME &mask, const FVEC_NAME &b) {
+    const FVEC_VEC_T expanded = FVEC_SUFFIX(_mm256_expand_)(mask.val_, b.val_);
+    const FVEC_VEC_T picked = FVEC_SUFFIX(_mm256_and_)(mask.val_, expanded);
+    const FVEC_VEC_T kept = FVEC_SUFFIX(_mm256_andnot_)(mask.val_, src.val_);
+    return FVEC_SUFFIX(_mm256_or_)(picked, kept);
+  }
+  // Compresses b under mask by delegating to the _mm256_compress_ helper.
+  // NOTE(review): helper is defined outside this chunk - lane order unverified.
+  VEC_INLINE static FVEC_NAME masku_compress(const BVEC_NAME &mask,
+      const FVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_compress_)(mask.val_, b.val_); }
+
+  // Constant constructors.
+  // set1 broadcasts a into every lane; setzero yields all-zero lanes;
+  // undefined yields a vector whose contents are unspecified.
+  VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) {
+    return FVEC_SUFFIX(_mm256_set1_)(a); }
+  VEC_INLINE static FVEC_NAME setzero() {
+    return FVEC_SUFFIX(_mm256_setzero_)(); }
+  VEC_INLINE static FVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm256_undefined_)(); }
+
+  // Aligned load/store of one full vector; the underlying _mm256_load_ /
+  // _mm256_store_ intrinsics require mem/dest to be 32-byte aligned.
+  VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) {
+    return FVEC_SUFFIX(_mm256_load_)(mem); }
+  VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) {
+    FVEC_SUFFIX(_mm256_store_)(dest, a.val_); }
+
+
+  VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, // returns mem[index of lane l] in each lane l
+    const FVEC_SCAL_T * mem, const int scale) {
+    assert(scale == sizeof(FVEC_SCAL_T)); // only an element-sized stride is supported
+#   if FVEC_LEN==4
+#    ifdef __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 2, 4, 6, 0, 0, 0, 0}; // selects the even 32-bit lanes of idx
+    __m256i m = _mm256_permutevar8x32_epi32(idx.val_, 
+      _mm256_load_si256((__m256i*)mask_shuffle));
+    __m128i idx_short = _mm256_castsi256_si128(m); // low 128 bits now hold the four indices
+    return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx_short, sizeof(FVEC_SCAL_T));
+#    else
+    int idx_buf[8] __attribute__((aligned(32))); // no AVX2 gather here: go through memory
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+    double dest[4] __attribute__((aligned(32)));
+    for (int i = 0; i < 4; i++) {
+      dest[i] = mem[idx_buf[2*i]]; // lane i's index sits in 32-bit slot 2*i
+    }
+    return _mm256_load_pd(dest);
+#    endif
+#   else
+    return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx.val_, sizeof(FVEC_SCAL_T)); // NOTE(review): this path has no non-AVX2 fallback
+#   endif
+  }
+  VEC_INLINE static FVEC_NAME mask_gather( // gather for lanes selected by mask; other lanes keep src
+      const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T)); // only an element-sized stride is supported
+#   if FVEC_LEN==4
+#    ifdef __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 2, 4, 6, 0, 0, 0, 0}; // selects the even 32-bit lanes of idx
+    __m256i m = _mm256_permutevar8x32_epi32(idx.val_, 
+      _mm256_load_si256((__m256i*)mask_shuffle));
+    __m128i idx_short = _mm256_castsi256_si128(m); // low 128 bits now hold the four indices
+    return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx_short, 
+      mask.val_, sizeof(FVEC_SCAL_T));
+#    else
+    int idx_buf[8] __attribute__((aligned(32))); // no AVX2 gather here: go through memory
+    int mask_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+    _mm256_store_pd((double*) mask_buf, mask.val_); // each 64-bit mask lane fills two int slots
+    double dest[4] __attribute__((aligned(32)));
+    _mm256_store_pd((double*) dest, src.val_); // start from src so unselected lanes are preserved
+    for (int i = 0; i < 4; i++) {
+      if (mask_buf[2*i]) // tests the low 32 bits of mask lane i
+        dest[i] = mem[idx_buf[2*i]];
+    }
+    return _mm256_load_pd(dest);
+#    endif
+#   else
+    return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx.val_, 
+      mask.val_, sizeof(FVEC_SCAL_T)); // NOTE(review): this path has no non-AVX2 fallback
+#   endif
+  }
+
+  VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, // out_k = element k of the 4-element group at each lane's index
+      const FVEC_SCAL_T * mem, const int scale, FVEC_NAME * out_0, 
+      FVEC_NAME * out_1, FVEC_NAME * out_2, FVEC_NAME * out_3) {
+    assert(scale == sizeof(FVEC_SCAL_T)); // only an element-sized stride is supported
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_); // spill the indices for scalar addressing
+#   if FVEC_LEN==4
+    __m256d a0 = _mm256_load_pd(&mem[idx_buf[0]]); // aligned load: &mem[index] must be 32-byte aligned
+    __m256d a1 = _mm256_load_pd(&mem[idx_buf[2]]); // lane indices are read from the even int slots
+    __m256d a2 = _mm256_load_pd(&mem[idx_buf[4]]);
+    __m256d a3 = _mm256_load_pd(&mem[idx_buf[6]]);
+    __m256d b0 = _mm256_unpacklo_pd(a0, a1); // {a0[0], a1[0], a0[2], a1[2]}
+    __m256d b1 = _mm256_unpackhi_pd(a0, a1); // {a0[1], a1[1], a0[3], a1[3]}
+    __m256d b2 = _mm256_unpacklo_pd(a2, a3); // {a2[0], a3[0], a2[2], a3[2]}
+    __m256d b3 = _mm256_unpackhi_pd(a2, a3); // {a2[1], a3[1], a2[3], a3[3]}
+    *out_0 = _mm256_permute2f128_pd(b0, b2, 0x20); // low halves:  element 0 of all four groups
+    *out_1 = _mm256_permute2f128_pd(b1, b3, 0x20); // low halves:  element 1
+    *out_2 = _mm256_permute2f128_pd(b0, b2, 0x31); // high halves: element 2
+    *out_3 = _mm256_permute2f128_pd(b1, b3, 0x31); // high halves: element 3
+#   else
+    const float *e0 = &mem[idx_buf[0]]; // start of the group for each of the eight lanes
+    const float *e1 = &mem[idx_buf[1]];
+    const float *e2 = &mem[idx_buf[2]];
+    const float *e3 = &mem[idx_buf[3]];
+    const float *e4 = &mem[idx_buf[4]];
+    const float *e5 = &mem[idx_buf[5]];
+    const float *e6 = &mem[idx_buf[6]];
+    const float *e7 = &mem[idx_buf[7]];
+    __m256 a0 = _mm256_loadu2_m128(e4, e0); // low half = group e0, high half = group e4 (unaligned loads)
+    __m256 a1 = _mm256_loadu2_m128(e5, e1);
+    __m256 b0 = _mm256_unpacklo_ps(a0, a1); // interleaves elements 0 and 1 of the two groups per half
+    __m256 b1 = _mm256_unpackhi_ps(a0, a1); // interleaves elements 2 and 3
+    __m256 a2 = _mm256_loadu2_m128(e6, e2);
+    __m256 a3 = _mm256_loadu2_m128(e7, e3);
+    __m256 b2 = _mm256_unpacklo_ps(a2, a3);
+    __m256 b3 = _mm256_unpackhi_ps(a2, a3);
+    *out_0 = _mm256_shuffle_ps(b0, b2, 0x44); // 0x44 takes the first pair of each operand:  element 0
+    *out_1 = _mm256_shuffle_ps(b0, b2, 0xEE); // 0xEE takes the second pair of each operand: element 1
+    *out_2 = _mm256_shuffle_ps(b1, b3, 0x44); // element 2
+    *out_3 = _mm256_shuffle_ps(b1, b3, 0xEE); // element 3
+#   endif
+  }
+  // Gathers the first three elements of the group at each lane's index into
+  // out_0..out_2.  Implemented on top of gather_4_adjacent; the fourth
+  // element is loaded as well and then discarded.
+  VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale,
+      FVEC_NAME * out_0, FVEC_NAME * out_1, FVEC_NAME * out_2) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    FVEC_NAME discarded;
+    gather_4_adjacent(idx, mem, scale, out_0, out_1, out_2, &discarded);
+  }
+
+  // Horizontal sum of the four doubles in a.
+  VEC_INLINE static double _mm256_reduce_add_pd(__m256d a) {
+    const __m256d pairs = _mm256_hadd_pd(a, a);  // {a0+a1, a0+a1, a2+a3, a2+a3}
+    const __m128d upper = _mm256_extractf128_pd(pairs, 1);
+    return _mm_cvtsd_f64(_mm_add_pd(upper, _mm256_castpd256_pd128(pairs)));
+  }
+
+  // Horizontal sum of the eight floats in a.
+  VEC_INLINE static float _mm256_reduce_add_ps(__m256 a) {
+    const __m256 pairs = _mm256_hadd_ps(a, a);   // adjacent pairs per 128-bit half
+    const __m128 upper = _mm256_extractf128_ps(pairs, 1);
+    const __m128 quads = _mm_add_ps(upper, _mm256_castps256_ps128(pairs));
+    // 0x1B reverses the four lanes so lane 0 meets the other pair sum.
+    return _mm_cvtss_f32(_mm_add_ps(quads, _mm_permute_ps(quads, 0x1B)));
+  }
+
+  // reduce_add sums all lanes of a; mask_reduce_add first ANDs a with mask,
+  // so lanes whose mask bits are all zero contribute 0 to the sum.
+  VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm256_reduce_add_)(a.val_); }
+  VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask,
+      const FVEC_NAME &a) {
+    return reduce_add(FVEC_SUFFIX(_mm256_and_)(mask.val_, a.val_)); }
+
+  VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) { // reinterprets the bits of a as 32-bit ints (no numeric conversion)
+#   if FVEC_LEN==4
+#    if __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 0, 2, 2, 4, 4, 6, 6}; // copies the low 32 bits of each 64-bit lane into both halves of that lane
+    __m256 m = _mm256_permutevar8x32_ps(_mm256_castpd_ps(a.val_),
+      _mm256_load_si256((__m256i*)mask_shuffle));
+    return _mm256_castps_si256(m);
+#    else
+    __m128i a_lo = _mm256_castsi256_si128(_mm256_castpd_si256(a.val_)); // without AVX2: shuffle each 128-bit half separately
+    __m128i a_hi = _mm256_extractf128_si256(_mm256_castpd_si256(a.val_), 1);
+    __m128i c_lo = _mm_shuffle_epi32(a_lo, 0xA0); /*1010 0000*/ // i.e. source lanes {0, 0, 2, 2}
+    __m128i c_hi = _mm_shuffle_epi32(a_hi, 0xA0);
+    __m256i ret = _mm256_setr_m128i(c_lo, c_hi);
+    return ret;
+#    endif
+#   else
+    return _mm256_castps_si256(a.val_); // float lanes are already 32 bits wide: plain bit cast
+#   endif
+  }
+
+  // Lanes with mask set: returns sin(arg) and stores cos(arg) in *cos;
+  // other lanes return src_a and store src_b.
+  VEC_INLINE static FVEC_NAME mask_sincos(FVEC_NAME * cos,
+      const FVEC_NAME &src_a, const FVEC_NAME &src_b,
+      const BVEC_NAME &mask, const FVEC_NAME &arg) {
+    FVEC_VEC_T cosine, sine = FVEC_SUFFIX(_mm256_sincos_)(&cosine, arg.val_);
+    *cos = mask_blend(mask, src_b, cosine);
+    return mask_blend(mask, src_a, sine); }
+
+  // Lane-wise arithmetic operators: (*this) SYM b maps to the NAME intrinsic.
+  #define FVEC_BINOP(sym, intrin)                                            \
+    VEC_INLINE inline FVEC_NAME operator sym(const FVEC_NAME &b) const {     \
+      return FVEC_SUFFIX(_mm256_##intrin##_)(val_, b.val_); }
+  FVEC_BINOP(+, add)
+  FVEC_BINOP(-, sub)
+  FVEC_BINOP(*, mul)
+  FVEC_BINOP(/, div)
+  #undef FVEC_BINOP
+
+  // Prefetch hook for a later gather; this implementation does nothing.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    /* NOP */ }
+};
+
+class AVEC_NAME {
+  friend class avec8pd;
+  FVEC_VEC_T val_;
+  VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {}
+public:
+  VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {}
+  VEC_INLINE static AVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm256_undefined_)();
+  }
+  VEC_INLINE static AVEC_NAME mask_gather(
+      const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    return FVEC_NAME::mask_gather(src.val_, mask, idx, mem, scale);
+  }
+  VEC_INLINE static void mask_i32loscatter(
+      FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const AVEC_NAME &a, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    for (int l = 0; l < FVEC_NAME::VL; l++) {
+      if (BVEC_NAME::test_at(mask, l))
+        mem[IVEC_NAME::at(idx, l)] = FVEC_NAME::at(a.val_, l);
+    }
+  }
+
+  #define AVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \
+    return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_);            \
+  }
+  AVEC_BINOP(-, sub)
+  #undef AVEC_BINOP
+};
+
+#if FVEC_LEN==8
+// Double-precision accumulator for the 8-lane float vector: the eight
+// values are held as two __m256d halves (lo_ = lanes 0-3, hi_ = lanes 4-7).
+class avec8pd {
+  __m256d lo_, hi_;
+  VEC_INLINE avec8pd(const __m256d &lo, const __m256d &hi) : lo_(lo), hi_(hi) {}
+  // Upper / lower 128-bit half of a 256-bit float or integer vector.
+  VEC_INLINE static __m128 get_ps_hi(__m256 a) {
+    return _mm256_extractf128_ps(a, 1); }
+  VEC_INLINE static __m128 get_ps_lo(__m256 a) {
+    return _mm256_castps256_ps128(a); }
+  VEC_INLINE static __m128i get_si_hi(__m256i a) {
+    return _mm_castps_si128(get_ps_hi(_mm256_castsi256_ps(a))); }
+  VEC_INLINE static __m128i get_si_lo(__m256i a) {
+    return _mm_castps_si128(get_ps_lo(_mm256_castsi256_ps(a))); }
+public:
+  // Widens the eight floats of a to doubles.
+  VEC_INLINE avec8pd(const FVEC_NAME &a) {
+    lo_ = _mm256_cvtps_pd(get_ps_lo(a.val_));
+    hi_ = _mm256_cvtps_pd(get_ps_hi(a.val_));
+  }
+  VEC_INLINE static avec8pd undefined() {
+    return avec8pd(_mm256_undefined_pd(), _mm256_undefined_pd()); }
+  // Masked gather: lane l becomes mem[idx[l]] where the sign bit of mask
+  // lane l is set and keeps src otherwise.  scale must be sizeof(double).
+  VEC_INLINE static avec8pd mask_gather(
+      const avec8pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const double * mem, const int scale
+  ) {
+    assert(scale == sizeof(double));  // enforced on both code paths
+#   ifndef __AVX2__  // no gather instruction: patch lanes through memory
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+    int mask_val = _mm256_movemask_ps(mask.val_);
+    double ret_buf[8] __attribute__((aligned(32)));
+    _mm256_store_pd(&ret_buf[0], src.lo_);
+    _mm256_store_pd(&ret_buf[4], src.hi_);
+    for (int i = 0; i < 8; i++) {
+      if (mask_val & (1 << i)) {
+        ret_buf[i] = mem[idx_buf[i]];
+      }
+    }
+    __m256d lo = _mm256_load_pd(&ret_buf[0]);
+    __m256d hi = _mm256_load_pd(&ret_buf[4]);
+#   else  // widen each 32-bit mask lane to the 64-bit lane it guards
+    static const unsigned int lo_shuffle[8] __attribute__((aligned(32))) =
+      {0, 0, 1, 1, 2, 2, 3, 3};
+    static const unsigned int hi_shuffle[8] __attribute__((aligned(32))) =
+      {4, 4, 5, 5, 6, 6, 7, 7};
+    __m256d lo_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_,
+      _mm256_load_si256((__m256i*) lo_shuffle)));
+    __m256d hi_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_,
+      _mm256_load_si256((__m256i*) hi_shuffle)));
+    __m256d lo = _mm256_mask_i32gather_pd(src.lo_, mem, get_si_lo(idx.val_),
+                                          lo_mask, sizeof(double));
+    __m256d hi = _mm256_mask_i32gather_pd(src.hi_, mem, get_si_hi(idx.val_),
+                                          hi_mask, sizeof(double));
+#   endif
+    return avec8pd(lo, hi);
+  }
+  // Lane-by-lane scatter of a to mem[idx[i]] for lanes with mask sign bit set.
+  VEC_INLINE static void mask_i32loscatter(
+      double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const avec8pd &a, const int scale
+  ) {
+    assert(scale == sizeof(double));
+    double a_buf[8] __attribute__((aligned(32)));
+    _mm256_store_pd(a_buf, a.lo_);
+    _mm256_store_pd(&a_buf[4], a.hi_);
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*)idx_buf, idx.val_);
+    int mask_val = _mm256_movemask_ps(mask.val_);
+    for (int i = 0; i < 8; i++)
+      if (mask_val & (1 << i)) mem[idx_buf[i]] = a_buf[i];
+  }
+  #define AVEC2_BINOP(the_sym, the_name)                                    \
+    VEC_INLINE inline avec8pd operator the_sym(const avec8pd &b) const {    \
+    __m256d lo = _mm256_##the_name##_pd(this->lo_, b.lo_);                  \
+    __m256d hi = _mm256_##the_name##_pd(this->hi_, b.hi_);                  \
+    return avec8pd(lo, hi);                                                 \
+  }
+  AVEC2_BINOP(-, sub)
+  #undef AVEC2_BINOP
+};
+#endif
+
+}
+
+
+#ifdef FVEC_FIRST_PASS
+
+template<typename flt_t, typename acc_t> // maps a (value type, accumulator type) pair to the mm256 vector classes
+struct intr_types;
+// double values, double accumulator: 4-lane classes
+template<>
+struct intr_types<double,double> {
+  typedef mm256::fvec4pd fvec;
+  typedef mm256::ivec4 ivec;
+  typedef mm256::bvec4 bvec;
+  typedef mm256::avec4pd avec;
+};
+// float values, float accumulator: 8-lane classes
+template<>
+struct intr_types<float,float> {
+  typedef mm256::fvec8ps fvec;
+  typedef mm256::ivec8 ivec;
+  typedef mm256::bvec8 bvec;
+  typedef mm256::avec8ps avec;
+};
+// float values, double accumulator: 8-lane classes with the avec8pd accumulator
+template<>
+struct intr_types<float,double> {
+  typedef mm256::fvec8ps fvec;
+  typedef mm256::ivec8 ivec;
+  typedef mm256::bvec8 bvec;
+  typedef mm256::avec8pd avec;
+};
+
+#endif
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_FIRST_PASS
+#  include "intel_intrinsics_airebo.h"
+#endif
+
+#endif
+
+#ifdef LMP_INTEL_AIREBO_SCALAR
+
+#include <cassert>
+#include <cmath>
+#include <immintrin.h>
+
+#define VEC_INLINE __attribute__((always_inline))
+
+template<typename flt_t, typename acc_t>
+struct intr_types {
+
+class fvec;
+class ivec;
+class avec;
+// One-lane stand-in for a SIMD mask register: the whole mask is one bool.
+class bvec {
+  friend class fvec;
+  friend class ivec;
+  friend class avec;
+  bool val_;
+  VEC_INLINE bvec(const bool &v) : val_(v) {}
+public:
+  VEC_INLINE bvec() {}  // leaves val_ uninitialized
+
+  // Mask algebra on the single lane.
+  VEC_INLINE static bvec kand(const bvec &a, const bvec &b) {
+    return a.val_ && b.val_; }
+  // (NOT a) AND b.
+  VEC_INLINE static bvec kandn(const bvec &a, const bvec &b) {
+    return !a.val_ && b.val_; }
+  VEC_INLINE static bvec knot(const bvec &a) {
+    return !a.val_; }
+  // Non-zero iff neither a nor b has its lane set.
+  VEC_INLINE static int kortestz(const bvec &a, const bvec &b) {
+    return !(a.val_ || b.val_); }
+  // Compress: the lane of a if mask is set, otherwise a cleared lane.
+  VEC_INLINE static bvec masku_compress(const bvec &mask, const bvec &a) {
+    return mask.val_ && a.val_; }
+  // Expand: the lane of a if mask is set, otherwise the lane of src.
+  VEC_INLINE static bvec mask_expand(const bvec &src, const bvec &mask,
+                                     const bvec &a) {
+    return mask.val_ ? a.val_ : src.val_; }
+
+  // Constant masks.
+  VEC_INLINE static bvec full() {
+    return true; }
+  VEC_INLINE static bvec empty() {
+    return false; }
+  // only(n): set exactly when n == 1.  after(n): set exactly when n == 0.
+  VEC_INLINE static bvec only(int n) {
+    return n == 1; }
+  VEC_INLINE static bvec after(int n) {
+    return n == 0; }
+  // Set exactly when after == 0 and only == 1.
+  VEC_INLINE static bvec onlyafter(int only, int after) {
+    return after == 0 && only == 1; }
+
+  // Number of set lanes: 0 or 1.
+  VEC_INLINE static int popcnt(const bvec &a) {
+    return static_cast<int>(a.val_); }
+
+  // Lane queries.
+  VEC_INLINE static bool test_all_unset(const bvec &a) {
+    return kortestz(a, a); }
+  VEC_INLINE static bool test_any_set(const bvec &a) {
+    return !test_all_unset(a); }
+  // Value of lane i; only lane 0 exists.
+  VEC_INLINE static bool test_at(const bvec &a, int i) {
+    assert(i < 1);
+    return a.val_; }
+
+  // Operator spellings of the mask algebra.
+  VEC_INLINE bvec operator &(const bvec &b) const {
+    return val_ && b.val_; }
+  VEC_INLINE bvec operator |(const bvec &b) const {
+    return val_ || b.val_; }
+  VEC_INLINE bvec operator ~() const {
+    return !val_; }
+};
+
+// One-lane stand-in for a SIMD integer vector: the single lane is an int.
+// Masks are bvec values.
+class ivec {
+  friend class fvec;
+  friend class avec;
+  int val_;
+  VEC_INLINE ivec(const int &v) : val_(v) {}
+public:
+  static const int VL = 1;  // lanes per vector
+  VEC_INLINE ivec() {}      // leaves val_ uninitialized
+
+  // Comparisons.  NAME(a, b) yields the mask (a OP b); mask_NAME
+  // additionally requires the incoming mask to be set.
+  #define IVEC_MASK_BINFN_B(fn, op)                                  \
+    VEC_INLINE static bvec fn(const ivec &a, const ivec &b) {        \
+      return a.val_ op b.val_;                                       \
+    }                                                                \
+    VEC_INLINE static bvec mask_##fn(const bvec &mask,               \
+                                     const ivec &a, const ivec &b) { \
+      return mask.val_ && (a.val_ op b.val_);                        \
+    }
+  IVEC_MASK_BINFN_B(cmpeq, ==)
+  IVEC_MASK_BINFN_B(cmplt, <)
+  IVEC_MASK_BINFN_B(cmpneq, !=)
+  IVEC_MASK_BINFN_B(cmpgt, >)
+
+  // Masked arithmetic: a OP b where mask is set, src otherwise.
+  #define IVEC_MASK_BINFN_I(fn, op)                                  \
+    VEC_INLINE static ivec mask_##fn(const ivec &src,                \
+        const bvec &mask, const ivec &a, const ivec &b) {            \
+      return mask.val_ ? a.val_ op b.val_ : src.val_;                \
+    }
+  IVEC_MASK_BINFN_I(add, +)
+
+  // Select: b where mask is set, a otherwise.
+  VEC_INLINE static ivec mask_blend(const bvec &mask, const ivec &a,
+                                    const ivec &b) {
+    return mask.val_ ? b.val_ : a.val_;
+  }
+
+  // Plain arithmetic.  NOTE(review): srlv uses >> on a signed int, so the
+  // result for negative a or out-of-range b is whatever the compiler's
+  // signed shift gives - confirm callers never rely on a logical shift.
+  #define IVEC_BINFN_I(fn, op)                                       \
+    VEC_INLINE static ivec fn(const ivec &a, const ivec &b) {        \
+      return a.val_ op b.val_; }
+  IVEC_BINFN_I(mullo, *)
+  IVEC_BINFN_I(srlv, >>)
+  // Bitwise AND.
+  VEC_INLINE static ivec the_and(const ivec &a, const ivec &b) {
+    return a.val_ & b.val_; }
+
+  // Expand: b where a (the mask) is set, src otherwise.
+  VEC_INLINE static ivec mask_expand(const ivec &src, const bvec &a,
+                                     const ivec &b) {
+    return a.val_ ? b.val_ : src.val_;
+  }
+  // Compress: b where a (the mask) is set, 0 otherwise.
+  VEC_INLINE static ivec masku_compress(const bvec &a, const ivec &b) {
+    return a.val_ ? b.val_ : 0;
+  }
+
+  // Value of lane b; only lane 0 exists.
+  VEC_INLINE static int at(const ivec &a, int b) {
+    assert(b == 0);
+    return a.val_;
+  }
+
+  // Memory access.  The masked loads do not read *src when mask is clear:
+  // mask_loadu then yields the 0xDEAD marker, maskz_loadu yields 0.
+  VEC_INLINE static ivec load(const int * src) {
+    return *src; }
+  VEC_INLINE static ivec mask_loadu(const bvec &mask, const int * src) {
+    return mask.val_ ? *src : 0xDEAD; }
+  VEC_INLINE static ivec maskz_loadu(const bvec &mask, const int * src) {
+    return mask.val_ ? *src : 0; }
+  VEC_INLINE static void mask_storeu(const bvec &mask, int * dest,
+                                     const ivec &src) {
+    if (mask.val_) *dest = src.val_;
+  }
+  VEC_INLINE static void store(int * dest, const ivec &src) {
+    *dest = src.val_; }
+
+  // Gather/scatter of one int at byte offset scale * idx from mem; memory
+  // is untouched when mask is clear (the gather then returns src).
+  VEC_INLINE static ivec mask_gather(const ivec &src, const bvec &mask,
+      const ivec &idx, const int * mem, const int scale) {
+    if (!mask.val_) return src.val_;
+    const char *base = reinterpret_cast<const char*>(mem);
+    return *reinterpret_cast<const int *>(base + scale * idx.val_);
+  }
+  VEC_INLINE static void mask_i32scatter(int * mem, const bvec &mask,
+      const ivec &idx, const ivec &a, const int scale) {
+    if (!mask.val_) return;
+    char *base = reinterpret_cast<char*>(mem);
+    *reinterpret_cast<int *>(base + scale * idx.val_) = a.val_;
+  }
+
+  // With one lane, a compressing store is just a masked store.
+  VEC_INLINE static void mask_compressstore(const bvec &mask, int * dest,
+                                            const ivec &src) {
+    if (mask.val_) *dest = src.val_;
+  }
+
+  // Constructors.  set() accepts sixteen values and keeps only the last
+  // argument, i0; undefined() returns the 0xDEAD marker.
+  VEC_INLINE static ivec set(
+      int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+      int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) {
+    return i0;
+  }
+  VEC_INLINE static ivec set1(int i) {
+    return i; }
+  VEC_INLINE static ivec setzero() {
+    return 0; }
+  VEC_INLINE static ivec undefined() {
+    return 0xDEAD; }
+
+  // Lane-wise addition.
+  VEC_INLINE ivec operator +(const ivec &b) const {
+    return val_ + b.val_; }
+};
+
+// One-lane stand-in for a SIMD floating-point vector: the lane is a flt_t.
+class fvec {
+  friend class avec;
+  flt_t val_;
+  VEC_INLINE fvec(const flt_t &v) : val_(v) {}
+public:
+  static const int VL = 1;  // lanes per vector
+  VEC_INLINE fvec() {}      // leaves val_ uninitialized
+
+  // Value of lane i; only lane 0 exists.
+  VEC_INLINE static flt_t at(const fvec &a, int i) {
+    assert(i < 1);
+    return a.val_;
+  }
+  VEC_INLINE static bool fast_compress() { return false; }
+
+  // Comparisons.  NAME(a, b) yields the mask (a OP b); mask_NAME
+  // additionally requires the incoming mask to be set.
+  // NOTE(review): cmpnle / cmpnlt are spelled > and >=, which are false for
+  // NaN operands - confirm this matches the vector back ends.
+  #define FVEC_MASK_BINFN_B(fn, op)                                  \
+    VEC_INLINE static bvec fn(const fvec &a, const fvec &b) {        \
+      return a.val_ op b.val_;                                       \
+    }                                                                \
+    VEC_INLINE static bvec mask_##fn(const bvec &mask,               \
+                                     const fvec &a, const fvec &b) { \
+      return mask.val_ && (a.val_ op b.val_);                        \
+    }
+  FVEC_MASK_BINFN_B(cmple, <=)
+  FVEC_MASK_BINFN_B(cmplt, <)
+  FVEC_MASK_BINFN_B(cmpneq, !=)
+  FVEC_MASK_BINFN_B(cmpnle, >)
+  FVEC_MASK_BINFN_B(cmpnlt, >=)
+
+  // Unary math: NAME(a) = FN(a).  FN is pasted textually in front of
+  // "(a.val_)", which is why "1/" and "1/std::sqrt" work as FN.
+  #define FVEC_UNFN_F(fn, impl)                                      \
+    VEC_INLINE static fvec fn(const fvec &a) {                       \
+      return impl(a.val_); }
+  FVEC_UNFN_F(abs, fabs)
+  FVEC_UNFN_F(exp, ::exp)
+  FVEC_UNFN_F(invsqrt, 1/std::sqrt)
+  FVEC_UNFN_F(recip, 1/)
+  FVEC_UNFN_F(sqrt, std::sqrt)
+
+  // Masked unary math: FN(a) where mask is set, src otherwise.  FN is
+  // only evaluated when mask is set.
+  #define FVEC_MASK_UNFN_F(fn, impl)                                 \
+    VEC_INLINE static fvec mask_##fn(const fvec &src,                \
+        const bvec &mask, const fvec &a) {                           \
+      return mask.val_ ? impl(a.val_) : src.val_;                    \
+    }
+  FVEC_MASK_UNFN_F(cos, std::cos)
+  FVEC_MASK_UNFN_F(recip, 1/)
+  FVEC_MASK_UNFN_F(sqrt, std::sqrt)
+
+  // Binary math: NAME(a, b) = FN(a, b).
+  #define FVEC_BINFN_F(fn, impl)                                     \
+    VEC_INLINE static fvec fn(const fvec &a, const fvec &b) {        \
+      return impl(a.val_, b.val_); }
+  FVEC_BINFN_F(max, ::fmax)
+  FVEC_BINFN_F(min, ::fmin)
+
+  // Masked arithmetic: a OP b where mask is set, src otherwise.
+  #define FVEC_MASK_BINFN_F(fn, op)                                  \
+    VEC_INLINE static fvec mask_##fn(const fvec &src,                \
+        const bvec &mask, const fvec &a, const fvec &b) {            \
+      return mask.val_ ? a.val_ op b.val_ : src.val_;                \
+    }
+  FVEC_MASK_BINFN_F(add, +)
+  FVEC_MASK_BINFN_F(div, /)
+  FVEC_MASK_BINFN_F(mul, *)
+  FVEC_MASK_BINFN_F(sub, -)
+  // Select: b where mask is set, a otherwise.
+  VEC_INLINE static fvec mask_blend(const bvec &mask, const fvec &a,
+                                    const fvec &b) {
+    return mask.val_ ? b.val_ : a.val_;
+  }
+
+  // Expand: b where a (the mask) is set, src otherwise.
+  VEC_INLINE static fvec mask_expand(const fvec &src, const bvec &a,
+                                     const fvec &b) {
+    return a.val_ ? b.val_ : src.val_;
+  }
+  // Compress: b where a (the mask) is set, 0 otherwise.
+  VEC_INLINE static fvec masku_compress(const bvec &a, const fvec &b) {
+    return a.val_ ? b.val_ : 0;
+  }
+
+  // Constructors; undefined() returns the recognizable marker 1337.1337.
+  VEC_INLINE static fvec set1(const flt_t &a) {
+    return a; }
+  VEC_INLINE static fvec setzero() {
+    return 0; }
+  VEC_INLINE static fvec undefined() {
+    return 1337.1337; }
+
+  // Memory access; mask_storeu writes only when mask is set.
+  VEC_INLINE static fvec load(const flt_t *mem) {
+    return *mem; }
+  VEC_INLINE static void mask_storeu(const bvec &mask, flt_t * dest,
+                                     const fvec &a) {
+    if (mask.val_) *dest = a.val_;
+  }
+  VEC_INLINE static void store(flt_t * dest, const fvec &a) {
+    *dest = a.val_; }
+
+  // Gathers read one flt_t at byte offset scale * idx from mem.
+  VEC_INLINE static fvec gather(const ivec &idx, const flt_t * mem,
+                                const int scale) {
+    const char *base = reinterpret_cast<const char*>(mem);
+    return *reinterpret_cast<const flt_t*>(base + scale * idx.val_);
+  }
+  // Masked form: memory is not read and src is returned when mask is clear.
+  VEC_INLINE static fvec mask_gather(const fvec &src, const bvec &mask,
+      const ivec &idx, const flt_t * mem, const int scale) {
+    if (!mask.val_) return src.val_;
+    return gather(idx, mem, scale);
+  }
+
+  // Load 3 (or 4) consecutive elements starting at the gathered position:
+  // out_k receives the element k slots after the one gather() would read.
+  VEC_INLINE static void gather_3_adjacent(const ivec &idx, const flt_t * mem,
+      const int scale, fvec * out_0, fvec * out_1, fvec * out_2) {
+    assert(scale == sizeof(flt_t));
+    fvec * const outs[3] = {out_0, out_1, out_2};
+    for (int k = 0; k < 3; ++k) *outs[k] = gather(idx, mem + k, scale);
+  }
+  VEC_INLINE static void gather_4_adjacent(const ivec &idx, const flt_t * mem,
+      const int scale, fvec * out_0, fvec * out_1, fvec * out_2,
+      fvec * out_3) {
+    assert(scale == sizeof(flt_t));
+    fvec * const outs[4] = {out_0, out_1, out_2, out_3};
+    for (int k = 0; k < 4; ++k) *outs[k] = gather(idx, mem + k, scale);
+  }
+
+  // Horizontal sums over the single lane; a clear mask contributes 0.
+  VEC_INLINE static flt_t mask_reduce_add(const bvec &mask, const fvec &a) {
+    return mask.val_ ? a.val_ : 0; }
+  VEC_INLINE static flt_t reduce_add(const fvec &a) {
+    return a.val_; }
+
+  // Reinterprets the first sizeof(int) bytes of the value as an int (no
+  // numeric conversion).
+  // NOTE(review): this reads a flt_t through an int pointer (type punning).
+  VEC_INLINE static ivec unpackloepi32(const fvec &a) {
+    return reinterpret_cast<const int*>(&a.val_)[0]; }
+
+  // Where mask is set: returns sin(arg) and stores cos(arg) in *cos_out.
+  // Elsewhere: returns src_a and stores src_b.
+  VEC_INLINE static fvec mask_sincos(fvec * cos_out, const fvec &src_a,
+      const fvec &src_b, const bvec &mask, const fvec &arg) {
+    cos_out->val_ = mask.val_ ? ::cos(arg.val_) : src_b.val_;
+    return mask.val_ ? ::sin(arg.val_) : src_a.val_;
+  }
+
+  // Arithmetic operators; the second macro argument is unused here.
+  #define FVEC_BINOP(sym, unused_name)                               \
+    VEC_INLINE inline fvec operator sym(const fvec &b) const {       \
+      return val_ sym b.val_; }
+  FVEC_BINOP(+, add)
+  FVEC_BINOP(-, sub)
+  FVEC_BINOP(*, mul)
+  FVEC_BINOP(/, div)
+
+  // Prefetch hook; does nothing in this scalar implementation.
+  VEC_INLINE static void gather_prefetch0(const ivec &idx, const void * mem) {}
+};
+
+// One-lane accumulator: like fvec, but stores its value as acc_t.
+class avec {
+  acc_t val_;
+  VEC_INLINE avec(const acc_t &a) : val_(a) {}
+public:
+  VEC_INLINE avec(const fvec &a) : val_(a.val_) {}  // converts flt_t to acc_t
+  VEC_INLINE static avec undefined() {
+    return 1337.1337; }
+  // Reads one acc_t at byte offset scale * idx if mask is set, else src.
+  VEC_INLINE static avec mask_gather(const avec &src, const bvec &mask,
+      const ivec &idx, const acc_t * mem, const int scale) {
+    if (!mask.val_) return src.val_;
+    const char *base = reinterpret_cast<const char*>(mem);
+    return *reinterpret_cast<const acc_t*>(base + scale * idx.val_);
+  }
+  // Writes a to byte offset idx * scale from mem when mask is set.
+  VEC_INLINE static void mask_i32loscatter(acc_t * mem, const bvec &mask,
+      const ivec &idx, const avec &a, const int scale) {
+    if (!mask.val_) return;
+    char *base = reinterpret_cast<char*>(mem);
+    *reinterpret_cast<acc_t*>(base + idx.val_ * scale) = a.val_;
+  }
+  #define AVEC_BINOP(sym, unused_name)                               \
+    VEC_INLINE inline avec operator sym(const avec &b) const {       \
+      return val_ sym b.val_; }
+  AVEC_BINOP(-, sub)
+};
+
+};
+
+#endif
diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp
index c5574a78c..3a36ead49 100644
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@@ -1,252 +1,250 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include "nbin_intel.h"
 #include "atom.h"
 #include "group.h"
 #include "domain.h"
 #include "comm.h"
 #include "update.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 NBinIntel::NBinIntel(LAMMPS *lmp) : NBinStandard(lmp) {
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   _fix = static_cast<FixIntel *>(modify->fix[ifix]);
   _precision_mode = _fix->precision();
   _atombin = NULL;
   _binpacked = NULL;
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = _fix->coprocessor_number();
   _offload_alloc = 0;
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 NBinIntel::~NBinIntel() {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_offload_alloc) {
     const int * binhead = this->binhead;
     const int * bins = this->bins;
     const int * _atombin = this->_atombin;
     const int * _binpacked = this->_binpacked;
     #pragma offload_transfer target(mic:_cop)   \
       nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1))
   }
   #endif
 }
 
 /* ----------------------------------------------------------------------
    setup for bin_atoms()
 ------------------------------------------------------------------------- */
 
 void NBinIntel::bin_atoms_setup(int nall)
 {
   // binhead = per-bin vector, mbins in length
   // add 1 bin for USER-INTEL package
 
   if (mbins > maxbin) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_offload_alloc) {
       const int * binhead = this->binhead;
       #pragma offload_transfer target(mic:_cop) \
         nocopy(binhead:alloc_if(0) free_if(1))
     }
     #endif
 
     maxbin = mbins;
     memory->destroy(binhead);
     memory->create(binhead,maxbin+1,"neigh:binhead");
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (_fix->offload_balance() != 0) {
       int * binhead = this->binhead;
       #pragma offload_transfer target(mic:_cop) \
          nocopy(binhead:length(maxbin+1) alloc_if(1) free_if(0))
     }
     #endif
   }
 
   // bins = per-atom vector
 
   if (nall > maxatom) {
     maxatom = nall;
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (_offload_alloc) {
       const int * bins = this->bins;
       const int * _atombin = this->_atombin;
       const int * _binpacked = this->_binpacked;
       #pragma offload_transfer target(mic:_cop) \
         nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
     }
     #endif
     memory->destroy(bins);
     memory->destroy(_atombin);
     memory->destroy(_binpacked);
 
     memory->create(bins,maxatom,"neigh:bins");
     memory->create(_atombin,maxatom,"neigh:bins");
     memory->create(_binpacked,maxatom,"neigh:bins");
     #ifdef _LMP_INTEL_OFFLOAD
     if (_fix->offload_balance() != 0) {
       const int * bins = this->bins;
       const int * _atombin = this->_atombin;
       const int * _binpacked = this->_binpacked;
       #pragma offload_transfer target(mic:_cop) \
         nocopy(bins,_atombin,_binpacked:length(maxatom) alloc_if(1) free_if(0))
       _offload_alloc=1;
     }
     #endif
 
     if (_precision_mode == FixIntel::PREC_MODE_MIXED)
       _fix->get_mixed_buffers()->set_bininfo(_atombin,_binpacked);
     else if (_precision_mode == FixIntel::PREC_MODE_SINGLE)
       _fix->get_single_buffers()->set_bininfo(_atombin,_binpacked);
     else
       _fix->get_double_buffers()->set_bininfo(_atombin,_binpacked);
   }
 }
 
 /* ----------------------------------------------------------------------
    bin owned and ghost atoms
 ------------------------------------------------------------------------- */
 
 void NBinIntel::bin_atoms()
 {
   last_bin = update->ntimestep;
 
   if (_precision_mode == FixIntel::PREC_MODE_MIXED)
     bin_atoms(_fix->get_mixed_buffers());
   else if (_precision_mode == FixIntel::PREC_MODE_SINGLE)
     bin_atoms(_fix->get_single_buffers());
   else
     bin_atoms(_fix->get_double_buffers());
 }
 
 template <class flt_t, class acc_t>
 void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
   const int aend = _fix->offload_end_neighbor();
 
 
   // ---------- Sanity check for padding --------------
   {
     const flt_t dx = (INTEL_BIGP - bboxhi[0]);
     const flt_t dy = (INTEL_BIGP - bboxhi[1]);
     const flt_t dz = (INTEL_BIGP - bboxhi[2]);
     if (dx * dx + dy * dy + dz * dz <
         static_cast<flt_t>(neighbor->cutneighmaxsq))
       error->one(FLERR,
         "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
   }
 
   // ---------- Grow and cast/pack buffers -------------
   _fix->start_watch(TIME_PACK);
   buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
 
   ATOM_T biga;
   biga.x = INTEL_BIGP;
   biga.y = INTEL_BIGP;
   biga.z = INTEL_BIGP;
   biga.w = 1;
   buffers->get_x()[nall] = biga;
 
   int nthreads;
   if (comm->nthreads > INTEL_HTHREADS) nthreads = comm->nthreads;
   else nthreads = 1;
   #if defined(_OPENMP)
   #pragma omp parallel if(nthreads > INTEL_HTHREADS)
   #endif
   {
     int ifrom, ito, tid;
     IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
                               sizeof(ATOM_T));
     buffers->thr_pack(ifrom, ito, 0);
   }
   _fix->stop_watch(TIME_PACK);
 
 
   // ---------- Bin Atoms -------------
   _fix->start_watch(TIME_HOST_NEIGHBOR);
   const ATOM_T * _noalias const x = buffers->get_x();
   int * _noalias const atombin = this->_atombin;
   int * _noalias const binpacked = this->_binpacked;
 
 
   const double sboxlo0 = bboxlo[0] + mbinxlo/bininvx;
   const double sboxlo1 = bboxlo[1] + mbinylo/bininvy;
   const double sboxlo2 = bboxlo[2] + mbinzlo/bininvz;
 
   int i, ibin;
 
   for (i = 0; i < mbins; i++) binhead[i] = -1;
 
   int *mask = atom->mask;
 
   if (includegroup) {
     int bitmask = group->bitmask[includegroup];
     for (i = nall-1; i >= nlocal; i--) {
       if (mask[i] & bitmask) {
         ibin = coord2bin(atom->x[i]);
+	// Only necessary to store when neighboring ghost
+	atombin[i] = ibin;
         bins[i] = binhead[ibin];
         binhead[ibin] = i;
       }
     }
     for (i = atom->nfirst-1; i >= 0; i--) {
       ibin = coord2bin(atom->x[i]);
       atombin[i] = ibin;
       bins[i] = binhead[ibin];
       binhead[ibin] = i;
     }
   } else {
-    for (i = nall-1; i >= nlocal; i--) {
-      ibin = coord2bin(atom->x[i]);
-      bins[i] = binhead[ibin];
-      binhead[ibin] = i;
-    }
-    for (i = nlocal-1; i >= 0; i--) {
+    for (i = nall-1; i >= 0; i--) {
       ibin = coord2bin(atom->x[i]);
-      atombin[i]=ibin;
+      // Only necessary to store for ghost when neighboring ghost
+      atombin[i] = ibin;
       bins[i] = binhead[ibin];
       binhead[ibin] = i;
     }
   }
   int newhead = 0;
   for (i = 0; i < mbins; i++) {
     int j = binhead[i];
     binhead[i] = newhead;
     for ( ; j >= 0; j = bins[j])
       binpacked[newhead++] = j;
   }
   binhead[mbins] = newhead;
 }
 
 /* ---------------------------------------------------------------------- */
 
 bigint NBinIntel::memory_usage()
 {
   return NBinStandard::memory_usage() + maxatom*2*sizeof(int);
 }
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
new file mode 100644
index 000000000..12101712f
--- /dev/null
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@@ -0,0 +1,593 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "npair_full_bin_ghost_intel.h"
+#include "neighbor.h"
+#include "nstencil.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "molecule.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {} // nothing to set up here; construction is delegated to NPairIntel
+
+/* ----------------------------------------------------------------------
+   binned full neighbor list build: every pair is stored for both i and j
+   lists are also built for ghost atoms, but special neighbors are only
+   resolved for owned atoms; work is dispatched to fbi() by fix precision
+------------------------------------------------------------------------- */
+
+void NPairFullBinGhostIntel::build(NeighList *list)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_fix->offload_noghost()) // offload builds only: this style cannot run with the noghost setting
+    error->all(FLERR,
+      "The 'ghost no' option cannot be used with this USER-INTEL pair style.");
+  #endif
+
+  if (nstencil > INTEL_MAX_STENCIL_CHECK) // fbi() keeps stencil ranges in fixed-size arrays
+    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (exclude) // offload builds only: the exclusion filter in fbi() is compiled out there
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED) // pick the buffer set matching the precision mode
+    fbi(list, _fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    fbi(list, _fix->get_double_buffers());
+  else // any other mode falls back to the single precision buffers
+    fbi(list, _fix->get_single_buffers());
+
+  _fix->stop_watch(TIME_HOST_NEIGHBOR); // stops the host neighbor timer; the matching start is not in this function
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t> // driver: sizes the list buffers, then runs the offload pass and the host pass
+void NPairFullBinGhostIntel::fbi(NeighList * list, // list: neighbor list to fill
+				 IntelBuffers<flt_t,acc_t> * buffers) // buffers: precision-specific storage chosen by build()
+{
+  const int nlocal = atom->nlocal;
+  const int nall = atom->nlocal + atom->nghost;
+  list->inum = atom->nlocal; // the list covers all owned atoms ...
+  list->gnum = atom->nghost; // ... plus all ghost atoms
+
+  int host_start = _fix->host_start_neighbor();
+  const int off_end = _fix->offload_end_neighbor();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_end) grow_stencil();
+  if (_fix->full_host_list()) host_start = 0;
+  int offload_noghost = _fix->offload_noghost(); // NOTE(review): not referenced again in this function
+  #endif
+
+  // only uses offload_end_neighbor to check whether we are doing offloading
+  // at all, no need to correct this later
+  buffers->grow_list(list, nall, comm->nthreads, off_end,
+		     _fix->nbor_pack_width());
+
+  int need_ic = 0; // set by dminimum_image_check; only evaluated for molecular systems
+  if (atom->molecular)
+    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
+			 neighbor->cutneighmax);
+
+  if (need_ic) { // need_ic is a runtime value, so select the matching template instantiation
+    fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end); // offload pass; returns at once when off_end is 0
+    fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal); // host pass; returns at once when host_start equals nlocal
+  } else {
+    fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
+    fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int need_ic> // builds the lists for one pass; need_ic != 0 enables the minimum image marking
+void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, // offload: nonzero selects the coprocessor pass
+				 IntelBuffers<flt_t,acc_t> * buffers,
+				 const int pstart, const int pend) { // pstart/pend are only used for the empty-range test below
+  if (pend-pstart == 0) return;
+
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1; // NOTE(review): not referenced below
+  int nall_t = nall;
+  const int aend = nall; // lists are built for every atom, owned and ghost
+
+  const int pack_width = _fix->nbor_pack_width(); // NOTE(review): not referenced below
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL;
+  tagint *s = NULL;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = e_nall;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    special_size = aend;
+  } else { // non-molecular: point at the holder members of buffers and transfer zero length
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    special_size = 0;
+  }
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const tagint * _noalias const tag = atom->tag;
+
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = this->nstencil;
+  const int * _noalias const stencil = this->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
+  const flt_t * _noalias const cutneighghostsq = // cutoff table applied when atom i is a ghost
+    buffers->get_cutneighghostsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask; // mask and molecule are only passed to exclusion() below
+  tagint * const molecule = atom->molecule;
+  #endif
+
+  int *molindex = atom->molindex;
+  int *molatom = atom->molatom;
+  Molecule **onemols = atom->avec->onemols;
+  int moltemplate;
+  if (molecular == 2) moltemplate = 1; // molecular == 2 is treated as the template case, which is rejected just below
+  else moltemplate = 0;
+  if (moltemplate) // hard error rather than a fallback
+    error->all(FLERR, 
+	       "Can't use moltemplate with npair style full/bin/ghost/intel.");
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    timer_compute = _fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = _fix->get_off_overflow_flag();
+    _fix->stop_watch(TIME_HOST_NEIGHBOR);
+    _fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else
+  #endif
+  {
+    tnum = comm->nthreads;
+    overflow = _fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+  int * _noalias const atombin = buffers->get_atombin(); // read for the bin of i, then overwritten per atom below
+  const int * _noalias const binpacked = buffers->get_binpacked();
+
+  const int xperiodic = domain->xperiodic;
+  const int yperiodic = domain->yperiodic;
+  const int zperiodic = domain->zperiodic;
+  const flt_t xprd_half = domain->xprd_half;
+  const flt_t yprd_half = domain->yprd_half;
+  const flt_t zprd_half = domain->zprd_half;
+
+  flt_t * _noalias const ncachex = buffers->get_ncachex();
+  flt_t * _noalias const ncachey = buffers->get_ncachey();
+  flt_t * _noalias const ncachez = buffers->get_ncachez();
+  int * _noalias const ncachej = buffers->get_ncachej();
+  int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  int * _noalias const ncachetag = buffers->get_ncachetag();
+  const int ncache_stride = buffers->ncache_stride();
+
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * const stencilxyz = &this->stencilxyz[0][0]; // flattened view, indexed as 3*k + {0,1,2} below
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const bins = this->bins;
+  const int cop = _fix->coprocessor_number();
+  const int separate_buffers = _fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(atombin:length(aend) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
+    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \
+    in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
+    in(stencilxyz:length(3*nstencil)) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(tag)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = 0;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    int nstencilp = 0;
+    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
+    for (int k = 0; k < nstencil; k++) { // merge runs of consecutive stencil offsets into [binstart, binend) ranges
+      binstart[nstencilp] = stencil[k];
+      int end = stencil[k] + 1;
+      for (int kk = k + 1; kk < nstencil; kk++) {
+        if (stencil[kk-1]+1 == stencil[kk]) {
+          end++;
+          k++;
+        } else break;
+      }
+      binend[nstencilp] = end;
+      nstencilp++;
+    }
+
+    const int mbinyx = mbiny * mbinx;
+
+    #if defined(_OPENMP)
+    #pragma omp parallel
+    #endif
+    {
+      const int num = aend;
+      int tid, ifrom, ito;
+
+      const double balance_factor = 2.0; // each owned atom counts as two work units, each ghost as one
+      const double ibalance_factor = 1.0 / balance_factor;
+      const int gnum = num - nlocal;
+      const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
+      const int snum = wlocal + gnum;
+      IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads);
+      if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom); // map weighted range bounds back to atom indices
+      else ifrom -= wlocal - nlocal;
+      if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
+      else ito -= wlocal - nlocal;
+
+      int e_ito = ito;
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors; // upper bound used by the overflow check below
+
+      int which;
+
+      int pack_offset = maxnbors;
+      int ct = (ifrom + tid * 2) * maxnbors; // running offset of this thread into firstneigh
+      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2;
+
+      const int toffs = tid * ncache_stride; // each thread uses its own stretch of the cache arrays
+      flt_t * _noalias const tx = ncachex + toffs;
+      flt_t * _noalias const ty = ncachey + toffs;
+      flt_t * _noalias const tz = ncachez + toffs;
+      int * _noalias const tj = ncachej + toffs;
+      int * _noalias const tjtype = ncachejtype + toffs;
+      int * _noalias const ttag = ncachetag + toffs;
+
+      // loop over all atoms in other bins in stencil, store every pair
+      int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; // oldbin starts at a sentinel so the first atom fills the cache
+      for (int i = ifrom; i < ito; i++) {
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
+        const tagint itag = tag[i];
+        const int ioffset = ntypes * itype;
+
+        const int ibin = atombin[i];
+        if (ibin != oldbin) { // the candidate cache is rebuilt only when the bin changes
+          oldbin = ibin;
+          ncount = 0;
+	  if (i < nlocal) { // owned atom: gather candidates from the merged stencil ranges
+	    for (int k = 0; k < nstencilp; k++) {
+	      const int bstart = binhead[ibin + binstart[k]];
+	      const int bend = binhead[ibin + binend[k]];
+              #if defined(LMP_SIMD_COMPILER)
+              #pragma vector aligned
+              #pragma simd
+              #endif
+              for (int jj = bstart; jj < bend; jj++)
+                tj[ncount++] = binpacked[jj];
+	    }
+	  } else { // ghost atom: visit stencil bins one by one and skip those outside the bin grid
+	    const int zbin = ibin / mbinyx;
+	    const int zrem = ibin % mbinyx;
+	    const int ybin = zrem / mbinx;
+	    const int xbin = zrem % mbinx;
+	    for (int k = 0; k < nstencil; k++) {
+	      const int xbin2 = xbin + stencilxyz[3 * k + 0];
+	      const int ybin2 = ybin + stencilxyz[3 * k + 1];
+	      const int zbin2 = zbin + stencilxyz[3 * k + 2];
+	      if (xbin2 < 0 || xbin2 >= mbinx ||
+                  ybin2 < 0 || ybin2 >= mbiny ||
+                  zbin2 < 0 || zbin2 >= mbinz) continue;
+
+	      const int bstart = binhead[ibin + stencil[k]];
+	      const int bend = binhead[ibin + stencil[k] + 1];
+              #if defined(LMP_SIMD_COMPILER)
+              #pragma vector aligned
+              #pragma simd
+              #endif
+              for (int jj = bstart; jj < bend; jj++)
+                tj[ncount++] = binpacked[jj];
+	    }
+	  } // if i < nlocal
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int u = 0; u < ncount; u++) { // copy position, type and tag of each candidate into the thread cache
+            const int j = tj[u];
+            tx[u] = x[j].x;
+            ty[u] = x[j].y;
+            tz[u] = x[j].z;
+            tjtype[u] = x[j].w;
+	    ttag[u] = tag[j];
+          }
+	} // if ibin != oldbin
+
+        // ---------------------- Loop over other bins
+
+        int n = maxnbors; // first sublist is written from index maxnbors
+        int n2 = n * 2; // second sublist is written from index 2*maxnbors
+	int *neighptr2 = neighptr; // same storage as neighptr, only the write index differs
+	const flt_t * _noalias cutsq;
+	if (i < nlocal) cutsq = cutneighsq;
+	else cutsq = cutneighghostsq;
+
+	const int icp = i;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int u = 0; u < ncount; u++) {
+          int addme = 1;
+          int j = tj[u];
+
+	  if (i == j) addme = 0; // never list an atom as its own neighbor
+
+          // Cutoff Check
+          const flt_t delx = xtmp - tx[u];
+          const flt_t dely = ytmp - ty[u];
+          const flt_t delz = ztmp - tz[u];
+          const int jtype = tjtype[u];
+	  const int jtag = ttag[u];
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutsq[ioffset + jtype]) addme = 0;
+
+          if (need_ic && icp < nlocal) { // owned atoms only: store j as -j-1 when the check sets no_special
+            int no_special;
+	    ominimum_image_check(no_special, delx, dely, delz);
+            if (no_special)
+              j = -j - 1;
+          }
+
+	  int flist = 0; // picks the sublist for j: tag order with parity of the tag sum, coordinates when tags are equal
+	  if (itag > jtag) {
+	    if (((itag+jtag) & 1) == 0) flist = 1;
+	  } else if (itag < jtag) {
+	    if (((itag+jtag) & 1) == 1) flist = 1;
+	  } else {
+	    if (tz[u] < ztmp) flist = 1;
+	    else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
+	    else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
+	      flist = 1;
+	  }
+	  if (addme) {
+	    if (flist)
+	      neighptr2[n2++] = j;
+	    else
+	      neighptr[n++] = j;
+	  }
+        } // for u
+
+        #ifndef _LMP_INTEL_OFFLOAD
+        if (exclude) { // filter both sublists in place through exclusion()
+          int alln = n;
+          n = maxnbors;
+          for (int u = pack_offset; u < alln; u++) {
+            const int j = neighptr[u];
+            int pj = j;
+            if (need_ic)
+              if (pj < 0) pj = -j - 1; // decode the -j-1 marking before indexing x
+            const int jtype = x[pj].w;
+            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+            neighptr[n++] = j;
+          }
+	  alln = n2;
+	  n2 = maxnbors * 2;
+	  for (int u = n2; u < alln; u++) {
+	    const int j = neighptr[u];
+	    int pj = j;
+	    if (need_ic)
+	      if (pj < 0) pj = -j - 1;
+	    const int jtype = x[pj].w;
+	    if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+	    neighptr[n2++] = j;
+	  }
+        }
+        #endif
+        int ns = n - maxnbors;
+	int alln = n;
+	atombin[i] = ns; // atombin is reused to hold the size of the first sublist
+	n = 0;
+	for (int u = maxnbors; u < alln; u++) // pack the first sublist to the front of the storage of atom i ...
+          neighptr[n++] = neighptr[u];
+	ns += n2 - maxnbors * 2;
+	for (int u = maxnbors * 2; u < n2; u++) // ... followed directly by the second sublist
+          neighptr[n++] = neighptr[u];
+	if (ns > maxnbors) *overflow = 1;
+
+        ilist[i] = i;
+        cnumneigh[i] = ct;
+        numneigh[i] = ns;
+
+	ct += ns;
+	const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	const int edge = ct & (alignb - 1); // round ct up to the next multiple of alignb
+	if (edge) ct += alignb - edge;
+	neighptr = firstneigh + ct; // NOTE(review): not recomputed when ct is reset just below - confirm this is intended
+	if (ct + obound > list_size) {
+	  if (i < ito - 1) {
+	    *overflow = 1;
+	    ct = (ifrom + tid * 2) * maxnbors;
+	  }
+	}
+      }
+
+      if (*overflow == 1) // on overflow the lists of this thread are emptied
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          #if __INTEL_COMPILER+0 > 1499
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int jj = 0; jj < jnum; jj++) { // NOTE(review): j is computed but never used in this loop
+            int j = jlist[jj];
+            if (need_ic && j < 0) j = -j - 1;
+          }
+        }
+
+	overflow[LMP_LOCAL_MIN] = 0;
+	overflow[LMP_LOCAL_MAX] = nlocal - 1;
+	overflow[LMP_GHOST_MIN] = nlocal;
+	overflow[LMP_GHOST_MAX] = e_nall - 1;
+
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+        } else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      } // if separate_buffers
+      #endif
+
+      if (molecular) {
+	int ito_m = ito;
+	if (ito >= nlocal) ito_m = nlocal; // special-neighbor encoding is applied to owned atoms only
+        for (int i = ifrom; i < ito_m; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            if (need_ic && j < 0) { // entry was marked as -j-1 above: restore the index and skip the special lookup
+              which = 0;
+              jlist[jj] = -j - 1;
+            } else
+              ofind_special(which, special, nspecial, i, tag[j]);
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j >= nlocal) {
+              if (j == e_nall)
+                jlist[jj] = nall_offset;
+              else if (which)
+                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+              else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+            if (which) jlist[jj] = j ^ (which << SBBITS); // fold the value of which into the bits starting at SBBITS
+          }
+        } // for i
+      } // if molecular
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          int jj = 0;
+          #pragma vector aligned
+          #pragma simd
+          for (jj = 0; jj < jnum; jj++) {
+            if (jlist[jj] >= nlocal) {
+              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+              else jlist[jj] -= ghost_offset;
+            }
+          }
+        }
+      }
+      #endif
+    } // end omp
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    _fix->start_watch(TIME_HOST_NEIGHBOR);
+    for (int n = 0; n < aend; n++) { // after the offload pass the host-side entries are reset to empty lists
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+  } else {
+    for (int i = 0; i < aend; i++) // publish the per-atom list pointers
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    if (separate_buffers) {
+      _fix->start_watch(TIME_PACK);
+      _fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(_fix->host_min_local(),
+                                    _fix->host_used_local(),
+                                    _fix->host_min_ghost(),
+                                    _fix->host_used_ghost());
+      _fix->stop_watch(TIME_PACK);
+    }
+  }
+  #else
+  #pragma vector aligned
+  #pragma simd
+  for (int i = 0; i < aend; i++) // publish the per-atom list pointers
+    list->firstneigh[i] = firstneigh + cnumneigh[i];
+  #endif
+}
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.h b/src/USER-INTEL/npair_full_bin_ghost_intel.h
new file mode 100644
index 000000000..4449dfa1e
--- /dev/null
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.h
@@ -0,0 +1,55 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+
+NPairStyle(full/bin/ghost/intel,
+           NPairFullBinGhostIntel,
+           NP_FULL | NP_BIN | NP_GHOST | NP_NEWTON | NP_NEWTOFF | 
+           NP_ORTHO | NP_TRI | NP_INTEL)
+
+#else
+
+#ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+#define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+
+#include "npair_intel.h"
+
+namespace LAMMPS_NS {
+
+class NPairFullBinGhostIntel : public NPairIntel { // binned full neighbor list build that also covers ghost atoms
+ public:
+  NPairFullBinGhostIntel(class LAMMPS *);
+  ~NPairFullBinGhostIntel() {}
+  void build(class NeighList *); // entry point: validates settings, then calls fbi() with the buffers for the fix precision
+ private:
+  template<class flt_t, class acc_t>
+  void fbi(NeighList * list, IntelBuffers<flt_t,acc_t> * buffers); // driver: grows the list buffers and runs the offload and host passes
+  template<class flt_t, class acc_t, int need_ic>
+  void fbi(const int offload, NeighList * list, // one pass of the build; offload nonzero selects the coprocessor pass
+	   IntelBuffers<flt_t,acc_t> * buffers, // need_ic != 0 compiles in the minimum image marking
+           const int astart, const int aend); // named pstart/pend in the definition, where they only gate an early return
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index b20b1dcd0..79dc75366 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -1,937 +1,938 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include "npair_intel.h"
 #include "nstencil.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 NPairIntel::NPairIntel(LAMMPS *lmp) : NPair(lmp) {
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   _fix = static_cast<FixIntel *>(modify->fix[ifix]);
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = _fix->coprocessor_number();
   _off_map_stencil = 0;
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 NPairIntel::~NPairIntel() {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_off_map_stencil) {
     const int * stencil = this->stencil;
     #pragma offload_transfer target(mic:_cop)   \
       nocopy(stencil:alloc_if(0) free_if(1))
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t, int offload_noghost, int need_ic,
           int FULL, int TRI, int THREE>
 void NPairIntel::bin_newton(const int offload, NeighList *list,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const int astart, const int aend,
                             const int offload_end) {
 
   if (aend-astart == 0) return;
 
   const int nall = atom->nlocal + atom->nghost;
   int pad = 1;
   int nall_t = nall;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload_noghost && offload) nall_t = atom->nlocal;
   if (THREE == 0 && offload) {
     if (INTEL_MIC_NBOR_PAD > 1)
       pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
   } else
   #endif
     if (THREE == 0 && INTEL_NBOR_PAD > 1)
       pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
   const int pad_width = pad;
   const int pack_width = _fix->nbor_pack_width();
 
   const ATOM_T * _noalias const x = buffers->get_x();
   int * _noalias const firstneigh = buffers->firstneigh(list);
   const int e_nall = nall_t;
 
   const int molecular = atom->molecular;
   int *ns = NULL;
   tagint *s = NULL;
   int tag_size = 0, special_size;
   if (buffers->need_tag()) tag_size = e_nall;
   if (molecular) {
     s = atom->special[0];
     ns = atom->nspecial[0];
     special_size = aend;
   } else {
     s = &buffers->_special_holder;
     ns = &buffers->_nspecial_holder;
     special_size = 0;
   }
   const tagint * _noalias const special = s;
   const int * _noalias const nspecial = ns;
   const int maxspecial = atom->maxspecial;
   const tagint * _noalias const tag = atom->tag;
 
   int * _noalias const ilist = list->ilist;
   int * _noalias numneigh = list->numneigh;
   int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int nstencil = this->nstencil;
   const int * _noalias const stencil = this->stencil;
   const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
   const int ntypes = atom->ntypes + 1;
   const int nlocal = atom->nlocal;
 
   #ifndef _LMP_INTEL_OFFLOAD
   int * const mask = atom->mask;
   tagint * const molecule = atom->molecule;
   #endif
 
   int tnum;
   int *overflow;
   double *timer_compute;
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload) {
     timer_compute = _fix->off_watch_neighbor();
     tnum = buffers->get_off_threads();
     overflow = _fix->get_off_overflow_flag();
     _fix->stop_watch(TIME_HOST_NEIGHBOR);
     _fix->start_watch(TIME_OFFLOAD_LATENCY);
   } else
   #endif
   {
     tnum = comm->nthreads;
     overflow = _fix->get_overflow_flag();
   }
   const int nthreads = tnum;
   const int maxnbors = buffers->get_max_nbors();
   int * _noalias const atombin = buffers->get_atombin();
   const int * _noalias const binpacked = buffers->get_binpacked();
 
   const int xperiodic = domain->xperiodic;
   const int yperiodic = domain->yperiodic;
   const int zperiodic = domain->zperiodic;
   const flt_t xprd_half = domain->xprd_half;
   const flt_t yprd_half = domain->yprd_half;
   const flt_t zprd_half = domain->zprd_half;
 
   flt_t * _noalias const ncachex = buffers->get_ncachex();
   flt_t * _noalias const ncachey = buffers->get_ncachey();
   flt_t * _noalias const ncachez = buffers->get_ncachez();
   int * _noalias const ncachej = buffers->get_ncachej();
   int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  int * _noalias const ncachetag = buffers->get_ncachetag();
   const int ncache_stride = buffers->ncache_stride();
 
   #ifdef _LMP_INTEL_OFFLOAD
   const int * _noalias const binhead = this->binhead;
   const int * _noalias const bins = this->bins;
   const int cop = _fix->coprocessor_number();
   const int separate_buffers = _fix->separate_buffers();
   #pragma offload target(mic:cop) if(offload) \
     in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
     in(tag:length(tag_size) alloc_if(0) free_if(0)) \
     in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
     in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
     in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
     in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
     in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     out(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(atombin:length(aend) alloc_if(0) free_if(0)) \
     in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
     in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
-    in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
     in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
     in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
     in(pack_width) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(tag)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     #ifdef _LMP_INTEL_OFFLOAD
     overflow[LMP_LOCAL_MIN] = astart;
     overflow[LMP_LOCAL_MAX] = aend - 1;
     overflow[LMP_GHOST_MIN] = e_nall;
     overflow[LMP_GHOST_MAX] = -1;
     #endif
 
     int nstencilp = 0;
     int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
     for (int k = 0; k < nstencil; k++) {
       binstart[nstencilp] = stencil[k];
       int end = stencil[k] + 1;
       for (int kk = k + 1; kk < nstencil; kk++) {
         if (stencil[kk-1]+1 == stencil[kk]) {
           end++;
           k++;
         } else break;
       }
       binend[nstencilp] = end;
       nstencilp++;
     }
 
     #if defined(_OPENMP)
     #pragma omp parallel default(none) \
       shared(numneigh, overflow, nstencilp, binstart, binend)
     #endif
     {
       #ifdef _LMP_INTEL_OFFLOAD
       int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
       #endif
 
       const int num = aend - astart;
       int tid, ifrom, ito;
 
       if (THREE) {
         IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
       } else {
         IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
       }
       ifrom += astart;
       ito += astart;
       int e_ito = ito;
       if (THREE && ito == num) {
-        int imod = ito % pack_width;
+        int imod = ito & (pack_width - 1);
         if (imod) e_ito += pack_width - imod;
       }
       const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
 
       int which;
 
       int pack_offset = maxnbors;
       if (THREE) pack_offset *= pack_width;
       int ct = (ifrom + tid * 2) * maxnbors;
       int *neighptr = firstneigh + ct;
       const int obound = pack_offset + maxnbors * 2;
 
       const int toffs = tid * ncache_stride;
       flt_t * _noalias const tx = ncachex + toffs;
       flt_t * _noalias const ty = ncachey + toffs;
       flt_t * _noalias const tz = ncachez + toffs;
       int * _noalias const tj = ncachej + toffs;
       int * _noalias const tjtype = ncachejtype + toffs;
+      int * _noalias const ttag = ncachetag + toffs;
 
       flt_t * _noalias itx;
       flt_t * _noalias ity;
       flt_t * _noalias itz;
       int * _noalias itj;
       int * _noalias itjtype;
 
       // loop over all atoms in other bins in stencil, store every pair
       int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
       if (THREE) {
         lane = 0;
         max_chunk = 0;
       }
       for (int i = ifrom; i < ito; i++) {
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const int itype = x[i].w;
         tagint itag;
         if (THREE) itag = tag[i];
         const int ioffset = ntypes * itype;
 
         const int ibin = atombin[i];
         if (ibin != oldbin) {
           oldbin = ibin;
           ncount = 0;
           for (int k = 0; k < nstencilp; k++) {
             const int bstart = binhead[ibin + binstart[k]];
             const int bend = binhead[ibin + binend[k]];
             #if defined(LMP_SIMD_COMPILER)
             #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = bstart; jj < bend; jj++)
               tj[ncount++] = binpacked[jj];
           }
           #if defined(LMP_SIMD_COMPILER)
           #pragma vector aligned
           #pragma simd
           #endif
           for (int u = 0; u < ncount; u++) {
             const int j = tj[u];
             tx[u] = x[j].x;
             ty[u] = x[j].y;
             tz[u] = x[j].z;
             tjtype[u] = x[j].w;
+	    if (THREE) ttag[u] = tag[j];
           }
 
           if (FULL == 0 || TRI == 1) {
             icount = 0;
             istart = ncount;
             const int alignb = INTEL_DATA_ALIGN / sizeof(int);
-            int nedge = istart % alignb;
+            int nedge = istart & (alignb - 1);
             if (nedge) istart + (alignb - nedge);
             itx = tx + istart;
             ity = ty + istart;
             itz = tz + istart;
             itj = tj + istart;
             itjtype = tjtype + istart;
 
             const int bstart = binhead[ibin];
             const int bend = binhead[ibin + 1];
             #if defined(LMP_SIMD_COMPILER)
             #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = bstart; jj < bend; jj++) {
               const int j = binpacked[jj];
               itj[icount] = j;
               itx[icount] = x[j].x;
               ity[icount] = x[j].y;
               itz[icount] = x[j].z;
               itjtype[icount] = x[j].w;
               icount++;
             }
             if (icount + istart > obound) *overflow = 1;
           } else
             if (ncount > obound) *overflow = 1;
         }
 
         // ---------------------- Loop over i bin
 
         int n = 0;
         if (FULL == 0 || TRI == 1) {
           #if defined(LMP_SIMD_COMPILER)
           #pragma vector aligned
           #pragma ivdep
           #endif
           for (int u = 0; u < icount; u++) {
             int addme = 1;
             int j = itj[u];
 
             // Cutoff Check
             const flt_t delx = xtmp - itx[u];
             const flt_t dely = ytmp - ity[u];
             const flt_t delz = ztmp - itz[u];
             const int jtype = itjtype[u];
             const flt_t rsq = delx * delx + dely * dely + delz * delz;
             if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
 
             // i bin (half) check and offload ghost check
             if (j < nlocal) {
-              const int ijmod = (i + j) % 2;
+              const int ijmod = (i + j) & 1;
               if (i > j) {
                 if (ijmod == 0) addme = 0;
               } else if (i < j) {
                 if (ijmod == 1) addme = 0;
               } else
                 addme = 0;
               #ifdef _LMP_INTEL_OFFLOAD
               if (offload_noghost && i < offload_end) addme = 0;
               #endif
             } else {
               #ifdef _LMP_INTEL_OFFLOAD
               if (offload_noghost && offload) addme = 0;
               #endif
               if (itz[u] < ztmp) addme = 0;
               if (itz[u] == ztmp) {
                 if (ity[u] < ytmp) addme = 0;
                 if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
               }
             }
 
             if (need_ic) {
               int no_special;
               ominimum_image_check(no_special, delx, dely, delz);
               if (no_special)
                 j = -j - 1;
             }
 
             if (addme)
               neighptr[n++] = j;
           }
         } // if FULL==0
 
         // ---------------------- Loop over other bins
 
         int n2, *neighptr2;
         if (THREE) {
           n = pack_offset;
           n2 = pack_offset + maxnbors;
           neighptr2 = neighptr;
         }
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int u = 0; u < ncount; u++) {
           int addme = 1;
           int j = tj[u];
 
           if (FULL)
             if (i == j) addme = 0;
 
           // Cutoff Check
           const flt_t delx = xtmp - tx[u];
           const flt_t dely = ytmp - ty[u];
           const flt_t delz = ztmp - tz[u];
           const int jtype = tjtype[u];
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
 
           // Triclinic
           if (TRI) {
             if (tz[u] < ztmp) addme = 0;
             if (tz[u] == ztmp) {
               if (ty[u] < ytmp) addme = 0;
               if (ty[u] == ytmp) {
                 if (tx[u] < xtmp) addme = 0;
                 if (tx[u] == xtmp && j <= i) addme = 0;
               }
             }
           }
 
           // offload ghost check
           #ifdef _LMP_INTEL_OFFLOAD
           if (offload_noghost) {
             if (j < nlocal) {
               if (i < offload_end) addme = 0;
             } else if (offload) addme = 0;
           }
           #endif
 
-          int pj;
-          if (THREE) pj = j;
           if (need_ic) {
             int no_special;
             ominimum_image_check(no_special, delx, dely, delz);
             if (no_special)
               j = -j - 1;
           }
 
           if (THREE) {
-            const int jtag = tag[pj];
+            const int jtag = ttag[u];
             int flist = 0;
             if (itag > jtag) {
-              if ((itag+jtag) % 2 == 0) flist = 1;
+	      if (((itag+jtag) & 1) == 0) flist = 1;
             } else if (itag < jtag) {
-              if ((itag+jtag) % 2 == 1) flist = 1;
+	      if (((itag+jtag) & 1) == 1) flist = 1;
             } else {
               if (tz[u] < ztmp) flist = 1;
               else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
               else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
                 flist = 1;
             }
             if (addme) {
               if (flist)
                 neighptr2[n2++] = j;
               else
                 neighptr[n++] = j;
             }
           } else {
             if (addme)
               neighptr[n++] = j;
           }
         } // for u
 
         #ifndef _LMP_INTEL_OFFLOAD
         if (exclude) {
           int alln = n;
           if (THREE) n = pack_offset;
           else n = 0;
           for (int u = pack_offset; u < alln; u++) {
             const int j = neighptr[u];
             int pj = j;
             if (need_ic)
               if (pj < 0) pj = -j - 1;
             const int jtype = x[pj].w;
             if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
             neighptr[n++] = j;
           }
           if (THREE) {
             alln = n2;
             n2 = pack_offset + maxnbors;
             for (int u = pack_offset + maxnbors; u < alln; u++) {
               const int j = neighptr[u];
               int pj = j;
               if (need_ic)
                 if (pj < 0) pj = -j - 1;
               const int jtype = x[pj].w;
               if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
               neighptr[n2++] = j;
             }
           }
         }
         #endif
         int ns;
         if (THREE) {
           int alln = n;
           ns = n - pack_offset;
           atombin[i] = ns;
           n = lane;
           for (int u = pack_offset; u < alln; u++) {
             neighptr[n] = neighptr[u];
             n += pack_width;
           }
           ns += n2 - pack_offset - maxnbors;
           for (int u = pack_offset + maxnbors; u < n2; u++) {
             neighptr[n] = neighptr[u];
             n += pack_width;
           }
           if (ns > maxnbors) *overflow = 1;
         } else
           if (n > maxnbors) *overflow = 1;
 
         ilist[i] = i;
         cnumneigh[i] = ct;
         if (THREE) {
           cnumneigh[i] += lane;
           numneigh[i] = ns;
         } else {
-          int edge = (n % pad_width);
+          int edge = n & (pad_width - 1);
           if (edge) {
             const int pad_end = n + (pad_width - edge);
             #if defined(LMP_SIMD_COMPILER)
             #pragma vector aligned
             #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
                     avg=INTEL_COMPILE_WIDTH/2
             #endif
             for ( ; n < pad_end; n++)
               neighptr[n] = e_nall;
           }
           numneigh[i] = n;
         }
 
         if (THREE) {
           if (ns > max_chunk) max_chunk = ns;
           lane++;
           if (lane == pack_width) {
             ct += max_chunk * pack_width;
             const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-            const int edge = (ct % alignb);
+            const int edge = ct & (alignb - 1);
             if (edge) ct += alignb - edge;
             neighptr = firstneigh + ct;
             max_chunk = 0;
             pack_offset = maxnbors * pack_width;
             lane = 0;
             if (ct + obound > list_size) {
               if (i < ito - 1) {
                 *overflow = 1;
                 ct = (ifrom + tid * 2) * maxnbors;
               }
             }
           }
         } else {
           ct += n;
           const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          const int edge = (ct % alignb);
+          const int edge = ct & (alignb - 1);
           if (edge) ct += alignb - edge;
           neighptr = firstneigh + ct;
           if (ct + obound > list_size) {
             if (i < ito - 1) {
               *overflow = 1;
               ct = (ifrom + tid * 2) * maxnbors;
             }
           }
         }
       }
 
       if (*overflow == 1)
         for (int i = ifrom; i < ito; i++)
           numneigh[i] = 0;
 
       #ifdef _LMP_INTEL_OFFLOAD
       int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
       int ghost_offset = 0, nall_offset = e_nall;
       if (separate_buffers) {
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
           #if __INTEL_COMPILER+0 > 1499
           #pragma vector aligned
           #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
           #endif
           for (int jj = 0; jj < jnum; jj++) {
             int j = jlist[jj];
             if (need_ic && j < 0) j = -j - 1;
             if (j < nlocal) {
               if (j < vlmin) vlmin = j;
               if (j > vlmax) vlmax = j;
             } else {
               if (j < vgmin) vgmin = j;
               if (j > vgmax) vgmax = j;
             }
           }
         }
         lmin = MIN(lmin,vlmin);
         gmin = MIN(gmin,vgmin);
         lmax = MAX(lmax,vlmax);
         gmax = MAX(gmax,vgmax);
 
         #if defined(_OPENMP)
         #pragma omp critical
         #endif
         {
           if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
           if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
           if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
           if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
         }
         #pragma omp barrier
 
         int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
         if (nghost < 0) nghost = 0;
         if (offload) {
           ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
           nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
         } else {
           ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
           nall_offset = nlocal + nghost;
         }
       } // if separate_buffers
       #endif
 
       if (molecular) {
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
 
           if (THREE) {
             const int trip = jnum * pack_width;
             for (int jj = 0; jj < trip; jj+=pack_width) {
               const int j = jlist[jj];
               if (need_ic && j < 0) {
                 which = 0;
                 jlist[jj] = -j - 1;
               } else
                 ofind_special(which, special, nspecial, i, tag[j]);
               #ifdef _LMP_INTEL_OFFLOAD
               if (j >= nlocal) {
                 if (j == e_nall)
                   jlist[jj] = nall_offset;
                 else if (which)
                   jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
                 else jlist[jj]-=ghost_offset;
               } else
               #endif
               if (which) jlist[jj] = j ^ (which << SBBITS);
             }
           } else {
             #if defined(LMP_SIMD_COMPILER)
             #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = 0; jj < jnum; jj++) {
               const int j = jlist[jj];
               if (need_ic && j < 0) {
                 which = 0;
                 jlist[jj] = -j - 1;
               } else
                 ofind_special(which, special, nspecial, i, tag[j]);
               #ifdef _LMP_INTEL_OFFLOAD
               if (j >= nlocal) {
                 if (j == e_nall)
                   jlist[jj] = nall_offset;
                 else if (which)
                   jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
                 else jlist[jj]-=ghost_offset;
               } else
               #endif
               if (which) jlist[jj] = j ^ (which << SBBITS);
             }
           }
         } // for i
       } // if molecular
       #ifdef _LMP_INTEL_OFFLOAD
       else if (separate_buffers) {
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
           int jj = 0;
           #pragma vector aligned
           #pragma simd
           for (jj = 0; jj < jnum; jj++) {
             if (jlist[jj] >= nlocal) {
               if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
               else jlist[jj] -= ghost_offset;
             }
           }
         }
       }
       #endif
     } // end omp
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload) {
     _fix->stop_watch(TIME_OFFLOAD_LATENCY);
     _fix->start_watch(TIME_HOST_NEIGHBOR);
     for (int n = 0; n < aend; n++) {
       ilist[n] = n;
       numneigh[n] = 0;
     }
   } else {
     for (int i = astart; i < aend; i++)
       list->firstneigh[i] = firstneigh + cnumneigh[i];
     if (separate_buffers) {
       _fix->start_watch(TIME_PACK);
       _fix->set_neighbor_host_sizes();
       buffers->pack_sep_from_single(_fix->host_min_local(),
                                     _fix->host_used_local(),
                                     _fix->host_min_ghost(),
                                     _fix->host_used_ghost());
       _fix->stop_watch(TIME_PACK);
     }
   }
   #else
   #pragma vector aligned
   #pragma simd
   for (int i = astart; i < aend; i++)
     list->firstneigh[i] = firstneigh + cnumneigh[i];
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 #ifdef _LMP_INTEL_OFFLOAD
 void NPairIntel::grow_stencil()
 {
   if (_off_map_stencil != stencil) {
     if (_off_map_stencil) {
       const int * stencil = _off_map_stencil;
       #pragma offload_transfer target(mic:_cop) \
         nocopy(stencil:alloc_if(0) free_if(1))
     }
     _off_map_stencil = stencil;
     const int * stencil = _off_map_stencil;
     const int maxstencil = ns->get_maxstencil();
     #pragma offload_transfer target(mic:_cop)   \
       in(stencil:length(maxstencil) alloc_if(1) free_if(0))
   }
 }
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 // ---- Half, no IC
 
 template void NPairIntel::bin_newton<float, float, 0, 0, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 0, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 0, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Half, IC
 
 template void NPairIntel::bin_newton<float, float, 0, 1, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 1, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 1, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Tri, no IC
 
 template void NPairIntel::bin_newton<float, float, 0, 0, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 0, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 0, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Tri, IC
 
 template void NPairIntel::bin_newton<float, float, 0, 1, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 1, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 1, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Full, no IC
 
 template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Full, IC
 
 template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- 3-body, no IC
 
 template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- 3-body, IC
 
 template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 #ifdef _LMP_INTEL_OFFLOAD
 
 // ---- Half, no IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 0, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 0, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 0, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Half, IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 1, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 1, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 1, 0, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Tri, no IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 0, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 0, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 0, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Tri, IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 1, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 1, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 1, 0, 1, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Full, no IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- Full, IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 0>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- 3-body, no IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 // ---- 3-body, IC, no ghost
 
 template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
    const int);
 template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 1>
   (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
    const int);
 
 #endif
diff --git a/src/USER-INTEL/pair_airebo_intel.cpp b/src/USER-INTEL/pair_airebo_intel.cpp
new file mode 100644
index 000000000..ad3c97c9d
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_intel.cpp
@@ -0,0 +1,4891 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push, target(mic))
+#endif
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <stddef.h>
+#include "lmptype.h"
+#include "intel_preprocess.h"
+#include "intel_intrinsics_airebo.h"
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+#include <omp.h>
+#include <string.h>
+#include "pair_airebo_intel.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "force.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+#include "group.h"
+#include "kspace.h"
+#include "modify.h"
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push, target(mic))
+#endif
+
+template<typename flt_t, typename acc_t>
+struct LAMMPS_NS::PairAIREBOIntelParam {  // filled in by get_param()
+  flt_t cutlj, cutljrebosq, cut3rebo;
+  flt_t sigmin, sigcut;
+  flt_t cutljsq[2][2];  // cutljsq, lj1..lj4 are read via this->name[0]
+  flt_t lj1[2][2], lj2[2][2], lj3[2][2], lj4[2][2];
+  // unless noted, get_param() copies each member from this-><same name>
+  flt_t smin, Nmin, Nmax, NCmin, NCmax, thmin, thmax;
+  flt_t rcmin[2][2], rcmax[2][2], rcmaxsq[2][2], rcmaxp[2][2];
+  flt_t Q[2][2], alpha[2][2], A[2][2], rho[2][2], BIJc[2][2][3],
+      Beta[2][2][3];
+  flt_t rcLJmin[2][2], rcLJmax[2][2], rcLJmaxsq[2][2], bLJmin[2][2],
+      bLJmax[2][2];
+  flt_t epsilon[2][2], sigma[2][2], epsilonT[2][2];
+
+  // spline coefficients
+
+  flt_t gCdom[5], gC1[4][6], gC2[4][6], gHdom[4], gH[3][6];
+  flt_t gDom[5+4];         // derived: gCdom followed by gHdom
+  flt_t gVal[(4+4+3)*6];   // derived: rows of gC1, then gC2, then gH
+  flt_t pCCdom[2][2], pCHdom[2][2], pCC[4][4][16], pCH[4][4][16];
+  flt_t piCCdom[3][2], piCHdom[3][2], piHHdom[3][2];
+  acc_t piCC[4][4][9][64], piCH[4][4][9][64], piHH[4][4][9][64];
+  flt_t Tijdom[3][2];
+  acc_t Tijc[4][4][9][64];  // acc_t (like piCC/piCH/piHH), not flt_t
+
+  // spline knot values
+
+  flt_t PCCf[5][5], PCCdfdx[5][5], PCCdfdy[5][5], PCHf[5][5];
+  flt_t PCHdfdx[5][5], PCHdfdy[5][5];
+  flt_t piCCf[5][5][11], piCCdfdx[5][5][11];
+  flt_t piCCdfdy[5][5][11], piCCdfdz[5][5][11];
+  flt_t piCHf[5][5][11], piCHdfdx[5][5][11];
+  flt_t piCHdfdy[5][5][11], piCHdfdz[5][5][11];
+  flt_t piHHf[5][5][11], piHHdfdx[5][5][11];
+  flt_t piHHdfdy[5][5][11], piHHdfdz[5][5][11];
+  flt_t Tf[5][5][10], Tdfdx[5][5][10], Tdfdy[5][5][10], Tdfdz[5][5][10];
+};
+
+namespace {
+// file-local helper types and forward declarations of the kernels
+struct NeighListAIREBO {
+  int * num; /* num_all */
+  int * num_half; /* num_all */
+  int * offset; /* num_all */
+  int * entries; /* num_all * num_neighs_per_atom */
+};
+
+template<typename flt_t>
+struct AtomAIREBOT {
+  flt_t x, y, z;
+  int w;
+};
+
+template<typename acc_t>
+struct ResultForceT {
+  acc_t x, y, z, w;
+};
+// argument bundle passed by pointer to each kernel declared below
+template<typename flt_t, typename acc_t>
+struct KernelArgsAIREBOT {
+  int num_local;
+  int num_all;
+  int num_neighs_per_atom;
+  int num_types;
+  int frebo_from_atom, frebo_to_atom;
+  int neigh_from_atom, neigh_to_atom;
+  int rebuild_flag;
+  flt_t skin;
+  struct NeighListAIREBO neigh_lmp;
+  struct NeighListAIREBO neigh_rebo;
+  PairAIREBOIntelParam<flt_t,acc_t> params;
+  struct AtomAIREBOT<flt_t> * x; /* num_all */
+  int * tag; /* num_all */
+  flt_t * nC, * nH; /* num_all */
+  int * map; /* num_types+1 */
+  struct ResultForceT<acc_t> * result_f; /* num_all */
+  acc_t result_eng;
+};
+
+template<typename flt_t, typename acc_t>
+void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag);
+template<typename flt_t, typename acc_t>
+void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka);
+template<typename flt_t, typename acc_t>
+void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag);
+
+}
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PairAIREBOIntel::PairAIREBOIntel(LAMMPS *lmp) : PairAIREBO(lmp)
+{
+  fix = NULL;                  // assigned later in init_style()
+  REBO_list_data = NULL;       // the three REBO_* arrays start out unset
+  REBO_num_skin = NULL;
+  REBO_cnumneigh = NULL;
+  suffix_flag = suffix_flag | Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairAIREBOIntel::~PairAIREBOIntel()
+{
+  memory->destroy(REBO_list_data);   // hand the three REBO_* arrays
+  memory->destroy(REBO_num_skin);    // back to the memory class
+  memory->destroy(REBO_cnumneigh);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairAIREBOIntel::init_style()
+{
+  // run the base-class setup, then flag the last neighbor request as intel
+  PairAIREBO::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  // pack_force_const() + need_tag(1) on the buffers of the active precision
+  const int prec = fix->precision();
+  if (prec == FixIntel::PREC_MODE_DOUBLE) {
+    pack_force_const(fix->get_double_buffers());
+    fix->get_double_buffers()->need_tag(1);
+  } else if (prec == FixIntel::PREC_MODE_MIXED) {
+    pack_force_const(fix->get_mixed_buffers());
+    fix->get_mixed_buffers()->need_tag(1);
+  } else {
+    pack_force_const(fix->get_single_buffers());
+    fix->get_single_buffers()->need_tag(1);
+  }
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_noghost())
+    error->all(FLERR,"The 'ghost no' option cannot be used with airebo/intel.");
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<typename T> T * calloc_it(size_t size)
+{
+  void * const raw = calloc(size, sizeof(T));  // zero-filled, NULL on failure
+  return static_cast<T*>(raw);
+}
+
+void PairAIREBOIntel::compute(int eflag, int vflag)
+{
+  const int prec = fix->precision();  // selects the template instantiation
+  if (prec == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers());
+  else if (prec == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers());
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers());
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t>
+PairAIREBOIntelParam<flt_t,acc_t> PairAIREBOIntel::get_param()
+{
+  PairAIREBOIntelParam<flt_t,acc_t> fc;
+
+  // COPY_N converts n doubles at src into T-typed elements of dst.
+  // A: this->a -> fc.a as flt_t;  B: same, but as acc_t;
+  // A0: this->a[0] (row pointer) -> fc.a as flt_t, count taken from fc.a
+#define COPY_N(T, dst, src, n)                                         \
+  for (size_t i = 0; i < (n); i++)                                     \
+    reinterpret_cast<T*>(&(dst))[i] = reinterpret_cast<double*>(src)[i];
+#define A(a)  COPY_N(flt_t, fc.a, &this->a, sizeof(this->a)/sizeof(double))
+#define A0(a) COPY_N(flt_t, fc.a, this->a[0], sizeof(fc.a)/sizeof(flt_t))
+#define B(a)  COPY_N(acc_t, fc.a, &this->a, sizeof(this->a)/sizeof(double))
+
+  A(cutlj) A(cutljrebosq) A(cut3rebo) A(sigmin) A(sigcut);
+  A0(cutljsq) A0(lj1) A0(lj2) A0(lj3) A0(lj4);
+  A(smin) A(Nmin) A(Nmax) A(NCmin) A(NCmax) A(thmin) A(thmax);
+  A(rcmin) A(rcmax) A(rcmaxsq) A(rcmaxp);
+  A(Q) A(alpha) A(A) A(rho) A(BIJc) A(Beta);
+  A(rcLJmin) A(rcLJmax) A(rcLJmaxsq) A(bLJmin) A(bLJmax);
+  A(epsilon) A(sigma) A(epsilonT);
+  A(gCdom) A(gC1) A(gC2) A(gHdom) A(gH);
+  A(pCCdom) A(pCHdom) A(pCC) A(pCH);
+  A(piCCdom) A(piCHdom) A(piHHdom) B(piCC) B(piCH) B(piHH) A(Tijdom) B(Tijc);
+  A(PCCf) A(PCCdfdx) A(PCCdfdy) A(PCHf) A(PCHdfdx) A(PCHdfdy);
+  A(piCCf) A(piCCdfdx) A(piCCdfdy) A(piCCdfdz);
+  A(piCHf) A(piCHdfdx) A(piCHdfdy) A(piCHdfdz);
+  A(piHHf) A(piHHdfdx) A(piHHdfdy) A(piHHdfdz);
+  A(Tf) A(Tdfdx) A(Tdfdy) A(Tdfdz);
+
+#undef B
+#undef A0
+#undef A
+#undef COPY_N
+
+  // derived tables: gDom = gCdom | gHdom, gVal = gC1 | gC2 | gH (row-major)
+  int n = 0;
+  for (int i = 0; i < 5; i++) fc.gDom[n++] = fc.gCdom[i];
+  for (int i = 0; i < 4; i++) fc.gDom[n++] = fc.gHdom[i];
+  n = 0;
+  for (int i = 0; i < 4*6; i++) fc.gVal[n++] = fc.gC1[i/6][i%6];
+  for (int i = 0; i < 4*6; i++) fc.gVal[n++] = fc.gC2[i/6][i%6];
+  for (int i = 0; i < 3*6; i++) fc.gVal[n++] = fc.gH[i/6][i%6];
+
+  return fc;
+}
+
+/* ----------------------------------------------------------------------
+   Precision-templated force computation.
+   eflag/vflag  energy and virial request flags
+   buffers      Intel package buffers for this precision mode
+   Steps: set up energy/virial tallying, pack atom data into the buffers
+   when neighbor->ago is non-zero and the fix does not use separate
+   buffers, grow the per-atom REBO work arrays whenever atom->nmax exceeds
+   maxlocal (on the host and, in offload builds with a coprocessor, on the
+   device), then call eval() once for the offload range [0,offload_end)
+   and once for the host range [host_start,inum).
+   ---------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t>
+void PairAIREBOIntel::compute(
+    int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers
+) {
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = vflag_atom = 0;
+  pvector[0] = pvector[1] = pvector[2] = 0.0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    // only pack in parallel when more than INTEL_HTHREADS threads exist
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  if (atom->nmax > maxlocal) {
+    // Release the old device copies before the host arrays are destroyed.
+    // The guard must be _LMP_INTEL_OFFLOAD (with leading underscore) like
+    // every other offload section in this file, otherwise this branch is
+    // never compiled and the device allocations are never freed.
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (maxlocal > 0 && _cop >= 0) {
+      int * const REBO_numneigh = this->REBO_numneigh;
+      int * const REBO_num_skin = this->REBO_num_skin;
+      int * const REBO_cnumneigh = this->REBO_cnumneigh;
+      int * const REBO_list_data = this->REBO_list_data;
+      double * const nC = this->nC;
+      double * const nH = this->nH;
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(REBO_numneigh: alloc_if(0) free_if(1)) \
+        nocopy(REBO_cnumneigh: alloc_if(0) free_if(1)) \
+        nocopy(REBO_num_skin: alloc_if(0) free_if(1)) \
+        nocopy(REBO_list_data: alloc_if(0) free_if(1)) \
+        nocopy(nH: alloc_if(0) free_if(1)) \
+        nocopy(nC: alloc_if(0) free_if(1))
+    }
+    #endif
+    maxlocal = atom->nmax;
+    memory->destroy(REBO_numneigh);
+    memory->destroy(REBO_cnumneigh);
+    // REBO_num_skin is re-created below and must be released as well.
+    // NOTE(review): relies on REBO_num_skin being NULL-initialized like
+    // REBO_cnumneigh / REBO_list_data -- confirm in the constructor.
+    memory->destroy(REBO_num_skin);
+    memory->destroy(REBO_list_data);
+    memory->sfree(REBO_firstneigh);
+    memory->destroy(nC);
+    memory->destroy(nH);
+    memory->create(REBO_numneigh,maxlocal,"AIREBO:numneigh");
+    memory->create(REBO_cnumneigh,maxlocal,"AIREBO:cnumneigh");
+    memory->create(REBO_num_skin,maxlocal,"AIREBO:num_skin");
+    int max_nbors = buffers->get_max_nbors();
+    memory->create(REBO_list_data,maxlocal * max_nbors,"AIREBO:list_data");
+    REBO_firstneigh = (int **) memory->smalloc(maxlocal*sizeof(int *),
+                                               "AIREBO:firstneigh");
+    memory->create(nC,maxlocal,"AIREBO:nC");
+    memory->create(nH,maxlocal,"AIREBO:nH");
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_cop >= 0) {
+      // allocate (without copying) device storage for the new host arrays
+      int * const REBO_numneigh = this->REBO_numneigh;
+      int * const REBO_num_skin = this->REBO_num_skin;
+      int * const REBO_cnumneigh = this->REBO_cnumneigh;
+      int * const REBO_list_data = this->REBO_list_data;
+      double * const nC = this->nC;
+      double * const nH = this->nH;
+      const int mnml = max_nbors * maxlocal;
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(REBO_numneigh: length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(REBO_cnumneigh:length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(REBO_num_skin: length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(REBO_list_data:length(mnml) alloc_if(1) free_if(0)) \
+        nocopy(nH: length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(nC: length(maxlocal) alloc_if(1) free_if(0))
+    }
+    #endif
+  }
+
+  // eval<EVFLAG,EFLAG>(offload, vflag, ...): first call covers the offload
+  // range, second call the host range
+  if (evflag || vflag_fdotr) {
+    int ovflag = 0;
+    if (vflag_fdotr) ovflag = 2;
+    else if (vflag) ovflag = 1;
+    if (eflag) {
+      eval<1,1>(1, ovflag, buffers, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, host_start, inum);
+    } else {
+      eval<1,0>(1, ovflag, buffers, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, host_start, inum);
+    }
+  } else {
+    eval<0,0>(1, 0, buffers, 0, offload_end);
+    eval<0,0>(0, 0, buffers, host_start, inum);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Run the AIREBO kernels for the atoms in [astart, aend).
+   EVFLAG/EFLAG  compile-time switches for energy/virial bookkeeping
+   offload       non-zero for the coprocessor pass, zero for the host pass
+   vflag         virial mode passed on to the reduction/result macros
+   buffers       Intel package buffers for this precision mode
+   Returns immediately when the range is empty.  Inside the (optionally
+   offloaded) region each OpenMP thread zeroes its own force slice, runs
+   aut_rebo_neigh(), waits at a barrier, then runs aut_frebo() and, when
+   ljflag is set, aut_lennard_jones().  Forces and energy/virial sums are
+   handed to fix->add_result_array() at the end.
+   ---------------------------------------------------------------------- */
+
+template<int EVFLAG, int EFLAG, class flt_t, class acc_t>
+void PairAIREBOIntel::eval(
+    const int offload, const int vflag,
+    IntelBuffers<flt_t,acc_t> * buffers,
+    const int astart, const int aend
+) {
+  const int inum = aend - astart;  // number of atoms in this pass
+  if (inum == 0) {
+    return;
+  }
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  const int * _noalias const numneighhalf = buffers->get_atombin();
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  int * const tag = atom->tag;
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, 1 /*NEWTON_PAIR*/, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  const double skin = neighbor->skin;
+  const int max_nbor = buffers->get_max_nbors();
+  const PairAIREBOIntelParam<flt_t,acc_t> param = get_param<flt_t,acc_t>();
+
+  // offload here
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+  // plain local copies of the members named in the offload clauses below
+  int * const REBO_numneigh = this->REBO_numneigh;
+  int * const REBO_num_skin = this->REBO_num_skin;
+  int * const REBO_cnumneigh = this->REBO_cnumneigh;
+  int * const REBO_list_data = this->REBO_list_data;
+  double * const nC = this->nC;
+  double * const nH = this->nH;
+  const int torflag = this->torflag;
+  const int ljflag = this->ljflag;
+  const int morseflag = this->morseflag;
+  int * const map = this->map;
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+
+  #pragma offload target(mic:_cop) if(offload) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
+    in(f_stride,nlocal,minlocal,separate_flag,offload) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    in(param,skin,max_nbor) \
+    in(tag: length(0) alloc_if(0) free_if(0)) \
+    in(torflag, ljflag, morseflag, ago) \
+    in(nC: length(0) alloc_if(0) free_if(0)) \
+    in(nH: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_numneigh: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_cnumneigh: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_num_skin: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_list_data: length(0) alloc_if(0) free_if(0)) \
+    in(map: length(0) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(1 /*NEWTON_PAIR*/, separate_flag, nlocal, nall,
+			      f_stride, x, 0/*q*/);
+
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = oecoul = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel \
+      shared(f_start,f_stride,nlocal,nall,minlocal)	\
+      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;  // shift the thread-local range into [astart, aend)
+      iito += astart;
+
+      int neigh_iifrom, neigh_iito;
+      IP_PRE_omp_range(neigh_iifrom, neigh_iito, tid, nall, nthreads);
+
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);  // this thread's slice, biased by -minlocal
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      KernelArgsAIREBOT<flt_t,acc_t> args;
+      args.num_local = nlocal;
+      args.num_all = nall;
+      args.num_neighs_per_atom = max_nbor;
+      args.num_types = ntypes;
+      args.frebo_from_atom = 0;  // provisional; replaced by the per-thread range below
+      args.frebo_to_atom = args.num_local;
+      args.neigh_from_atom = 0;  // provisional; replaced by the per-thread range below
+      args.neigh_to_atom = args.num_all;
+      args.rebuild_flag = ago == 0;
+      args.skin = skin;
+      args.neigh_lmp.num = const_cast<int*>(numneigh);
+      args.neigh_lmp.num_half = const_cast<int*>(numneighhalf);
+      args.neigh_lmp.offset = const_cast<int*>(cnumneigh);
+      args.neigh_lmp.entries = const_cast<int*>(firstneigh);
+      args.neigh_rebo.num = REBO_numneigh;
+      args.neigh_rebo.num_half = REBO_num_skin;
+      args.neigh_rebo.offset = REBO_cnumneigh;
+      args.neigh_rebo.entries = REBO_list_data;
+      args.params = param;
+      args.tag = tag;
+      args.nC = reinterpret_cast<flt_t*>(nC);  // NOTE(review): double storage reused as flt_t -- confirm intended
+      args.nH = reinterpret_cast<flt_t*>(nH);
+      args.map = map;
+      args.result_eng = 0;
+      args.x = (AtomAIREBOT<flt_t>*) x;
+
+      args.result_f = (ResultForceT<acc_t> *) f;
+      args.neigh_from_atom = neigh_iifrom;
+      args.neigh_to_atom = neigh_iito;
+      args.frebo_from_atom = iifrom;
+      args.frebo_to_atom = iito;
+
+      aut_rebo_neigh(&args);  // all threads finish this before aut_frebo (barrier below)
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      aut_frebo(&args, torflag);
+      if (ljflag) aut_lennard_jones(&args, morseflag);
+
+      oevdwl += args.result_eng;  // summed across threads by the reduction clause
+
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, x,
+                              offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
+    } // end of omp parallel region
+    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = oecoul;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ----------------------------------------------------------------------
+   Recompute the pair cutoffs and store the squared neighbor cutoffs in
+   the Intel buffers.  For every type pair with coefficients set,
+   init_one() is called again and cutsq, cutneighsq and cutneighghostsq
+   are filled symmetrically.  In offload builds with a coprocessor the
+   cutneighsq table is sent to the device and device storage for `map`
+   is allocated and filled.
+   ---------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t>
+void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
+  int tp1 = atom->ntypes + 1;
+  // tables are sized ntypes+1 and indexed from type 1 in the loops below
+  buffers->set_ntypes(tp1,1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+  flt_t **cutneighghostsq = buffers->get_cutneighghostsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+        cut = cutghost[i][j] + neighbor->skin;  // cut is reused for the ghost cutoff
+        cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut;
+      }
+    }
+  }
+  // nothing is transferred without a coprocessor (_cop < 0)
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * ocutneighsq = cutneighsq[0];
+  size_t VL = 512 / 8 / sizeof(flt_t);  // not used below
+  int ntypes = tp1;  // not used below
+  int tp1sq = tp1 * tp1;
+  // TODO the lifecycle of "map" is currently not 100% correct
+  // it might not be freed if this method is called more than once
+  int * map = this->map;
+  #pragma offload_transfer target(mic:_cop) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(map: length(tp1) alloc_if(1) free_if(0))
+  #endif
+
+}
+
+/* ----------------------------------------------------------------------
+    Implementation
+   ---------------------------------------------------------------------- */
+
+namespace {
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push, target(mic))
+#endif
+
+/* Math wrappers overloaded on the argument type: the double versions call
+   the C double routine, the float versions call the matching *f routine. */
+namespace overloaded {
+  // double precision
+  double sqrt(double v) { return ::sqrt(v); }
+  double sin(double v) { return ::sin(v); }
+  double cos(double v) { return ::cos(v); }
+  double exp(double v) { return ::exp(v); }
+  double pow(double base, double expo) { return ::pow(base, expo); }
+
+  // single precision
+  float sqrt(float v) { return ::sqrtf(v); }
+  float sin(float v) { return ::sinf(v); }
+  float cos(float v) { return ::cosf(v); }
+  float exp(float v) { return ::expf(v); }
+  float pow(float base, float expo) { return ::powf(base, expo); }
+}
+
+/* ----------------------------------------------------------------------
+    Scalar AIREBO implementation, standalone, with massive code reuse
+    compared to original code.
+   ---------------------------------------------------------------------- */
+
+// <math.h> provides M_PI on many platforms; define it only when missing so
+// that a differently spelled system definition is not redefined here
+#ifndef M_PI
+#define M_PI           3.14159265358979323846  /* pi */
+#endif
+
+// type indices compared against itype/jtype in the spline helpers
+#define CARBON 0
+#define HYDROGEN 1
+// tolerance used to decide whether a spline argument sits on an integer knot
+#define TOL 1.0e-9
+
+/* Smaller of a and b; b is returned whenever `a < b` is false, which
+   includes the case of an unordered (NaN) comparison. */
+template<typename T>
+inline T fmin_nonan(T a, T b) {
+  if (a < b) return a;
+  return b;
+}
+/* Larger of a and b; b is returned whenever `a > b` is false, which
+   includes the case of an unordered (NaN) comparison. */
+template<typename T>
+inline T fmax_nonan(T a, T b) {
+  if (a > b) return a;
+  return b;
+}
+
+/* Cosine switching function on [lo, hi].
+   Returns 1 for r at or below lo, 0 for r at or above hi and
+   0.5 * (1 + cos(pi * t)) with t = (r - lo) / (hi - lo) in between.
+   When del is non-NULL the derivative with respect to r is stored in
+   *del (0 outside the open interval). */
+template<typename flt_t>
+inline flt_t Sp(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  const flt_t frac = (r - lo) / (hi - lo);
+  if (frac <= 0) {
+    if (del) *del = 0;
+    return 1;
+  }
+  if (frac >= 1) {
+    if (del) *del = 0;
+    return 0;
+  }
+  const flt_t phase = frac * static_cast<flt_t>(M_PI);
+  if (del) {
+    const flt_t scale = static_cast<flt_t>(-0.5 * M_PI);
+    *del = scale * overloaded::sin(phase) / (hi - lo);
+  }
+  return static_cast<flt_t>(0.5) * (1 + overloaded::cos(phase));
+}
+
+/* Cubic switching function on [lo, hi].
+   Returns 1 for r at or below lo, 0 for r at or above hi and
+   1 - t^2 * (3 - 2 t) with t = (r - lo) / (hi - lo) in between.
+   When del is non-NULL the derivative with respect to r is stored in
+   *del (0 outside the open interval). */
+template<typename flt_t>
+inline flt_t Sp2(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  const flt_t frac = (r - lo) / (hi - lo);
+  if (frac <= 0) {
+    if (del) *del = 0;
+    return 1;
+  }
+  if (frac >= 1) {
+    if (del) *del = 0;
+    return 0;
+  }
+  if (del) *del = 6 * (frac * frac - frac) / (hi - lo);
+  return 1 - frac * frac * (3 - 2 * frac);
+}
+
+/* Evaluate a polynomial with n coefficients (coeffs[k] multiplies x^k) at
+   x using Horner's scheme.
+   n       number of coefficients, must be at least 1
+   coeffs  coefficient array of length n
+   x       evaluation point
+   deriv   receives the derivative of the polynomial at x
+   Returns the polynomial value. */
+template<typename flt_t>
+inline flt_t eval_poly_lin(int n, flt_t * coeffs, flt_t x, flt_t * deriv) {
+  flt_t result = coeffs[n - 1];
+  *deriv = coeffs[n - 1] * (n - 1);
+  // a constant polynomial needs no Horner step; without this guard the
+  // final step below would return coeffs[0] * (1 + x) for n == 1
+  if (n == 1) return result;
+  for (int i = n - 2; i > 0; i--) {
+    result = coeffs[i] + x * result;
+    *deriv = coeffs[i] * i + x * (*deriv);
+  }
+  result = coeffs[0] + x * result;
+  return result;
+}
+
+/* Angular spline g.
+   itype  0 selects the gCdom intervals (4 of them), anything else the
+          gHdom intervals (3 of them)
+   cos    argument, clamped to the selected domain before use
+   N      coordination value; for itype 0 it picks the gVal coefficient
+          set and, strictly between NCmin and NCmax, blends both sets with
+          the Sp() switching function
+   dgdc   receives the derivative with respect to cos
+   dgdN   receives the derivative with respect to N (0 outside the blend)
+   Returns the spline value. */
+template<typename flt_t, typename acc_t>
+inline flt_t gSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, flt_t cos, flt_t N, flt_t * dgdc, flt_t * dgdN) {
+  const flt_t NCmin = ka->params.NCmin;
+  const flt_t NCmax = ka->params.NCmax;
+  const bool first_type = (itype == 0);
+
+  // interval boundaries and start of the coefficient set inside gVal
+  const int nDom = first_type ? 4 : 3;
+  flt_t * const gDom = first_type ? &ka->params.gCdom[0]
+                                  : &ka->params.gHdom[0];
+  int offs = 8 * 6;
+  if (first_type) offs = (N > NCmin) ? 4 * 6 : 0;
+
+  cos = fmax_nonan(gDom[0], fmin_nonan(gDom[nDom], cos));
+
+  // the last interval containing cos wins
+  int index = 0;
+  for (int d = 0; d < nDom; d++) {
+    if (gDom[d] <= cos && cos <= gDom[d + 1]) index = d;
+  }
+
+  flt_t g = eval_poly_lin(6, &ka->params.gVal[offs+index*6], cos, dgdc);
+  *dgdN = 0;
+  if (first_type && N > NCmin && N < NCmax) {
+    flt_t dg1;
+    const flt_t g1 = eval_poly_lin(6, &ka->params.gVal[index*6], cos, &dg1);
+    flt_t dS;
+    const flt_t cut = Sp(N, NCmin, NCmax, &dS);
+    const flt_t gap = g1 - g;
+    *dgdN = dS * gap;
+    g = g + cut * gap;
+    *dgdc = *dgdc + cut * (dg1 - *dgdc);
+  }
+  return g;
+}
+
+/* Evaluate a bivariate polynomial with n x n coefficients, stored so that
+   coeffs[n * i + j] multiplies x^i * y^j: each row is evaluated in y with
+   eval_poly_lin() and the rows are combined in x with Horner's scheme.
+   deriv[0] receives the derivative with respect to x, deriv[1] the
+   derivative with respect to y.  The final Horner step is unconditional,
+   so n is expected to be at least 2.  Returns the polynomial value. */
+template<typename flt_t>
+inline flt_t eval_poly_bi(int n, flt_t * coeffs, flt_t x, flt_t y,
+			  flt_t * deriv) {
+  const int top = n - 1;
+  flt_t row_dy;
+  flt_t row = eval_poly_lin(n, &coeffs[n * top], y, &row_dy);
+  flt_t result = row;
+  deriv[0] = row * top;
+  deriv[1] = row_dy;
+  int k = top - 1;
+  while (k > 0) {
+    row = eval_poly_lin(n, &coeffs[n * k], y, &row_dy);
+    result = row + x * result;
+    deriv[0] = row * k + x * deriv[0];
+    deriv[1] = row_dy + x * deriv[1];
+    k--;
+  }
+  // lowest-order row: contributes to the value and to d/dy only
+  row = eval_poly_lin(n, &coeffs[0], y, &row_dy);
+  result = row + x * result;
+  deriv[1] = row_dy + x * deriv[1];
+  return result;
+}
+
+/* Evaluate a trivariate polynomial with n x n x n coefficients, stored so
+   that coeffs[n * n * i + n * j + k] multiplies x^i * y^j * z^k: each
+   slab is evaluated in (y, z) with eval_poly_bi() and the slabs are
+   combined in x with Horner's scheme.
+   deriv[0..2] receive the derivatives with respect to x, y and z.  The
+   final Horner step is unconditional, so n is expected to be at least 2.
+   Returns the polynomial value. */
+template<typename flt_t>
+inline flt_t eval_poly_tri(int n, flt_t * coeffs, flt_t x, flt_t y, flt_t z,
+			   flt_t * deriv) {
+  const int top = n - 1;
+  flt_t slab_d[2];
+  flt_t slab = eval_poly_bi(n, &coeffs[n * n * top], y, z, &slab_d[0]);
+  flt_t result = slab;
+  deriv[0] = slab * top;
+  deriv[1] = slab_d[0];
+  deriv[2] = slab_d[1];
+  int k = top - 1;
+  while (k > 0) {
+    slab = eval_poly_bi(n, &coeffs[n * n * k], y, z, &slab_d[0]);
+    result = slab + x * result;
+    deriv[0] = slab * k + x * deriv[0];
+    deriv[1] = slab_d[0] + x * deriv[1];
+    deriv[2] = slab_d[1] + x * deriv[2];
+    k--;
+  }
+  // lowest-order slab: contributes to the value and to d/dy, d/dz only
+  slab = eval_poly_bi(n, &coeffs[0], y, z, &slab_d[0]);
+  result = slab + x * result;
+  deriv[1] = slab_d[0] + x * deriv[1];
+  deriv[2] = slab_d[1] + x * deriv[2];
+  return result;
+}
+/* Pij spline.  Returns 0 (with dN zeroed) when itype is HYDROGEN.
+   Otherwise NC and NH are clamped to the pCCdom / pCHdom domain selected
+   by jtype; if both lie within TOL of an integer the tabulated value and
+   derivatives are used, else a bicubic polynomial is evaluated with
+   eval_poly_bi().  dN[0] and dN[1] receive the derivatives with respect
+   to NC and NH. */
+template<typename flt_t, typename acc_t>
+inline flt_t PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, 
+		       int jtype, flt_t NC, flt_t NH, flt_t * dN) {
+  dN[0] = 0.0;
+  dN[1] = 0.0;
+  if (itype == HYDROGEN) return 0;
+  flt_t *pCJdom = jtype == CARBON ? &ka->params.pCCdom[0][0] : 
+    &ka->params.pCHdom[0][0];  // flat view: [0],[1] bound NC and [2],[3] bound NH
+  NC = fmax_nonan(pCJdom[0], fmin_nonan(pCJdom[1], NC));
+  NH = fmax_nonan(pCJdom[2], fmin_nonan(pCJdom[3], NH));
+  int nC = floor(NC);
+  int nH = floor(NH);
+  #define PijSelect(a, b) (jtype == CARBON ? ka->params.a : ka->params.b)
+  if (fabs(NC - nC) < TOL && fabs(NH - nH) < TOL) {  // on a knot: use the tables
+    dN[0] = PijSelect(PCCdfdx, PCHdfdx)[nC][nH];
+    dN[1] = PijSelect(PCCdfdy, PCHdfdy)[nC][nH];
+    return PijSelect(PCCf, PCHf)[nC][nH];
+  }
+  if (NC == pCJdom[1]) nC -= 1;  // at the upper bound use the last cell
+  if (NH == pCJdom[3]) nH -= 1;
+  return eval_poly_bi(4, &PijSelect(pCC, pCH)[nC][nH][0], NC, NH, dN);
+  #undef PijSelect
+}
+
+/* Tij spline.  The three arguments are clamped to the Tijdom domain; if
+   all of them lie within TOL of an integer the tabulated value (Tf) and
+   derivatives (Tdfdx, Tdfdy, Tdfdz) are used, otherwise a tricubic
+   polynomial with coefficients Tijc is evaluated in acc_t precision.
+   dN3[0..2] receive the derivatives with respect to Nij, Nji, Nijconj. */
+template<typename flt_t, typename acc_t>
+inline flt_t TijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, flt_t Nij,
+    flt_t Nji, flt_t Nijconj, acc_t * dN3) {
+  // flat view of the domain: [0],[1] / [2],[3] / [4],[5] bound the arguments
+  flt_t * const dom = &ka->params.Tijdom[0][0];
+  Nij = fmax_nonan(dom[0], fmin_nonan(dom[1], Nij));
+  Nji = fmax_nonan(dom[2], fmin_nonan(dom[3], Nji));
+  Nijconj = fmax_nonan(dom[4], fmin_nonan(dom[5], Nijconj));
+
+  int cell_ij = floor(Nij);
+  int cell_ji = floor(Nji);
+  int cell_conj = floor(Nijconj);
+
+  const bool on_knot = fabs(Nij - cell_ij) < TOL && fabs(Nji - cell_ji) < TOL
+    && fabs(Nijconj - cell_conj) < TOL;
+  if (on_knot) {
+    dN3[0] = ka->params.Tdfdx[cell_ij][cell_ji][cell_conj];
+    dN3[1] = ka->params.Tdfdy[cell_ij][cell_ji][cell_conj];
+    dN3[2] = ka->params.Tdfdz[cell_ij][cell_ji][cell_conj];
+    return ka->params.Tf[cell_ij][cell_ji][cell_conj];
+  }
+
+  // at the upper bound use the last cell
+  if (Nij == dom[1]) cell_ij -= 1;
+  if (Nji == dom[3]) cell_ji -= 1;
+  if (Nijconj == dom[5]) cell_conj -= 1;
+  return eval_poly_tri<acc_t>(4,
+    &ka->params.Tijc[cell_ij][cell_ji][cell_conj][0], Nij, Nji, Nijconj,
+    dN3);
+}
+/* piRC spline.  The table set is chosen from itype + jtype (2 selects the
+   piHH*, 1 the piCH*, otherwise the piCC* tables).  The arguments are
+   clamped to the selected domain; if all lie within TOL of an integer the
+   tabulated value and derivatives are used, otherwise a tricubic
+   polynomial is evaluated in acc_t precision.  dN3[0..2] receive the
+   derivatives with respect to Nij, Nji, Nijconj. */
+template<typename flt_t, typename acc_t>
+inline flt_t piRCSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, 
+    int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, acc_t * dN3) {
+  const int HH = 2;
+  const int CH = 1;
+  /* const int CC = 0; */
+  int select = itype + jtype;
+  #define piRCSelect(a, b, c) (select == HH ? ka->params.a : select == CH ? \
+			       ka->params.b : ka->params.c)
+  flt_t * piIJdom = &piRCSelect(piHHdom, piCHdom, piCCdom)[0][0];
+  if (select == HH) {  // HH only: any out-of-domain argument resets all three to 0
+    if (Nij < piIJdom[0] || Nij > piIJdom[1] || Nji < piIJdom[2] || 
+	Nji > piIJdom[3] || Nijconj < piIJdom[4] || Nijconj > piIJdom[5]) {
+      Nij = 0;
+      Nji = 0;
+      Nijconj = 0;
+    }
+  }
+  Nij = fmax_nonan(piIJdom[0], fmin_nonan(piIJdom[1], Nij));
+  Nji = fmax_nonan(piIJdom[2], fmin_nonan(piIJdom[3], Nji));
+  Nijconj = fmax_nonan(piIJdom[4], fmin_nonan(piIJdom[5], Nijconj));
+  int nij = floor(Nij);
+  int nji = floor(Nji);
+  int nijconj = floor(Nijconj);
+  if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < 
+			  TOL && fabs(Nijconj - nijconj) < TOL) {  // on a knot: use the tables
+    dN3[0] = piRCSelect(piHHdfdx, piCHdfdx, piCCdfdx)[nij][nji][nijconj];
+    dN3[1] = piRCSelect(piHHdfdy, piCHdfdy, piCCdfdy)[nij][nji][nijconj];
+    dN3[2] = piRCSelect(piHHdfdz, piCHdfdz, piCCdfdz)[nij][nji][nijconj];
+    return piRCSelect(piHHf, piCHf, piCCf)[nij][nji][nijconj];
+  }
+  if (Nij == piIJdom[1]) nij -= 1;  // at the upper bound use the last cell
+  if (Nji == piIJdom[3]) nji -= 1;
+  if (Nijconj == piIJdom[5]) nijconj -= 1;
+  return eval_poly_tri<acc_t>(4, 
+    &piRCSelect(piHH, piCH, piCC)[nij][nji][nijconj][0], Nij, Nji, Nijconj, 
+    dN3);
+  #undef piRCSelect
+}
+
+/*
+ * Implements the p_ij term in airebo, which occurs on 4 different occasions
+ * in the original lammps code.
+ *
+ * ka            kernel arguments (coordinates, REBO neighbor list, type
+ *               map, nC/nH, parameters, force output)
+ * i, j          the bonded pair; neighbors k of i with k != j are visited
+ * rijx..rijz    bond vector, rijmag its length
+ * wij           value subtracted from nC[i]/nH[i] for the bond partner j
+ * VA            prefactor of the force contributions
+ * sum_N         output: sum over k of (1 - ktype) * wik * Sp(Nki, Nmin, Nmax)
+ * fij           in/out: force contributions along the i-j bond are added
+ *
+ * Two passes over the neighbors of i: pass 0 accumulates the sums and
+ * derives pij = 1 / sqrt(1 + sum_pij + PijSpline), pass 1 revisits the
+ * same neighbors and adds the forces to result_f[i], result_f[k] and fij.
+ * Returns pij.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_pij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j,
+    flt_t rijx, flt_t rijy, flt_t rijz, flt_t rijmag, flt_t wij, flt_t VA,
+    flt_t * sum_N, acc_t fij[3]) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  flt_t * nC = ka->nC;
+  flt_t * nH = ka->nH;
+  flt_t x_i = x[i].x;
+  flt_t y_i = x[i].y;
+  flt_t z_i = x[i].z;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  flt_t invrijm = 1 / rijmag;
+  flt_t Nmin = ka->params.Nmin;
+  flt_t Nmax = ka->params.Nmax;
+  flt_t Nij = nC[i] + nH[i] - wij;
+  flt_t NijC = nC[i] - wij * (1 - jtype);
+  flt_t NijH = nH[i] - wij * jtype;
+  flt_t sum_pij = 0;
+  flt_t sum_dpij_dN = 0;
+  flt_t dN2[2] = {0};
+  flt_t pij = 0;
+  *sum_N = 0;
+  int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int pass;
+  for (pass = 0; pass < 2; pass++) {
+    int kk;
+    int knum = ka->neigh_rebo.num[i];
+    for (kk = 0; kk < knum; kk++) {
+      int k = neighs[kk];
+      if (k == j) continue;
+      flt_t rikx = x_i - x[k].x;
+      flt_t riky = y_i - x[k].y;
+      flt_t rikz = z_i - x[k].z;
+      int ktype = map[x[k].w];
+      flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz);
+      flt_t rho_k = ka->params.rho[ktype][1];
+      flt_t rho_j = ka->params.rho[jtype][1];
+      flt_t lamdajik = 4 * itype * ((rho_k - rikmag) - (rho_j - rijmag));
+      // use the precision-dispatching wrapper like the rest of this file
+      flt_t ex_lam = overloaded::exp(lamdajik);
+      flt_t rcminik = ka->params.rcmin[itype][ktype];
+      flt_t rcmaxik = ka->params.rcmax[itype][ktype];
+      flt_t dwik;
+      flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik);
+      flt_t Nki = nC[k] + nH[k] - wik;
+      flt_t cosjik = (rijx * rikx + rijy * riky + rijz * rikz) /
+	(rijmag * rikmag);
+      cosjik = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cosjik));
+      flt_t dgdc, dgdN;
+      flt_t g = gSpline(ka, itype, cosjik, Nij, &dgdc, &dgdN);
+      if (pass == 0) {
+        // pass 0: accumulate the sums entering pij
+        sum_pij += wik * g * ex_lam;
+        sum_dpij_dN += wik * dgdN * ex_lam;
+        flt_t cutN = Sp<flt_t>(Nki, Nmin, Nmax, NULL);
+        *sum_N += (1 - ktype) * wik * cutN;
+      } else {
+        // pass 1: pij is known, accumulate the forces
+        flt_t tmp = -0.5 * pij * pij * pij;
+        flt_t invrikm = 1 / rikmag;
+        flt_t rjkx = rikx - rijx;
+        flt_t rjky = riky - rijy;
+        flt_t rjkz = rikz - rijz;
+        flt_t rjkmag = overloaded::sqrt(rjkx * rjkx + rjky * rjky +
+                                        rjkz * rjkz);
+        flt_t rijrik = 2 * rijmag * rikmag;
+        flt_t rr = rijmag * rijmag - rikmag * rikmag;
+        flt_t dctdjk = -2 / rijrik;
+        flt_t dctdik = (-rr + rjkmag * rjkmag) / (rijrik * rikmag * rikmag);
+        flt_t dctdij = (rr + rjkmag * rjkmag) / (rijrik * rijmag * rijmag);
+
+        acc_t fi[3], fj[3], fk[3];
+        flt_t pref = 0.5 * VA * tmp;
+        flt_t tmp20 = pref * wik * dgdc * ex_lam;
+        // fj stays zero: the contributions for j are collected in fij
+        fj[0] = fj[1] = fj[2] = 0;
+        fi[0] = -tmp20 * dctdik * rikx;
+        fi[1] = -tmp20 * dctdik * riky;
+        fi[2] = -tmp20 * dctdik * rikz;
+        fk[0] =  tmp20 * dctdik * rikx;
+        fk[1] =  tmp20 * dctdik * riky;
+        fk[2] =  tmp20 * dctdik * rikz;
+
+        fij[0] += -tmp20 * dctdij * rijx;
+        fij[1] += -tmp20 * dctdij * rijy;
+        fij[2] += -tmp20 * dctdij * rijz;
+
+        fi[0] += -tmp20 * dctdjk * rjkx;
+        fi[1] += -tmp20 * dctdjk * rjky;
+        fi[2] += -tmp20 * dctdjk * rjkz;
+        fk[0] +=  tmp20 * dctdjk * rjkx;
+        fk[1] +=  tmp20 * dctdjk * rjky;
+        fk[2] +=  tmp20 * dctdjk * rjkz;
+        fij[0] -= -tmp20 * dctdjk * rjkx;
+        fij[1] -= -tmp20 * dctdjk * rjky;
+        fij[2] -= -tmp20 * dctdjk * rjkz;
+
+        flt_t tmp21 = pref * (wik * g * ex_lam * 4 * itype);
+        fij[0] -= 1 * tmp21 * rijx * invrijm;
+        fij[1] -= 1 * tmp21 * rijy * invrijm;
+        fij[2] -= 1 * tmp21 * rijz * invrijm;
+        fi[0] -= tmp21 * (-rikx * invrikm);
+        fi[1] -= tmp21 * (-riky * invrikm);
+        fi[2] -= tmp21 * (-rikz * invrikm);
+        fk[0] -= tmp21 * (rikx * invrikm);
+        fk[1] -= tmp21 * (riky * invrikm);
+        fk[2] -= tmp21 * (rikz * invrikm);
+
+        // coordination forces
+
+        // dwik forces
+        flt_t tmp22 = pref * dwik * g * ex_lam * invrikm;
+        fi[0] -= tmp22 * rikx;
+        fi[1] -= tmp22 * riky;
+        fi[2] -= tmp22 * rikz;
+        fk[0] += tmp22 * rikx;
+        fk[1] += tmp22 * riky;
+        fk[2] += tmp22 * rikz;
+
+        // PIJ forces
+        flt_t tmp23 = pref * dN2[ktype] * dwik * invrikm;
+        fi[0] -= tmp23 * rikx;
+        fi[1] -= tmp23 * riky;
+        fi[2] -= tmp23 * rikz;
+        fk[0] += tmp23 * rikx;
+        fk[1] += tmp23 * riky;
+        fk[2] += tmp23 * rikz;
+
+        // dgdN forces
+        flt_t tmp24 = pref * sum_dpij_dN * dwik * invrikm;
+        fi[0] -= tmp24 * rikx;
+        fi[1] -= tmp24 * riky;
+        fi[2] -= tmp24 * rikz;
+        fk[0] += tmp24 * rikx;
+        fk[1] += tmp24 * riky;
+        fk[2] += tmp24 * rikz;
+
+        result_f[i].x += fi[0];
+        result_f[i].y += fi[1];
+        result_f[i].z += fi[2];
+        result_f[j].x += fj[0];
+        result_f[j].y += fj[1];
+        result_f[j].z += fj[2];
+        result_f[k].x += fk[0];
+        result_f[k].y += fk[1];
+        result_f[k].z += fk[2];
+      }
+    }
+    if (pass == 0) {
+      // dN2 filled here is consumed by the "PIJ forces" in pass 1
+      flt_t PijS = PijSpline(ka, itype, jtype, NijC, NijH, dN2);
+      pij = 1 / overloaded::sqrt(1 + sum_pij + PijS);
+    }
+  }
+  return pij;
+}
+
+/* flt_t front end for piRCSpline(): the spline writes its derivatives in
+   acc_t precision into a temporary, which is then copied to dN3[0..2].
+   Returns the spline value. */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_pi_rc(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) {
+  acc_t grad[3] = {0};
+  const flt_t value = piRCSpline(ka, itype, jtype, Nij, Nji, Nijconj, grad);
+  for (int c = 0; c < 3; c++) dN3[c] = grad[c];
+  return value;
+}
+
+/* flt_t front end for TijSpline().  Returns 0 with dN3 zeroed when either
+   type is HYDROGEN; otherwise the spline is evaluated with an acc_t
+   temporary for the derivatives, which is then copied to dN3[0..2]. */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_Tij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) {
+  dN3[0] = dN3[1] = dN3[2] = 0;
+  const bool has_hydrogen = (itype == HYDROGEN) || (jtype == HYDROGEN);
+  if (has_hydrogen) return 0;
+
+  acc_t grad[3] = {0};
+  const flt_t value = TijSpline(ka, Nij, Nji, Nijconj, grad);
+  for (int c = 0; c < 3; c++) dN3[c] = grad[c];
+  return value;
+}
+
+/*
+ * Implements a scalar version of the sum cos^1(omega) term used in pi^dh_ij.
+ * Occurs in both bondorder and bondorderLJ.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_sum_omega(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j,
+ flt_t r23x, flt_t r23y, flt_t r23z, flt_t r23mag, flt_t VA, acc_t fij[3]) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  acc_t sum_omega = 0;
+  int a2 = i;
+  int a3 = j;
+  flt_t r32x = - r23x;
+  flt_t r32y = - r23y;
+  flt_t r32z = - r23z;
+  int * map = ka->map;
+  AtomAIREBOT<flt_t> * x = ka->x;
+  flt_t thmin = ka->params.thmin;
+  flt_t thmax = ka->params.thmax;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  int * neighs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int * neighs_j = ka->neigh_rebo.entries + ka->neigh_rebo.offset[j];
+  int num_i = ka->neigh_rebo.num[i];
+  int num_j = ka->neigh_rebo.num[j];
+  int kk;
+  for (kk = 0; kk < num_i; kk++) {
+    int k = neighs_i[kk];
+    if (k == j) continue;
+    int a1 = k;
+    int ktype = map[x[k].w];
+    flt_t r21x = x[a2].x - x[a1].x;
+    flt_t r21y = x[a2].y - x[a1].y;
+    flt_t r21z = x[a2].z - x[a1].z;
+    flt_t r21mag = overloaded::sqrt(r21x * r21x + r21y * r21y + r21z * r21z);
+    flt_t cos321 = (r23x * r21x + r23y * r21y + r23z * r21z) / 
+      (r23mag * r21mag);
+    cos321 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos321));
+    flt_t sin321 = overloaded::sqrt(1 - cos321 * cos321);
+    if (sin321 == 0) continue;
+    flt_t sink2i = 1 / (sin321 * sin321);
+    flt_t rik2i = 1 / (r21mag * r21mag);
+    flt_t rr = r23mag * r23mag - r21mag * r21mag;
+    flt_t r31x = r21x - r23x;
+    flt_t r31y = r21y - r23y;
+    flt_t r31z = r21z - r23z;
+    flt_t r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z;
+    flt_t rijrik = 2 * r23mag * r21mag;
+    flt_t r21mag2 = r21mag * r21mag;
+    flt_t dctik = (-rr + r31mag2) / (rijrik * r21mag2);
+    flt_t dctij = (rr + r31mag2) / (rijrik * r23mag * r23mag);
+    flt_t dctjk = -2 / rijrik;
+    flt_t rcmin21  = ka->params.rcmin [itype][ktype];
+    flt_t rcmaxp21 = ka->params.rcmaxp[itype][ktype];
+    flt_t dw21;
+    flt_t w21 = Sp(r21mag, rcmin21, rcmaxp21, &dw21);
+    // why does this additional cutoff in the cosine exist?
+    // the original code by stuart answers this:
+    // it avoids issues when bonds in the dihedral are linear
+    // by switching the dihedral off beforehand.
+    // This is the reason for both the sin == 0 checks and the
+    // tspjik = Sp2(..) calls.
+    // Unfortunately, this is not exactly stated in the original paper.
+    // It might be similar in purpose to the H(sin - s^min) term that
+    // appears in that paper, but can not be found in original REBO papers.
+    flt_t dtsjik;
+    flt_t tspjik = Sp2(cos321, thmin, thmax, &dtsjik);
+    dtsjik = - dtsjik;
+    int ll;
+    for (ll = 0; ll < num_j; ll++) {
+      int l = neighs_j[ll];
+      if (l == i || l == k) continue;
+      int ltype = map[x[l].w];
+      int a4 = l;
+      flt_t r34x = x[a3].x - x[a4].x;
+      flt_t r34y = x[a3].y - x[a4].y;
+      flt_t r34z = x[a3].z - x[a4].z;
+      flt_t r34mag = overloaded::sqrt(r34x * r34x + r34y * r34y + r34z * r34z);
+      flt_t cos234 = (r32x * r34x + r32y * r34y + r32z * r34z) / 
+	(r23mag * r34mag);
+      cos234 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos234));
+      flt_t sin234 = overloaded::sqrt(1 - cos234 * cos234);
+      if (sin234 == 0) continue;
+      flt_t sinl2i = 1 / (sin234 * sin234);
+      flt_t rjl2i = 1 / (r34mag * r34mag);
+
+      flt_t rcminjl = ka->params.rcmin[jtype][ltype];
+      flt_t rcmaxpjl = ka->params.rcmaxp[jtype][ltype];
+      flt_t dw34;
+      flt_t w34 = Sp(r34mag, rcminjl, rcmaxpjl, &dw34);
+      flt_t rr = (r23mag * r23mag) - (r34mag * r34mag);
+      flt_t r24x = r23x + r34x;
+      flt_t r24y = r23y + r34y;
+      flt_t r24z = r23z + r34z;
+      flt_t r242 =
+          (r24x * r24x) + (r24y * r24y) + (r24z * r24z);
+      flt_t rijrjl = 2 * r23mag * r34mag;
+      flt_t rjl2 = r34mag * r34mag;
+      flt_t dctjl = (-rr + r242) / (rijrjl * rjl2);
+      flt_t dctji = (rr + r242) / (rijrjl * r23mag * r23mag);
+      flt_t dctil = -2 / rijrjl;
+      flt_t dtsijl;
+      flt_t tspijl = Sp2(cos234, thmin, thmax, &dtsijl);
+      dtsijl = -dtsijl; // need minus sign
+      flt_t prefactor = VA;
+
+      flt_t cross321x = (r32y * r21z) - (r32z * r21y);
+      flt_t cross321y = (r32z * r21x) - (r32x * r21z);
+      flt_t cross321z = (r32x * r21y) - (r32y * r21x);
+      flt_t cross234x = (r23y * r34z) - (r23z * r34y);
+      flt_t cross234y = (r23z * r34x) - (r23x * r34z);
+      flt_t cross234z = (r23x * r34y) - (r23y * r34x);
+
+      flt_t cwnum = (cross321x * cross234x) +
+              (cross321y * cross234y) +
+              (cross321z * cross234z);
+      flt_t cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
+      flt_t om1234 = cwnum / cwnom;
+      flt_t cw = om1234;
+      sum_omega += ((1 - (om1234 * om1234)) * w21 * w34) *
+              (1 - tspjik) * (1 - tspijl);
+      if (VA == static_cast<flt_t>(0.0)) continue;
+
+      flt_t dt1dik = (rik2i) - (dctik * sink2i * cos321);
+      flt_t dt1djk = (-dctjk * sink2i * cos321);
+      flt_t dt1djl = (rjl2i) - (dctjl * sinl2i * cos234);
+      flt_t dt1dil = (-dctil * sinl2i * cos234);
+      flt_t dt1dij = (2 / (r23mag * r23mag)) -
+               (dctij * sink2i * cos321) -
+               (dctji * sinl2i * cos234);
+
+      flt_t dt2dikx = (-r23z * cross234y) + (r23y * cross234z);
+      flt_t dt2diky = (-r23x * cross234z) + (r23z * cross234x);
+      flt_t dt2dikz = (-r23y * cross234x) + (r23x * cross234y);
+
+      flt_t dt2djlx = (-r23y * cross321z) + (r23z * cross321y);
+      flt_t dt2djly = (-r23z * cross321x) + (r23x * cross321z);
+      flt_t dt2djlz = (-r23x * cross321y) + (r23y * cross321x);
+
+      flt_t dt2dijx = (r21z * cross234y) - (r34z * cross321y) -
+      flt_t      (r21y * cross234z) + (r34y * cross321z);
+      flt_t dt2dijy = (r21x * cross234z) - (r34x * cross321z) -
+      flt_t      (r21z * cross234x) + (r34z * cross321x);
+      flt_t dt2dijz = (r21y * cross234x) - (r34y * cross321x) -
+      flt_t      (r21x * cross234y) + (r34x * cross321y);
+
+      flt_t aa = (prefactor * 2 * cw / cwnom) * w21 * w34 *
+           (1 - tspjik) * (1 - tspijl);
+      flt_t aaa1 = -prefactor * (1 - (om1234 * om1234)) *
+             (1 - tspjik) * (1 - tspijl);
+      flt_t aaa2 = -prefactor * (1 - (om1234 * om1234)) * w21 * w34;
+      flt_t at2 = aa * cwnum;
+
+      flt_t fcijpc = (-dt1dij * at2) +
+              (aaa2 * dtsjik * dctij * (1 - tspijl)) +
+              (aaa2 * dtsijl * dctji * (1 - tspjik));
+      flt_t fcikpc = (-dt1dik * at2) +
+              (aaa2 * dtsjik * dctik * (1 - tspijl));
+      flt_t fcjlpc = (-dt1djl * at2) +
+              (aaa2 * dtsijl * dctjl * (1 - tspjik));
+      flt_t fcjkpc = (-dt1djk * at2) +
+              (aaa2 * dtsjik * dctjk * (1 - tspijl));
+      flt_t fcilpc = (-dt1dil * at2) +
+              (aaa2 * dtsijl * dctil * (1 - tspjik));
+
+      flt_t F23x = (fcijpc * r23x) + (aa * dt2dijx);
+      flt_t F23y = (fcijpc * r23y) + (aa * dt2dijy);
+      flt_t F23z = (fcijpc * r23z) + (aa * dt2dijz);
+
+      flt_t F12x = (fcikpc * r21x) + (aa * dt2dikx);
+      flt_t F12y = (fcikpc * r21y) + (aa * dt2diky);
+      flt_t F12z = (fcikpc * r21z) + (aa * dt2dikz);
+
+      flt_t F34x = (fcjlpc * r34x) + (aa * dt2djlx);
+      flt_t F34y = (fcjlpc * r34y) + (aa * dt2djly);
+      flt_t F34z = (fcjlpc * r34z) + (aa * dt2djlz);
+
+      flt_t F31x = (fcjkpc * r31x);
+      flt_t F31y = (fcjkpc * r31y);
+      flt_t F31z = (fcjkpc * r31z);
+
+      flt_t F24x = (fcilpc * r24x);
+      flt_t F24y = (fcilpc * r24y);
+      flt_t F24z = (fcilpc * r24z);
+
+      flt_t f1x = -F12x - F31x;
+      flt_t f1y = -F12y - F31y;
+      flt_t f1z = -F12z - F31z;
+      flt_t f2x = F12x + F31x;
+      flt_t f2y = F12y + F31y;
+      flt_t f2z = F12z + F31z;
+      flt_t f3x = F34x + F24x;
+      flt_t f3y = F34y + F24y;
+      flt_t f3z = F34z + F24z;
+      flt_t f4x = -F34x - F24x;
+      flt_t f4y = -F34y - F24y;
+      flt_t f4z = -F34z - F24z;
+
+      fij[0] += F23x + F24x - F31x;
+      fij[1] += F23y + F24y - F31y;
+      fij[2] += F23z + F24z - F31z;
+
+      // coordination forces
+
+      flt_t tmp20 = VA * ((1 - (om1234 * om1234))) *
+             (1 - tspjik) * (1 - tspijl) * dw21 * w34 / r21mag;
+      f2x -= tmp20 * r21x;
+      f2y -= tmp20 * r21y;
+      f2z -= tmp20 * r21z;
+      f1x += tmp20 * r21x;
+      f1y += tmp20 * r21y;
+      f1z += tmp20 * r21z;
+
+      flt_t tmp21 = VA * ((1 - (om1234 * om1234))) *
+             (1 - tspjik) * (1 - tspijl) * w21 * dw34 / r34mag;
+      f3x -= tmp21 * r34x;
+      f3y -= tmp21 * r34y;
+      f3z -= tmp21 * r34z;
+      f4x += tmp21 * r34x;
+      f4y += tmp21 * r34y;
+      f4z += tmp21 * r34z;
+
+      result_f[a1].x += f1x;
+      result_f[a1].y += f1y;
+      result_f[a1].z += f1z;
+      result_f[a2].x += f2x;
+      result_f[a2].y += f2y;
+      result_f[a2].z += f2z;
+      result_f[a3].x += f3x;
+      result_f[a3].y += f3y;
+      result_f[a3].z += f3z;
+      result_f[a4].x += f4x;
+      result_f[a4].y += f4y;
+      result_f[a4].z += f4z;
+    }
+  }
+  return sum_omega;
+}
+
+/*
+ * Scalar implementation of the force update caused by a spline term that
+ * depends on the coordination of atom i.
+ * It is used for both pi^rc_ij and T_ij, and therefore occurs four times
+ * in each of bondorder and bondorderLJ.
+ *
+ * Every REBO neighbor k of i other than j exchanges a pair force with i
+ * that stems from the derivative of the switching weight w_ik. If k has
+ * type 0, the conjugation term adds to that pair force and, unless the
+ * derivative of Sp(N_ki) is within TOL of zero, it also acts between k and
+ * each of its own neighbors n != i.
+ */
+template<typename flt_t, typename acc_t>
+inline void frebo_N_spline_force(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, 
+    int j, flt_t VA, flt_t dN, flt_t dNconj, flt_t Nconj) {
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  ResultForceT<acc_t> * forces = ka->result_f;
+  int * type_of = ka->map;
+  flt_t * coord_c = ka->nC;
+  flt_t * coord_h = ka->nH;
+  const flt_t coord_lo = ka->params.Nmin;
+  const flt_t coord_hi = ka->params.Nmax;
+  const int type_i = type_of[atoms[i].w];
+  int * nbrs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  const int count_i = ka->neigh_rebo.num[i];
+
+  for (int slot_k = 0; slot_k < count_i; ++slot_k) {
+    const int k = nbrs_i[slot_k];
+    if (k == j) continue;
+
+    // Separation of i and k together with its switching weight.
+    const flt_t dx_ik = atoms[i].x - atoms[k].x;
+    const flt_t dy_ik = atoms[i].y - atoms[k].y;
+    const flt_t dz_ik = atoms[i].z - atoms[k].z;
+    const flt_t dist_ik =
+      overloaded::sqrt(dx_ik * dx_ik + dy_ik * dy_ik + dz_ik * dz_ik);
+    const int type_k = type_of[atoms[k].w];
+    const flt_t rc_lo_ik = ka->params.rcmin[type_i][type_k];
+    const flt_t rc_hi_ik = ka->params.rcmax[type_i][type_k];
+    flt_t dw_ik;
+    const flt_t w_ik = Sp(dist_ik, rc_lo_ik, rc_hi_ik, &dw_ik);
+
+    // Coordination of k without the share of i, passed through Sp.
+    const flt_t n_ki = coord_c[k] + coord_h[k] - w_ik;
+    flt_t dsp_n_ki;
+    const flt_t sp_n_ki = Sp(n_ki, coord_lo, coord_hi, &dsp_n_ki);
+
+    const flt_t scale_n = VA * dN * dw_ik / dist_ik;
+    const flt_t scale_conj =
+      VA * dNconj * 2 * Nconj * dw_ik * sp_n_ki / dist_ik;
+    flt_t scale_ik = scale_n;
+    if (type_k == 0) scale_ik += scale_conj;
+
+    const flt_t fx_ik = scale_ik * dx_ik;
+    const flt_t fy_ik = scale_ik * dy_ik;
+    const flt_t fz_ik = scale_ik * dz_ik;
+    forces[i].x -= fx_ik;
+    forces[i].y -= fy_ik;
+    forces[i].z -= fz_ik;
+    forces[k].x += fx_ik;
+    forces[k].y += fy_ik;
+    forces[k].z += fz_ik;
+
+    // The second shell only matters for type-0 atoms k whose Sp(N_ki)
+    // actually varies.
+    if (type_k != 0 || fabs(dsp_n_ki) <= TOL) continue;
+
+    int * nbrs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k];
+    const int count_k = ka->neigh_rebo.num[k];
+    for (int slot_n = 0; slot_n < count_k; ++slot_n) {
+      const int n = nbrs_k[slot_n];
+      if (n == i) continue;
+      const flt_t dx_kn = atoms[k].x - atoms[n].x;
+      const flt_t dy_kn = atoms[k].y - atoms[n].y;
+      const flt_t dz_kn = atoms[k].z - atoms[n].z;
+      const flt_t dist_kn =
+        overloaded::sqrt(dx_kn * dx_kn + dy_kn * dy_kn + dz_kn * dz_kn);
+      const int type_n = type_of[atoms[n].w];
+      const flt_t rc_lo_kn = ka->params.rcmin[type_k][type_n];
+      const flt_t rc_hi_kn = ka->params.rcmax[type_k][type_n];
+      // Only the derivative of the k-n switching weight is needed here.
+      flt_t dw_kn;
+      Sp(dist_kn, rc_lo_kn, rc_hi_kn, &dw_kn);
+      const flt_t scale_kn =
+        VA * dNconj * 2 * Nconj * w_ik * dsp_n_ki * dw_kn / dist_kn;
+      forces[k].x -= scale_kn * dx_kn;
+      forces[k].y -= scale_kn * dy_kn;
+      forces[k].z -= scale_kn * dz_kn;
+      forces[n].x += scale_kn * dx_kn;
+      forces[n].y += scale_kn * dy_kn;
+      forces[n].z += scale_kn * dz_kn;
+    }
+  }
+}
+
+/*
+ * This data-structure contains the result of a search through neighbor-lists.
+ * It is used to calculate C_ij and the corresponding force updates.
+ *
+ * A path is a chain of num atoms (stored in idx) joined by num - 1
+ * segments; the per-segment arrays are indexed by the segment number, i.e.
+ * entry s describes the pair idx[s], idx[s+1].
+ * Entries beyond the active length are not written by the search routine.
+ */
+template<typename flt_t>
+struct LennardJonesPathAIREBOT {
+  AtomAIREBOT<flt_t> del[3]; // displacement x[idx[s+1]] - x[idx[s]] (x, y, z)
+  int num;                   // number of atoms in the path
+  flt_t w[3];                // switching weight Sp of each segment
+  flt_t dw[3];               // derivative reported by Sp for each segment
+  flt_t r[3];                // length of each segment
+  int idx[4];                // atom indices along the path
+};
+
+/*
+ * Evaluates the candidate path formed by the first num atom indices in idxs.
+ * The weight of a path is the product of the switching weights Sp of its
+ * consecutive pairs. If that product is larger than best, the candidate is
+ * copied into *path and its weight is returned. Otherwise *path is left
+ * untouched and best is returned.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t ref_lennard_jones_test_path_single(
+ KernelArgsAIREBOT<flt_t,acc_t> * ka, flt_t best, int num, int * idxs, 
+ LennardJonesPathAIREBOT<flt_t> * path) {
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  int * type_of = ka->map;
+  LennardJonesPathAIREBOT<flt_t> candidate;
+  candidate.num = num;
+  flt_t weight = 1;
+  // Walk the segments from the last to the first and give up as soon as the
+  // candidate can no longer beat the current best.
+  for (int seg = num - 2; seg >= 0; --seg) {
+    const int from = idxs[seg];
+    const int to = idxs[seg + 1];
+    const flt_t dx = atoms[to].x - atoms[from].x;
+    const flt_t dy = atoms[to].y - atoms[from].y;
+    const flt_t dz = atoms[to].z - atoms[from].z;
+    const flt_t dist_sq = dx * dx + dy * dy + dz * dz;
+    const int type_from = type_of[atoms[from].w];
+    const int type_to = type_of[atoms[to].w];
+    if (dist_sq >= ka->params.rcmaxsq[type_from][type_to]) return best;
+    const flt_t dist = overloaded::sqrt(dist_sq);
+    flt_t seg_dw;
+    const flt_t seg_w = Sp<flt_t>(dist, ka->params.rcmin[type_from][type_to], 
+                                  ka->params.rcmax[type_from][type_to], 
+                                  &seg_dw);
+    if (seg_w == 0) return best;
+    weight *= seg_w;
+    if (weight <= best) return best;
+    candidate.idx[seg] = from;
+    candidate.del[seg].x = dx;
+    candidate.del[seg].y = dy;
+    candidate.del[seg].z = dz;
+    candidate.r[seg] = dist;
+    candidate.w[seg] = seg_w;
+    candidate.dw[seg] = seg_dw;
+  }
+  // The loop stores only the first atom of each segment; add the final one.
+  candidate.idx[num - 1] = idxs[num - 1];
+  *path = candidate;
+  return weight;
+}
+
+/*
+ * Tests all paths surrounding i and j to find the one with the largest
+ * weight: the direct pair i-j, every i-k-j and every i-k-m-j, with k taken
+ * from the REBO neighbors of i and m from those of k. Uses the same
+ * iteration ordering as FLJ() does. The winning path is left in *path.
+ * Returns 1 - (best weight), or 0 as soon as a path of weight 1 is found.
+ * Note that an optimization would use the j neighlist instead in the inner
+ * loop.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t ref_lennard_jones_test_path(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    int i, int j, flt_t rij, flt_t rcmax, 
+    LennardJonesPathAIREBOT<flt_t> * path) {
+  const flt_t saturated = static_cast<flt_t>(1.0);
+  // chain[0] is always i and the last used slot is always j.
+  int chain[4] = {i, j, j, j};
+  flt_t best = 0;
+
+  // Direct connection i - j.
+  if (rij <= rcmax) {
+    best = ref_lennard_jones_test_path_single(ka, best, 2, chain, path);
+    if (best == saturated) return 0;
+  }
+
+  const int * nbrs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  const int count_i = ka->neigh_rebo.num[i];
+  for (int slot_k = 0; slot_k < count_i; ++slot_k) {
+    const int k = nbrs_i[slot_k];
+    if (k == j) continue;
+
+    // One intermediate atom: i - k - j.
+    chain[1] = k;
+    chain[2] = j;
+    best = ref_lennard_jones_test_path_single(ka, best, 3, chain, path);
+    if (best == saturated) return 0;
+
+    // Two intermediate atoms: i - k - m - j.
+    const int * nbrs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k];
+    const int count_k = ka->neigh_rebo.num[k];
+    for (int slot_m = 0; slot_m < count_k; ++slot_m) {
+      const int m = nbrs_k[slot_m];
+      if (m == i || m == j) continue;
+      chain[2] = m;
+      best = ref_lennard_jones_test_path_single(ka, best, 4, chain, path);
+      if (best == saturated) return 0;
+    }
+  }
+  return 1 - best;
+}
+
+/*
+ * Conducts the force update due to C_ij, given the active path.
+ *
+ * ka   supplies the force array (ka->result_f) that is updated.
+ * dC   prefactor applied to every segment force.
+ * path active path; only its first path->num - 1 segments are read.
+ *
+ * For segment s the force magnitude is dC * dw[s] / r[s] multiplied by the
+ * weights w of all other segments. It is applied along del[s] with opposite
+ * signs to the two atoms that bound the segment.
+ */
+template<typename flt_t, typename acc_t>
+inline void ref_lennard_jones_force_path(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    flt_t dC, LennardJonesPathAIREBOT<flt_t> * path) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  const int num_segments = path->num - 1;
+  for (int i = 0; i < num_segments; i++) {
+    flt_t fpair = dC * path->dw[i] / path->r[i];
+    // Product rule: every segment except i contributes its plain weight.
+    for (int j = 0; j < num_segments; j++) {
+      if (i != j) fpair *= path->w[j];
+    }
+    result_f[path->idx[i+0]].x -= fpair * path->del[i].x;
+    result_f[path->idx[i+0]].y -= fpair * path->del[i].y;
+    result_f[path->idx[i+0]].z -= fpair * path->del[i].z;
+    result_f[path->idx[i+1]].x += fpair * path->del[i].x;
+    result_f[path->idx[i+1]].y += fpair * path->del[i].y;
+    result_f[path->idx[i+1]].z += fpair * path->del[i].z;
+  }
+}
+
+/*
+ * Calculate the bondorderLJ term.
+ *
+ * The bond order b_ij is evaluated for the pair i, j with the i-j
+ * displacement rescaled to length rcmin[itype][jtype] (see `scale`), and
+ * is then passed through Sp2 between bLJmin and bLJmax.
+ *
+ * ka   kernel arguments (positions, neighbor lists, parameters, forces).
+ * i, j the pair of atoms.
+ * VA   prefactor for the force contributions.
+ * fij  output; asserted to be zero on entry. It is written only when the
+ *      derivative of the Sp2 switch is non-zero.
+ *
+ * Returns the switched bond order Stb.
+ *
+ * The helpers are first called with a zero prefactor to obtain values only;
+ * if dStb != 0 they are called again with VA * dStb to apply forces.
+ *
+ * NOTE(review): result_f, dwij, pij_reverse, pji_reverse and
+ * sum_omega_reverse are not read in this function.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t ref_lennard_jones_bondorder(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    int i, int j, flt_t VA, acc_t fij[3]) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+
+  flt_t delx = x[i].x - x[j].x;
+  flt_t dely = x[i].y - x[j].y;
+  flt_t delz = x[i].z - x[j].z;
+  flt_t rsq = delx * delx + dely * dely + delz * delz;
+  flt_t rij = overloaded::sqrt(rsq);
+
+  flt_t rcminij = ka->params.rcmin[itype][jtype];
+  flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+  flt_t dwij;
+  flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij);
+
+  flt_t the_r = ka->params.rcmin[itype][jtype]; // length used in place of rij
+  flt_t scale = the_r / rij; // rescales del so that its length is the_r
+  flt_t Nij = ka->nH[i] + ka->nC[i] - wij;
+  flt_t Nji = ka->nH[j] + ka->nC[j] - wij;
+  flt_t NconjtmpI;
+  acc_t fijc[3] = {0}, fjic[3] = {0};
+  // First pass: prefactor 0.0, only the returned values are of interest.
+  flt_t pij = frebo_pij<flt_t,acc_t>(ka, i, j, delx * scale, dely * scale, 
+    delz * scale, the_r, wij, 0.0, &NconjtmpI, fijc);
+  flt_t NconjtmpJ;
+  flt_t pji = frebo_pij<flt_t,acc_t>(ka, j, i, -delx * scale, -dely * scale, 
+    -delz * scale, the_r, wij, 0.0, &NconjtmpJ, fjic);
+  flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ);
+  flt_t dN3_pi_rc[3];
+  flt_t pi_rc = frebo_pi_rc<flt_t,acc_t>(ka, itype, jtype, Nij, Nji, Nijconj, 
+    dN3_pi_rc);
+  flt_t dN3_Tij[3];
+  flt_t Tij = frebo_Tij<flt_t,acc_t>(ka, itype, jtype, Nij, Nji, Nijconj, 
+    dN3_Tij);
+  flt_t sum_omega = 0;
+  if (fabs(Tij) > TOL) { // the dihedral sum is skipped when Tij is negligible
+    sum_omega = frebo_sum_omega<flt_t,acc_t>(ka, i, j, delx * scale, dely * 
+                                             scale, delz * scale, the_r, 0.0, 
+                                             fijc);
+  }
+  flt_t pi_dh = Tij * sum_omega;
+  flt_t bij = 0.5 * (pij + pji) + pi_rc + pi_dh;
+  flt_t dStb;
+  flt_t Stb = Sp2<flt_t>(bij, ka->params.bLJmin[itype][jtype], 
+    ka->params.bLJmax[itype][jtype], &dStb);
+  if (dStb != 0) {
+    // Second pass: same helper calls, now with prefactor VA * dStb.
+    flt_t pij_reverse = frebo_pij<flt_t,acc_t>(ka, i, j, delx * scale, 
+      dely * scale, delz * scale, the_r, wij, VA * dStb, &NconjtmpI, fijc);
+    flt_t pji_reverse = frebo_pij<flt_t,acc_t>(ka, j, i, -delx * scale, 
+      -dely * scale, -delz * scale, the_r, wij, VA * dStb, &NconjtmpJ, fjic);
+    fijc[0] -= fjic[0];
+    fijc[1] -= fjic[1];
+    fijc[2] -= fjic[2];
+    frebo_N_spline_force<flt_t,acc_t>(ka, i, j, VA * dStb, dN3_pi_rc[0], 
+      dN3_pi_rc[2], NconjtmpI);
+    frebo_N_spline_force<flt_t,acc_t>(ka, j, i, VA * dStb, dN3_pi_rc[1], 
+      dN3_pi_rc[2], NconjtmpJ);
+    if (fabs(Tij) > TOL) {
+      flt_t sum_omega_reverse = frebo_sum_omega<flt_t,acc_t>(ka, i, j, 
+        delx * scale, dely * scale, delz * scale, the_r, VA * dStb * Tij, fijc);
+      frebo_N_spline_force(ka, i, j, VA * dStb * sum_omega, dN3_Tij[0], 
+        dN3_Tij[2], NconjtmpI);
+      frebo_N_spline_force(ka, j, i, VA * dStb * sum_omega, dN3_Tij[1], 
+        dN3_Tij[2], NconjtmpJ);
+    }
+    assert(fij[0] == 0);
+    assert(fij[1] == 0);
+    assert(fij[2] == 0);
+    // fij = scale * (fijc - del * (del . fijc) / rsq): the component of fijc
+    // along del is removed and the remainder is multiplied by scale.
+    fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * 
+                                 fijc[1] + delz * delx * fijc[2]) / rsq);
+    fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * 
+                                 fijc[1] + delz * dely * fijc[2]) / rsq);
+    fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * 
+                                 fijc[1] + delz * delz * fijc[2]) / rsq);
+  }
+  return Stb;
+}
+
+/*
+ * Scalar reference implementation of neighbor routine.
+ *
+ * For every atom i in [neigh_from_atom, neigh_to_atom) the entries of
+ * neigh_lmp whose squared distance to i is below rcmaxsq are copied into
+ * neigh_rebo. The output is packed contiguously, starting at
+ * neigh_from_atom * num_neighs_per_atom. Alongside, the switching weights
+ * of the kept neighbors are summed into nC[i] for neighbors of type CARBON
+ * and into nH[i] for all others.
+ */
+template<typename flt_t, typename acc_t>
+void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  int write_base = ka->neigh_from_atom * ka->num_neighs_per_atom;
+  for (int i = ka->neigh_from_atom; i < ka->neigh_to_atom; ++i) {
+    ka->neigh_rebo.offset[i] = write_base;
+    const int type_i = ka->map[atoms[i].w];
+    ka->nC[i] = 0;
+    ka->nH[i] = 0;
+
+    const int lmp_count = ka->neigh_lmp.num[i];
+    const int lmp_first = ka->neigh_lmp.offset[i];
+    int kept = 0;
+    for (int slot = 0; slot < lmp_count; ++slot) {
+      const int cand = ka->neigh_lmp.entries[lmp_first + slot];
+      const flt_t dx = atoms[i].x - atoms[cand].x;
+      const flt_t dy = atoms[i].y - atoms[cand].y;
+      const flt_t dz = atoms[i].z - atoms[cand].z;
+      const flt_t dist_sq = dx * dx + dy * dy + dz * dz;
+      const int type_c = ka->map[atoms[cand].w];
+      if (!(dist_sq < ka->params.rcmaxsq[type_i][type_c])) continue;
+
+      ka->neigh_rebo.entries[write_base + kept] = cand;
+      ++kept;
+
+      const flt_t rc_lo = ka->params.rcmin[type_i][type_c];
+      const flt_t rc_hi = ka->params.rcmax[type_i][type_c];
+      const flt_t switched =
+        Sp<flt_t>(overloaded::sqrt(dist_sq), rc_lo, rc_hi, NULL);
+      if (type_c == CARBON) {
+        ka->nC[i] += switched;
+      } else {
+        ka->nH[i] += switched;
+      }
+    }
+    ka->neigh_rebo.num[i] = kept;
+    write_base += kept;
+  }
+}
+
+template<typename flt_t, typename acc_t> /*
+ * Scalar reference evaluation of the torsion term for the bond i-j.
+ *
+ * Visits every dihedral k-i-j-l where k is a REBO neighbor of i (k != j)
+ * and l is a REBO neighbor of j (l != i, l != k). For each one the torsion
+ * energy, weighted by the three switching weights and the two angular
+ * cutoffs, is added to ka->result_eng, and the matching forces are added
+ * to ka->result_f for the atoms i, j, k and l.
+ *
+ * Both i and j are asserted to be of type 0. Dihedrals for which the sine
+ * of either bond angle is below TOL are skipped.
+ *
+ * Naming: the digits 1, 2, 3, 4 stand for the atoms k, i, j, l.
+ *
+ * NOTE(review): xtmp, ytmp and ztmp are not read in this function.
+ */
+void ref_torsion_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+                                    int j) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * f = ka->result_f;
+  flt_t (*rcmin)[2] = ka->params.rcmin;
+  flt_t (*rcmax)[2] = ka->params.rcmax;
+  flt_t (*epsilonT)[2] = ka->params.epsilonT;
+  flt_t thmin = ka->params.thmin;
+  flt_t thmax = ka->params.thmax;
+  int itype = map[x[i].w];
+  flt_t xtmp = x[i].x;
+  flt_t ytmp = x[i].y;
+  flt_t ztmp = x[i].z;
+  int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
+  int jnum = ka->neigh_rebo.num[i];
+  int jtype = map[x[j].w];
+
+  flt_t del32x = x[j].x-x[i].x; // del32 = x[j] - x[i]
+  flt_t del32y = x[j].y-x[i].y;
+  flt_t del32z = x[j].z-x[i].z;
+  flt_t rsq = del32x*del32x + del32y*del32y + del32z*del32z;
+  flt_t r32 = overloaded::sqrt(rsq);
+  flt_t del23x = -del32x; // del23 = x[i] - x[j]
+  flt_t del23y = -del32y;
+  flt_t del23z = -del32z;
+  flt_t r23 = r32;
+  flt_t dw23, w23 = Sp<flt_t>(r23,rcmin[itype][jtype],rcmax[itype][jtype],
+    &dw23);
+
+  assert(itype == 0);
+  assert(jtype == 0);
+
+  for (int kk = 0; kk < jnum; kk++) {
+    int k = REBO_neighs_i[kk];
+    int ktype = map[x[k].w];
+    if (k == j) continue;
+    flt_t del21x = x[i].x-x[k].x; // del21 = x[i] - x[k]
+    flt_t del21y = x[i].y-x[k].y;
+    flt_t del21z = x[i].z-x[k].z;
+    flt_t rsq = del21x*del21x + del21y*del21y + del21z*del21z;
+    flt_t r21 = overloaded::sqrt(rsq);
+    flt_t cos321 = - ((del21x*del32x) + (del21y*del32y) +
+                (del21z*del32z)) / (r21*r32);
+    cos321 = fmin(cos321,1); // clamp to [-1, 1] before taking the root
+    cos321 = fmax(cos321,-1);
+    flt_t sin321 = overloaded::sqrt(1 - cos321*cos321);
+    if (sin321 < TOL) continue; // sin321 is a divisor further down
+
+    flt_t deljkx = del21x-del23x; // deljk = x[j] - x[k]
+    flt_t deljky = del21y-del23y;
+    flt_t deljkz = del21z-del23z;
+    flt_t rjk2 = deljkx*deljkx + deljky*deljky + deljkz*deljkz;
+    flt_t rjk = overloaded::sqrt(rjk2);
+    flt_t rik2 = r21*r21;
+    flt_t dw21, w21 = Sp<flt_t>(r21,rcmin[itype][ktype],rcmax[itype][ktype],
+      &dw21);
+
+    flt_t rij = r32;
+    flt_t rik = r21;
+    flt_t rij2 = r32*r32;
+    // Cosine of the angle at i from the three side lengths (law of cosines).
+    flt_t costmp = static_cast<flt_t>(0.5)*(rij2+rik2-rjk2)/rij/rik;
+    flt_t dtsjik, tspjik = Sp2<flt_t>(costmp,thmin,thmax,&dtsjik);
+    dtsjik = -dtsjik;
+
+    int * REBO_neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]];
+    int lnum = ka->neigh_rebo.num[j];
+    for (int ll = 0; ll < lnum; ll++) {
+      int l = REBO_neighs_j[ll];
+      int ltype = map[x[l].w];
+      if (l == i || l == k) continue;
+      flt_t del34x = x[j].x-x[l].x; // del34 = x[j] - x[l]
+      flt_t del34y = x[j].y-x[l].y;
+      flt_t del34z = x[j].z-x[l].z;
+      flt_t rsq = del34x*del34x + del34y*del34y + del34z*del34z;
+      flt_t r34 = overloaded::sqrt(rsq);
+      flt_t cos234 = (del32x*del34x + del32y*del34y +
+                del32z*del34z) / (r32*r34);
+      cos234 = fmin(cos234,1); // clamp to [-1, 1] before taking the root
+      cos234 = fmax(cos234,-1);
+      flt_t sin234 = overloaded::sqrt(1 - cos234*cos234);
+      if (sin234 < TOL) continue; // sin234 is a divisor further down
+      flt_t dw34, w34 = Sp<flt_t>(r34,rcmin[jtype][ltype],rcmax[jtype][ltype],
+        &dw34);
+      flt_t delilx = del23x + del34x; // delil = x[i] - x[l]
+      flt_t delily = del23y + del34y;
+      flt_t delilz = del23z + del34z;
+      flt_t ril2 = delilx*delilx + delily*delily + delilz*delilz;
+      flt_t ril = overloaded::sqrt(ril2);
+      flt_t rjl2 = r34*r34;
+
+      flt_t rjl = r34;
+      // Cosine of the angle at j from the three side lengths.
+      flt_t costmp = static_cast<flt_t>(0.5)*(rij2+rjl2-ril2)/rij/rjl;
+      flt_t dtsijl, tspijl = Sp2<flt_t>(costmp,thmin,thmax,&dtsijl);
+      dtsijl = -dtsijl; //need minus sign
+      flt_t cross321x = (del32y*del21z)-(del32z*del21y); // del32 x del21
+      flt_t cross321y = (del32z*del21x)-(del32x*del21z);
+      flt_t cross321z = (del32x*del21y)-(del32y*del21x);
+      flt_t cross321mag = overloaded::sqrt(cross321x*cross321x+
+                         cross321y*cross321y + cross321z*cross321z);
+      flt_t cross234x = (del23y*del34z)-(del23z*del34y); // del23 x del34
+      flt_t cross234y = (del23z*del34x)-(del23x*del34z);
+      flt_t cross234z = (del23x*del34y)-(del23y*del34x);
+      flt_t cross234mag = overloaded::sqrt(cross234x*cross234x+
+                         cross234y*cross234y + cross234z*cross234z);
+      // cw = (cross321 . cross234) / (r21*r34*r32^2*sin321*sin234)
+      flt_t cwnum = (cross321x*cross234x) +
+        (cross321y*cross234y)+(cross321z*cross234z);
+      flt_t cwnom = r21*r34*r32*r32*sin321*sin234;
+      flt_t cw = cwnum/cwnom;
+
+      // Vtors = Ec * cw2^5 - ekijl / 10 with cw2 = (1 - cw) / 2
+      flt_t cw2 = (static_cast<flt_t>(.5)*(1-cw));
+      flt_t ekijl = epsilonT[ktype][ltype];
+      flt_t Ec = 256*ekijl/405;
+      flt_t Vtors = (Ec*(overloaded::pow(cw2,5)))-(ekijl/10);
+
+      ka->result_eng += Vtors*w21*w23*w34*(1-tspjik)*(1-tspijl);
+
+      flt_t dndijx = (cross234y*del21z)-(cross234z*del21y);
+      flt_t dndijy = (cross234z*del21x)-(cross234x*del21z);
+      flt_t dndijz = (cross234x*del21y)-(cross234y*del21x);
+
+      flt_t tmpvecx = (del34y*cross321z)-(del34z*cross321y);
+      flt_t tmpvecy = (del34z*cross321x)-(del34x*cross321z);
+      flt_t tmpvecz = (del34x*cross321y)-(del34y*cross321x);
+
+      dndijx = dndijx+tmpvecx;
+      dndijy = dndijy+tmpvecy;
+      dndijz = dndijz+tmpvecz;
+
+      flt_t dndikx = (del23y*cross234z)-(del23z*cross234y);
+      flt_t dndiky = (del23z*cross234x)-(del23x*cross234z);
+      flt_t dndikz = (del23x*cross234y)-(del23y*cross234x);
+
+      flt_t dndjlx = (cross321y*del23z)-(cross321z*del23y);
+      flt_t dndjly = (cross321z*del23x)-(cross321x*del23z);
+      flt_t dndjlz = (cross321x*del23y)-(cross321y*del23x);
+
+      flt_t dcidij = ((r23*r23)-(r21*r21)+(rjk*rjk))/(2*r23*r23*r21);
+      flt_t dcidik = ((r21*r21)-(r23*r23)+(rjk*rjk))/(2*r23*r21*r21);
+      flt_t dcidjk = (-rjk)/(r23*r21);
+      flt_t dcjdji = ((r23*r23)-(r34*r34)+(ril*ril))/(2*r23*r23*r34);
+      flt_t dcjdjl = ((r34*r34)-(r23*r23)+(ril*ril))/(2*r23*r34*r34);
+      flt_t dcjdil = (-ril)/(r23*r34);
+
+      flt_t dsidij = (-cos321/sin321)*dcidij;
+      flt_t dsidik = (-cos321/sin321)*dcidik;
+      flt_t dsidjk = (-cos321/sin321)*dcidjk;
+
+      flt_t dsjdji = (-cos234/sin234)*dcjdji;
+      flt_t dsjdjl = (-cos234/sin234)*dcjdjl;
+      flt_t dsjdil = (-cos234/sin234)*dcjdil;
+
+      flt_t dxidij = (r21*sin321)+(r23*r21*dsidij);
+      flt_t dxidik = (r23*sin321)+(r23*r21*dsidik);
+      flt_t dxidjk = (r23*r21*dsidjk);
+
+      flt_t dxjdji = (r34*sin234)+(r23*r34*dsjdji);
+      flt_t dxjdjl = (r23*sin234)+(r23*r34*dsjdjl);
+      flt_t dxjdil = (r23*r34*dsjdil);
+
+      flt_t ddndij = (dxidij*cross234mag)+(cross321mag*dxjdji);
+      flt_t ddndik = dxidik*cross234mag;
+      flt_t ddndjk = dxidjk*cross234mag;
+      flt_t ddndjl = cross321mag*dxjdjl;
+      flt_t ddndil = cross321mag*dxjdil;
+      flt_t dcwddn = -cwnum/(cwnom*cwnom); // d(cw)/d(cwnom)
+      flt_t dcwdn = 1/cwnom; // d(cw)/d(cwnum)
+      flt_t dvpdcw = (-1)*Ec*static_cast<flt_t>(-0.5)*5*overloaded::pow(cw2,4)*
+                      w23*w21*w34*(1-tspjik)*(1-tspijl);
+
+      // Contribution attributed to the i-j pair.
+      flt_t Ftmpx = dvpdcw*((dcwdn*dndijx)+(dcwddn*ddndij*del23x/r23));
+      flt_t Ftmpy = dvpdcw*((dcwdn*dndijy)+(dcwddn*ddndij*del23y/r23));
+      flt_t Ftmpz = dvpdcw*((dcwdn*dndijz)+(dcwddn*ddndij*del23z/r23));
+      flt_t fix = Ftmpx;
+      flt_t fiy = Ftmpy;
+      flt_t fiz = Ftmpz;
+      flt_t fjx = -Ftmpx;
+      flt_t fjy = -Ftmpy;
+      flt_t fjz = -Ftmpz;
+
+      // Contribution attributed to the i-k pair.
+      Ftmpx = dvpdcw*((dcwdn*dndikx)+(dcwddn*ddndik*del21x/r21));
+      Ftmpy = dvpdcw*((dcwdn*dndiky)+(dcwddn*ddndik*del21y/r21));
+      Ftmpz = dvpdcw*((dcwdn*dndikz)+(dcwddn*ddndik*del21z/r21));
+      fix += Ftmpx;
+      fiy += Ftmpy;
+      fiz += Ftmpz;
+      flt_t fkx = -Ftmpx;
+      flt_t fky = -Ftmpy;
+      flt_t fkz = -Ftmpz;
+
+      // Contribution attributed to the j-k pair.
+      Ftmpx = (dvpdcw*dcwddn*ddndjk*deljkx)/rjk;
+      Ftmpy = (dvpdcw*dcwddn*ddndjk*deljky)/rjk;
+      Ftmpz = (dvpdcw*dcwddn*ddndjk*deljkz)/rjk;
+      fjx += Ftmpx;
+      fjy += Ftmpy;
+      fjz += Ftmpz;
+      fkx -= Ftmpx;
+      fky -= Ftmpy;
+      fkz -= Ftmpz;
+
+      // Contribution attributed to the j-l pair.
+      Ftmpx = dvpdcw*((dcwdn*dndjlx)+(dcwddn*ddndjl*del34x/r34));
+      Ftmpy = dvpdcw*((dcwdn*dndjly)+(dcwddn*ddndjl*del34y/r34));
+      Ftmpz = dvpdcw*((dcwdn*dndjlz)+(dcwddn*ddndjl*del34z/r34));
+      fjx += Ftmpx;
+      fjy += Ftmpy;
+      fjz += Ftmpz;
+      flt_t flx = -Ftmpx;
+      flt_t fly = -Ftmpy;
+      flt_t flz = -Ftmpz;
+
+      // Contribution attributed to the i-l pair.
+      Ftmpx = (dvpdcw*dcwddn*ddndil*delilx)/ril;
+      Ftmpy = (dvpdcw*dcwddn*ddndil*delily)/ril;
+      Ftmpz = (dvpdcw*dcwddn*ddndil*delilz)/ril;
+      fix += Ftmpx;
+      fiy += Ftmpy;
+      fiz += Ftmpz;
+      flx -= Ftmpx;
+      fly -= Ftmpy;
+      flz -= Ftmpz;
+
+      // coordination forces
+
+      flt_t fpair = Vtors*dw21*w23*w34*(1-tspjik)*(1-tspijl) / r21;
+      fix -= del21x*fpair;
+      fiy -= del21y*fpair;
+      fiz -= del21z*fpair;
+      fkx += del21x*fpair;
+      fky += del21y*fpair;
+      fkz += del21z*fpair;
+
+      fpair = Vtors*w21*dw23*w34*(1-tspjik)*(1-tspijl) / r23;
+      fix -= del23x*fpair;
+      fiy -= del23y*fpair;
+      fiz -= del23z*fpair;
+      fjx += del23x*fpair;
+      fjy += del23y*fpair;
+      fjz += del23z*fpair;
+
+      fpair = Vtors*w21*w23*dw34*(1-tspjik)*(1-tspijl) / r34;
+      fjx -= del34x*fpair;
+      fjy -= del34y*fpair;
+      fjz -= del34z*fpair;
+      flx += del34x*fpair;
+      fly += del34y*fpair;
+      flz += del34z*fpair;
+
+      // additional cut off function forces
+
+      flt_t fcpc = -Vtors*w21*w23*w34*dtsjik*(1-tspijl); // via tspjik
+      fpair = fcpc*dcidij/rij;
+      fix += fpair*del23x;
+      fiy += fpair*del23y;
+      fiz += fpair*del23z;
+      fjx -= fpair*del23x;
+      fjy -= fpair*del23y;
+      fjz -= fpair*del23z;
+
+      fpair = fcpc*dcidik/rik;
+      fix += fpair*del21x;
+      fiy += fpair*del21y;
+      fiz += fpair*del21z;
+      fkx -= fpair*del21x;
+      fky -= fpair*del21y;
+      fkz -= fpair*del21z;
+
+      fpair = fcpc*dcidjk/rjk;
+      fjx += fpair*deljkx;
+      fjy += fpair*deljky;
+      fjz += fpair*deljkz;
+      fkx -= fpair*deljkx;
+      fky -= fpair*deljky;
+      fkz -= fpair*deljkz;
+
+      fcpc = -Vtors*w21*w23*w34*(1-tspjik)*dtsijl; // via tspijl
+      fpair = fcpc*dcjdji/rij;
+      fix += fpair*del23x;
+      fiy += fpair*del23y;
+      fiz += fpair*del23z;
+      fjx -= fpair*del23x;
+      fjy -= fpair*del23y;
+      fjz -= fpair*del23z;
+
+      fpair = fcpc*dcjdjl/rjl;
+      fjx += fpair*del34x;
+      fjy += fpair*del34y;
+      fjz += fpair*del34z;
+      flx -= fpair*del34x;
+      fly -= fpair*del34y;
+      flz -= fpair*del34z;
+
+      fpair = fcpc*dcjdil/ril;
+      fix += fpair*delilx;
+      fiy += fpair*delily;
+      fiz += fpair*delilz;
+      flx -= fpair*delilx;
+      fly -= fpair*delily;
+      flz -= fpair*delilz;
+
+      // sum per-atom forces into atom force array
+
+      f[i].x += fix; f[i].y += fiy; f[i].z += fiz;
+      f[j].x += fjx; f[j].y += fjy; f[j].z += fjz;
+      f[k].x += fkx; f[k].y += fky; f[k].z += fkz;
+      f[l].x += flx; f[l].y += fly; f[l].z += flz;
+    }
+  }
+}
+
+/*
+ * Scalar reference driver for the torsion term.
+ *
+ * Walks the atoms in [frebo_from_atom, frebo_to_atom) and, for each atom i
+ * of type 0, its REBO neighbors j of type 0. A tie-break on the atom tags
+ * (and on the coordinates when the tags are equal) decides whether the
+ * pair is handled from the side of i; if so the pair is handed to
+ * ref_torsion_single_interaction.
+ */
+template<typename flt_t, typename acc_t>
+void ref_torsion(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  int * type_of = ka->map;
+  int * tags = ka->tag;
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; ++i) {
+    if (type_of[atoms[i].w] != 0) continue;
+    const int tag_i = tags[i];
+    const flt_t xi = atoms[i].x;
+    const flt_t yi = atoms[i].y;
+    const flt_t zi = atoms[i].z;
+    int * nbrs = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
+    const int count = ka->neigh_rebo.num[i];
+    for (int slot = 0; slot < count; ++slot) {
+      const int j = nbrs[slot];
+      const int tag_j = tags[j];
+
+      // Decide whether this side of the pair has to skip the interaction.
+      bool skip;
+      if (tag_i > tag_j) {
+        skip = ((tag_i + tag_j) & 1) == 0;
+      } else if (tag_i < tag_j) {
+        skip = ((tag_i + tag_j) & 1) == 1;
+      } else {
+        // Equal tags: compare the coordinates, z first, then y, then x.
+        skip = atoms[j].z < zi ||
+               (atoms[j].z == zi &&
+                (atoms[j].y < yi ||
+                 (atoms[j].y == yi && atoms[j].x < xi)));
+      }
+      if (skip) continue;
+
+      if (type_of[atoms[j].w] != 0) continue;
+      ref_torsion_single_interaction(ka, i, j);
+    }
+  }
+}
+
+/*
+ * Calculate single REBO interaction.
+ * Corresponds to FREBO method. Note that the bondorder() function is
+ * inlined.
+ *
+ * ka   kernel arguments (positions, neighbor lists, parameters, results).
+ * i, j the pair of atoms.
+ *
+ * Returns immediately when the switching weight w_ij is at most TOL.
+ * Otherwise the repulsive term VR and the attractive term VA are built
+ * from the pair parameters, the bond order
+ *   b_ij = 0.5 * (p_ij + p_ji) + pi_rc + T_ij * sum_omega
+ * is assembled from the helper routines (which also apply the many-body
+ * forces), and the pair force is added to result_f[i] and subtracted from
+ * result_f[j]. The energy VR + b_ij * VA is added to ka->result_eng and
+ * split evenly between result_f[i].w and result_f[j].w.
+ */
+template<typename flt_t, typename acc_t>
+void ref_frebo_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, 
+    int j) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  flt_t delx = x[i].x - x[j].x;
+  flt_t dely = x[i].y - x[j].y;
+  flt_t delz = x[i].z - x[j].z;
+  flt_t rsq = delx * delx + dely * dely + delz * delz;
+  flt_t rij = overloaded::sqrt(rsq);
+  flt_t rcminij = ka->params.rcmin[itype][jtype];
+  flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+  flt_t dwij;
+  flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij);
+  if (wij <= TOL) return;
+
+  flt_t Qij = ka->params.Q[itype][jtype];
+  flt_t Aij = ka->params.A[itype][jtype];
+  flt_t alphaij = ka->params.alpha[itype][jtype];
+
+  // Repulsive term: VR = wij * (1 + Q/r) * A * exp(-alpha * r).
+  // overloaded::exp is used, as for the attractive term below, so that the
+  // call is resolved the same way for every flt_t.
+  flt_t exp_alphar = overloaded::exp(-alphaij * rij);
+  flt_t VR_by_wij = (1.0 + (Qij / rij)) * Aij * exp_alphar;
+  flt_t VR = wij * VR_by_wij;
+  flt_t pre = wij * Aij * exp_alphar;
+  flt_t dVRdi = pre * ((-alphaij) - (Qij / rsq) - (Qij * alphaij / rij));
+  dVRdi += VR_by_wij * dwij;
+
+  // Attractive term: VA = wij * sum_k -B_k * exp(-Beta_k * r).
+  flt_t VA_by_wij = 0, dVA = 0;
+  for (int k = 0; k < 3; k++) {
+    flt_t BIJc = ka->params.BIJc[itype][jtype][k];
+    flt_t Betaij = ka->params.Beta[itype][jtype][k];
+    flt_t term = -BIJc * overloaded::exp(-Betaij * rij);
+    VA_by_wij += term;
+    dVA += -Betaij * wij * term;
+  }
+  dVA += VA_by_wij * dwij;
+  flt_t VA = VA_by_wij * wij;
+
+  // Bond order; the helpers accumulate their force contributions as they go.
+  acc_t fij[3] = {0};
+  flt_t Nij = ka->nH[i] + ka->nC[i] - wij;
+  flt_t Nji = ka->nH[j] + ka->nC[j] - wij;
+  flt_t NconjtmpI;
+  flt_t pij = frebo_pij(ka, i, j, delx, dely, delz, rij, wij, VA, &NconjtmpI, 
+    fij);
+  flt_t NconjtmpJ;
+  acc_t fji[3] = {0};
+  flt_t pji = frebo_pij(ka, j, i, -delx, -dely, -delz, rij, wij, VA, 
+    &NconjtmpJ, fji);
+  fij[0] -= fji[0]; fij[1] -= fji[1]; fij[2] -= fji[2];
+  flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ);
+  flt_t dN3[3];
+  flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+  frebo_N_spline_force(ka, i, j, VA, dN3[0], dN3[2], NconjtmpI);
+  frebo_N_spline_force(ka, j, i, VA, dN3[1], dN3[2], NconjtmpJ);
+  // dN3 is reused: from here on it holds the derivatives of T_ij.
+  flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+  flt_t sum_omega = 0.0;
+  if (fabs(Tij) > TOL) {
+    sum_omega = frebo_sum_omega(ka, i, j, delx, dely, delz, rij, VA * Tij, fij);
+    frebo_N_spline_force(ka, i, j, VA * sum_omega, dN3[0], dN3[2], NconjtmpI);
+    frebo_N_spline_force(ka, j, i, VA * sum_omega, dN3[1], dN3[2], NconjtmpJ);
+  }
+  flt_t pi_dh = Tij * sum_omega;
+  flt_t bij = static_cast<flt_t>(0.5) * (pij + pji) + pi_rc + pi_dh;
+  flt_t dVAdi = bij * dVA;
+  flt_t fpair = -(dVRdi + dVAdi) / rij;
+
+  result_f[i].x += fpair * delx + fij[0];
+  result_f[i].y += fpair * dely + fij[1];
+  result_f[i].z += fpair * delz + fij[2];
+  result_f[j].x -= fpair * delx + fij[0];
+  result_f[j].y -= fpair * dely + fij[1];
+  result_f[j].z -= fpair * delz + fij[2];
+
+  flt_t evdwl = VR + bij * VA;
+  ka->result_eng += evdwl;
+  result_f[i].w += 0.5 * evdwl;
+  result_f[j].w += 0.5 * evdwl;
+}
+
+
+template<typename flt_t, typename acc_t>
+inline void ref_frebo_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i) {
+  // Runs the REBO pair kernel for atom i against its REBO neighbor list.
+  // Pairs are filtered by tag parity (ties broken on coordinates),
+  // presumably so that each pair is evaluated from one side only.
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  const int * tags = ka->tag;
+  const int tag_i = tags[i];
+  const flt_t xi = atoms[i].x, yi = atoms[i].y, zi = atoms[i].z;
+  const int * list = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  const int count = ka->neigh_rebo.num[i];
+  for (int idx = 0; idx < count; ++idx) {
+    const int j = list[idx];
+    const int tag_j = tags[j];
+    const bool odd_sum = ((tag_i + tag_j) & 1) != 0;
+    bool skip;
+    if (tag_i > tag_j) {
+      skip = !odd_sum;   // the higher tag keeps only odd tag sums
+    } else if (tag_i < tag_j) {
+      skip = odd_sum;    // the lower tag keeps only even tag sums
+    } else {
+      // Equal tags: skip j when it sorts below i in lexicographic
+      // (z, y, x) order.
+      const flt_t xj = atoms[j].x, yj = atoms[j].y, zj = atoms[j].z;
+      skip = (zj < zi) || (zj == zi && yj < yi) ||
+             (zj == zi && yj == yi && xj < xi);
+    }
+    if (skip) continue;
+    ref_frebo_single_interaction(ka, i, j);
+  }
+}
+
+
+template<typename flt_t, typename acc_t>
+void ref_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torflag) {
+  // REBO pass over atoms [frebo_from_atom, frebo_to_atom); torsion is opt-in.
+  for (int atom = ka->frebo_from_atom; atom < ka->frebo_to_atom; ++atom)
+    ref_frebo_single_atom(ka, atom);
+  if (torflag != 0) ref_torsion(ka);
+}
+
+template<typename flt_t, typename acc_t>
+void ref_lennard_jones_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    int i, int j, int morseflag) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  // Scalar reference kernel for the switched LJ (or Morse) term of pair (i, j).
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  // del* points from atom j to atom i.
+  flt_t delx = x[i].x - x[j].x;
+  flt_t dely = x[i].y - x[j].y;
+  flt_t delz = x[i].z - x[j].z;
+  flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+  if (rsq >= ka->params.cutljsq[itype][jtype]) { return; }
+  flt_t rij = overloaded::sqrt(rsq);
+  // cij stays 1 unless rij < cut3rebo; a zero cij ends the interaction early.
+  LennardJonesPathAIREBOT<flt_t> testpath;
+  flt_t cij = 1.0;
+  if (rij < ka->params.cut3rebo) {
+    #pragma noinline
+    cij = ref_lennard_jones_test_path<flt_t,acc_t>(ka, i, j, rij, 
+      ka->params.rcmax[itype][jtype], &testpath);
+  }
+  if (cij == 0) {
+    return;
+  }
+  // slw / dslw: Sp2() evaluated between sigmin * sigma and sigcut * sigma.
+  flt_t sigcut = ka->params.sigcut;
+  flt_t sigmin = ka->params.sigmin;
+  flt_t sigma = ka->params.sigma[itype][jtype];
+  flt_t rljmax = sigcut * sigma;
+  flt_t rljmin = sigmin * sigma;
+
+  flt_t dslw, slw = Sp2(rij, rljmin, rljmax, &dslw);
+  // NOTE(review): lj1..lj4 are combined differently in the two branches below;
+  flt_t vdw, dvdw;  // presumably the setup code fills them per morseflag - confirm
+  if (morseflag) {
+    const flt_t exr = exp(-rij * ka->params.lj4[itype][jtype]);
+    vdw = ka->params.lj1[itype][jtype] * exr * 
+      (ka->params.lj2[itype][jtype]*exr - 2);
+    dvdw = ka->params.lj3[itype][jtype] * exr * 
+      (1 - ka->params.lj2[itype][jtype]*exr);
+  } else {
+    flt_t r2inv = 1 / rsq;
+    flt_t r6inv = r2inv * r2inv * r2inv;
+
+    vdw = r6inv * (ka->params.lj3[itype][jtype]*r6inv - 
+		   ka->params.lj4[itype][jtype]);
+    dvdw = -r6inv * (ka->params.lj1[itype][jtype]*r6inv - 
+		     ka->params.lj2[itype][jtype]) / rij;
+  }
+  // Product rule for the switched potential VLJ = vdw * slw.
+  flt_t VLJ = vdw * slw;
+  flt_t dVLJ = dvdw * slw + vdw * dslw;
+  // Str / dStr: Sp2() between rcLJmin and rcLJmax; Stb is only computed if Str > 0.
+  flt_t dStr, Str = Sp2<flt_t>(rij, ka->params.rcLJmin[itype][jtype], 
+    ka->params.rcLJmax[itype][jtype], &dStr);
+  flt_t VA = Str * cij * VLJ;
+  flt_t Stb = 0;
+  acc_t fij[3] = {0};
+  if (Str > 0) {
+    #pragma noinline
+    Stb = ref_lennard_jones_bondorder(ka, i, j, VA, fij);
+  }
+  flt_t fpair = -(dStr * (Stb * cij * VLJ - cij * VLJ) +  // r-derivative of evdwl with
+                   dVLJ * (Str * Stb * cij + cij - Str * cij)) / rij;  // cij, Stb held fixed
+  flt_t evdwl = VA * Stb + (1 - Str) * cij * VLJ;  // = cij * VLJ * (Str * Stb + 1 - Str)
+  result_f[i].x += fpair * delx + fij[0];
+  result_f[i].y += fpair * dely + fij[1];
+  result_f[i].z += fpair * delz + fij[2];
+  result_f[j].x -= fpair * delx + fij[0];
+  result_f[j].y -= fpair * dely + fij[1];
+  result_f[j].z -= fpair * delz + fij[2];
+  ka->result_eng += evdwl;  // only the total is updated here; result_f[].w is not touched
+  // A partially switched path (cij < 1) gets its own force pass with evdwl / cij.
+  if (cij < 1) {
+    #pragma noinline
+    ref_lennard_jones_force_path(ka, Str * Stb * VLJ + (1 - Str) * VLJ, 
+      &testpath);
+  }
+}
+
+template<typename flt_t, typename acc_t>
+void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+                                   int morseflag) {
+  // Reference driver for one atom: runs the LJ / Morse pair kernel for atom i
+  // against the first num_half[i] entries of its neigh_lmp list. Every
+  // listed pair is evaluated (no per-pair filtering happens at this level),
+  // and morseflag is forwarded unchanged.
+  int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
+  int jnum = ka->neigh_lmp.num_half[i];
+  for (int jj = 0; jj < jnum; jj++) {
+    int j = neighs[jj];
+    ref_lennard_jones_single_interaction(ka, i, j, morseflag);
+  }
+}
+
+template<typename flt_t, typename acc_t>  // scalar reference driver for the LJ / Morse term
+void ref_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag) {
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {  // same atom window as ref_frebo
+    #pragma noinline
+    ref_lennard_jones_single_atom(ka, i, morseflag);  // morseflag is forwarded unchanged
+  }
+}
+
+/* ----------------------------------------------------------------------
+    Vectorized AIREBO implementation, standalone, using caching to reduce
+    memory access.
+   ---------------------------------------------------------------------- */
+
+template<typename flt_t, typename acc_t>
+struct aut_wrap {
+
+typedef typename intr_types<flt_t, acc_t>::fvec fvec;  // vector of flt_t lanes (positions, distances, ...)
+typedef typename intr_types<flt_t, acc_t>::avec avec;  // presumably the acc_t counterpart of fvec - confirm
+typedef typename intr_types<flt_t, acc_t>::ivec ivec;  // vector of int lanes (atom indices, type codes)
+typedef typename intr_types<flt_t, acc_t>::bvec bvec;  // per-lane boolean mask
+
+VEC_INLINE inline
+static void aut_loadatoms_vec(
+    AtomAIREBOT<flt_t> * atoms, ivec j_vec,
+    fvec *x, fvec * y, fvec * z, bvec * type_mask, int * map, ivec map_i, 
+    ivec c_1
+) {
+  // Gathers x, y, z and the packed w field of the atoms listed in j_vec; the
+  // index is scaled by 4 because each record is read as four flt_t slots.
+  fvec packed_w;
+  ivec slot = ivec::mullo(ivec::set1(4), j_vec);
+  fvec::gather_4_adjacent(slot, &atoms[0].x, sizeof(flt_t), x, y, z, &packed_w);
+  // Shift map_i by the raw type, keep the bits selected by c_1, test != 0.
+  ivec raw_type = fvec::unpackloepi32(packed_w);
+  ivec bit = ivec::the_and(c_1, ivec::srlv(map_i, raw_type));
+  *type_mask = ivec::cmpneq(bit, ivec::setzero());
+}
+
+VEC_INLINE inline
+static void aut_loadatoms_vec_notype(
+    AtomAIREBOT<flt_t> * atoms, ivec j_vec,
+    fvec *x, fvec * y, fvec * z
+) {
+  // Position-only gather: same index scaling by 4, but w is not fetched.
+  ivec slot = ivec::mullo(ivec::set1(4), j_vec);
+  fvec::gather_3_adjacent(slot, &atoms[0].x, sizeof(flt_t), x, y, z);
+}
+
+static fvec aut_Sp2_deriv(fvec r, fvec lo, fvec hi, fvec * d) {
+  // Vector switching function: 1 for r <= lo, 0 for r >= hi, and the cubic
+  // 1 - t^2 (3 - 2 t), t = (r - lo) / (hi - lo), in between. The derivative
+  // with respect to r is written to *d.
+  fvec one = fvec::set1(1);
+  bvec below = fvec::cmple(r, lo);
+  bvec above = fvec::cmpnlt(r, hi);  // not-less-than, i.e. >=
+  bvec inside = bvec::kandn(below, ~ above);
+  fvec value = fvec::mask_blend(above, one, fvec::setzero());
+  fvec slope = fvec::setzero();
+  if (bvec::test_any_set(inside)) {
+    fvec two = fvec::set1(2);
+    fvec three = fvec::set1(3);
+    fvec six = fvec::set1(6);
+    fvec inv_w = fvec::recip(hi - lo);
+    fvec t = (r - lo) * inv_w;
+    fvec cubic = one - t * t * (three - two * t);
+    value = fvec::mask_blend(inside, value, cubic);
+    slope = fvec::mask_blend(inside, slope, six * inv_w * (t * t - t));
+  }
+  *d = slope;
+  return value;
+}
+
+static fvec aut_Sp_deriv(fvec r, fvec lo, fvec hi, fvec * d) {
+  // Cosine switch: 1 for r <= lo, 0 for r >= hi, else 0.5 * (1 + cos(pi t))
+  // with t = (r - lo) / (hi - lo); *d receives the derivative w.r.t. r.
+  fvec one = fvec::set1(1);
+  fvec pi = fvec::set1(M_PI);
+  bvec below = fvec::cmple(r, lo);
+  bvec above = fvec::cmpnlt(r, hi);  // not-less-than, i.e. >=
+  bvec inside = bvec::kandn(below, ~ above);
+  fvec value = fvec::mask_blend(above, one, fvec::setzero());
+  fvec slope = fvec::setzero();
+  if (bvec::test_any_set(inside)) {
+    fvec half = fvec::set1(0.5);
+    fvec neg_half = fvec::set1(-0.5);
+    fvec width = hi - lo;
+    fvec inv_w = fvec::mask_recip(one, inside, width);
+    fvec t = (r - lo) / width;
+    fvec cs;
+    fvec sn = fvec::mask_sincos(&cs, fvec::setzero(), one, inside,
+                                pi * t);
+    value = fvec::mask_blend(inside, value, half * (one + cs));
+    slope = fvec::mask_blend(inside, slope, pi * neg_half * inv_w * sn);
+  }
+  *d = slope;
+  return value;
+}
+
+static fvec aut_mask_Sp(bvec mask, fvec r, fvec lo, fvec hi) {
+  // Value-only cosine switch; the comparisons are restricted to mask.
+  fvec one = fvec::set1(1);
+  bvec below = fvec::mask_cmple(mask, r, lo);
+  bvec above = fvec::mask_cmpnlt(mask, r, hi);  // not-less-than, i.e. >=
+  bvec inside = bvec::kandn(below, bvec::kandn(above, mask));
+  fvec value = fvec::mask_blend(above, one, fvec::setzero());
+  if (bvec::test_any_set(inside)) {
+    fvec half = fvec::set1(0.5);
+    fvec pi = fvec::set1(M_PI);
+    fvec inv_w = fvec::mask_recip(one, inside, hi - lo);
+    fvec t = (r - lo) * inv_w;
+    fvec cs = fvec::mask_cos(one, inside, pi * t);
+    value = fvec::mask_blend(inside, value, half * (one + cs));
+  }
+  return value;
+}
+
+static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  // For every atom i in [neigh_from_atom, neigh_to_atom): (re)build its REBO
+  // neighbor list and accumulate the coordination sums nC[i] and nH[i].
+  int offset = ka->neigh_from_atom * ka->num_neighs_per_atom;
+  // Bit t of map_i is set when ka->map[t] is nonzero (t >= 1).
+  int map_i = 0;
+  int i;
+  for (i = 1; i < ka->num_types; i++) {
+    if (ka->map[i])
+      map_i |= (1 << i);
+  }
+  ivec c_i1 = ivec::set1(1);
+  ivec c_im = ivec::set1(map_i);
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+
+  for (i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) {
+
+    fvec x_i = fvec::set1(x[i].x);
+    fvec y_i = fvec::set1(x[i].y);
+    fvec z_i = fvec::set1(x[i].z);
+    int itype = ka->map[ka->x[i].w];
+    // Cutoffs against a neighbor of mapped type 0 / 1; *skin* adds ka->skin.
+    fvec rcmaxsq0 = fvec::set1(ka->params.rcmaxsq[itype][0]);
+    fvec rcmaxsq1 = fvec::set1(ka->params.rcmaxsq[itype][1]);
+    fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+    fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+    fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+    fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+    fvec rcmaxskinsq0 = fvec::set1(
+        (ka->params.rcmax[itype][0] + ka->skin) * (ka->params.rcmax[itype][0] +
+                                                   ka->skin));
+    fvec rcmaxskinsq1 = fvec::set1(
+        (ka->params.rcmax[itype][1] + ka->skin) * (ka->params.rcmax[itype][1] +
+                                                   ka->skin));
+    fvec nC = fvec::setzero();
+    fvec nH = fvec::setzero();
+
+    ka->neigh_rebo.offset[i] = offset;
+    // Rebuild scans neigh_lmp; otherwise rescan the skin list kept at +jnum.
+    int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] : 
+      ka->neigh_rebo.num_half[i];
+    int * neighs = ka->rebuild_flag ? 
+      &ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] : 
+      &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum];
+    int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom];
+    int n = 0;
+    int n_skin = 0;
+    // Candidates are processed fvec::VL at a time; j_mask trims the last chunk.
+    int lowest_idx;
+    #pragma unroll(4)
+    for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) {
+      bvec j_mask = bvec::full();
+      if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx);
+
+      int * _noalias neighs_l = neighs + lowest_idx;
+      fvec x_j, y_j, z_j;
+      bvec jtype_mask;
+      ivec ji = ivec::maskz_loadu(j_mask, neighs_l);
+      aut_loadatoms_vec(x, ji,
+          &x_j, &y_j, &z_j, &jtype_mask, ka->map, c_im, c_i1);
+      fvec delx = x_i -  x_j;
+      fvec dely = y_i -  y_j;
+      fvec delz = z_i -  z_j;
+      fvec rsq = delx *  delx +  dely *  dely +  delz *  delz;
+      if (ka->rebuild_flag) {
+        fvec rcmaxskinsq = fvec::mask_blend(jtype_mask, rcmaxskinsq0, 
+                                            rcmaxskinsq1);
+        bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxskinsq);
+        ivec::mask_compressstore(c_mask, &skin_target[n_skin], ji);
+        n_skin += bvec::popcnt(c_mask);
+      }
+      fvec rcmaxsq = fvec::mask_blend(jtype_mask, rcmaxsq0, rcmaxsq1);
+      bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxsq);
+      if (bvec::test_all_unset(c_mask)) continue;
+      ivec::mask_compressstore(c_mask, &ka->neigh_rebo.entries[offset + n], ji);
+      n += bvec::popcnt(c_mask);
+      fvec rcmax = fvec::mask_blend(jtype_mask, rcmax0, rcmax1);
+      fvec rcmin = fvec::mask_blend(jtype_mask, rcmin0, rcmin1);
+      fvec sp = aut_mask_Sp(c_mask, fvec::sqrt(rsq), rcmin, rcmax);
+      nC = fvec::mask_add(nC, bvec::kandn(jtype_mask, c_mask), nC, sp);
+      nH = fvec::mask_add(nH, bvec::kand (jtype_mask, c_mask), nH, sp);
+    }
+    ka->neigh_rebo.num[i] = n;
+    if (ka->rebuild_flag) {
+      for (int s = 0; s < n_skin; s++) {
+        ka->neigh_rebo.entries[offset+n_skin+s] = skin_target[s];
+      }
+      assert(n <= n_skin);
+      offset += 2 * n_skin;
+      ka->neigh_rebo.num_half[i] = n_skin;
+    } else {
+      assert(n <= jnum);
+      offset += 2 * jnum;
+    }
+    ka->nC[i] = fvec::reduce_add(nC);
+    ka->nH[i] = fvec::reduce_add(nH);
+  }
+}
+
+
+static fvec aut_eval_poly_lin_pd_2(int n, flt_t * vals, ivec idx, fvec x, 
+                                   fvec * deriv) {
+  // Per-lane sum_p c_p x^p (c_p gathered via idx from vals + p) and d/dx.
+  fvec one = fvec::set1(1);
+  fvec power = one;                   // x^p
+  fvec prev_power = fvec::setzero();  // x^(p-1); zero while p = 0
+  fvec sum = fvec::setzero();
+  fvec degree = fvec::setzero();      // p as a floating-point vector
+  *deriv = fvec::setzero();
+  for (int p = 0; p < n; p++) {
+    fvec c = fvec::gather(idx, vals + p, sizeof(flt_t));
+    sum = sum + c * power;
+    *deriv = *deriv + c * prev_power * degree;
+    prev_power = power;
+    power = power * x;
+    degree = degree + one;
+  }
+  return sum;
+}
+
+static fvec aut_mask_gSpline_pd_2(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+				  bvec active_mask, int itype, fvec cosjik, 
+				  fvec Nij, fvec *dgdc, fvec *dgdN) {
+  int i;                        // Evaluates the angular spline g(cosjik) with
+  flt_t * gDom = NULL;          // dg/dcos in *dgdc and dg/dN in *dgdN. gDom
+  int nDom = 0;                 // holds nDom + 1 interval boundaries; offs and
+  ivec offs = ivec::setzero();  // index6 pick 6 coefficients out of gVal.
+  fvec NCmin = fvec::set1(ka->params.NCmin);  // NOTE(review): active_mask is never read
+  bvec Ngt = fvec::cmpnle(Nij, NCmin); // not-less-or-equal, i.e. Nij > NCmin
+  if (itype == 0) {
+    nDom = 4;
+    gDom = &ka->params.gCdom[0];
+    offs = ivec::mask_blend(Ngt, offs, ivec::set1(4*6));  // presumably 4*6 where Ngt is set
+  } else {
+    nDom = 3;
+    gDom = &ka->params.gHdom[0];
+    offs = ivec::set1(8 * 6);
+  }
+  cosjik = fvec::max(fvec::set1(gDom[0]), fvec::min(fvec::set1(gDom[nDom]),   // clamp to the
+						    cosjik));                // tabulated range
+  ivec index6 = ivec::setzero();
+  for (i = 0; i < nDom; i++) {  // on a shared boundary the later interval wins
+    bvec cosge = fvec::cmpnlt(cosjik, fvec::set1(gDom[i])); // not-less-than, i.e. >=
+    bvec cosle = fvec::cmple(cosjik, fvec::set1(gDom[i+1]));
+    index6 = ivec::mask_blend(cosge & cosle, index6, ivec::set1(6*i));
+  }
+  fvec g = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], offs +  index6, 
+				  cosjik, dgdc);
+  *dgdN = fvec::setzero();
+  if (itype == 0) {  // only this branch blends two coefficient sets over Nij
+    fvec NCmax = fvec::set1(ka->params.NCmax);
+    bvec Nlt = fvec::cmplt(Nij, NCmax); // less-than, i.e. Nij < NCmax
+    bvec Nmask = Ngt & Nlt;  // lanes with NCmin < Nij < NCmax
+    if (bvec::test_any_set(Nmask)) {
+      fvec dg1;
+      fvec g1 = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], index6, cosjik,   // same interval,
+				       &dg1);                                   // zero offset
+      fvec dS;
+      fvec cut = aut_Sp_deriv(Nij, NCmin, NCmax, &dS);
+      *dgdN = fvec::mask_mul(*dgdN, Nmask, dS, g1 -  g);  // uses g before it is blended below
+      g = fvec::mask_add(g, Nmask, g, cut * ( g1 -  g));
+      *dgdc = fvec::mask_add(*dgdc, Nmask, *dgdc, cut * ( dg1 -  *dgdc));
+    }
+  }
+  return g;
+}
+
+static fvec aut_PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, 
+                          int jtype, fvec NijC, fvec NijH, fvec *dN2) {
+  // Lane-by-lane fallback onto the scalar PijSpline via aligned spill arrays.
+  flt_t nc[fvec::VL] __attribute__((aligned(64)));
+  flt_t nh[fvec::VL] __attribute__((aligned(64)));
+  flt_t val[fvec::VL] __attribute__((aligned(64)));
+  flt_t d0[fvec::VL] __attribute__((aligned(64)));
+  flt_t d1[fvec::VL] __attribute__((aligned(64)));
+  flt_t lane_dN2[2];
+  fvec::store(nc, NijC);
+  fvec::store(nh, NijH);
+  for (int lane = 0; lane < fvec::VL; lane++) {
+    val[lane] = PijSpline(ka, itype, jtype, nc[lane], nh[lane], lane_dN2);
+    d0[lane] = lane_dN2[0];
+    d1[lane] = lane_dN2[1];
+  }
+  dN2[0] = fvec::load(d0);
+  dN2[1] = fvec::load(d1);
+  return fvec::load(val);
+}
+
+/*
+ * aut_frebo_data stores all the short-ranged coordinations
+ * and intermediate values that get reused frequently during
+ * bondorder calculations.
+ * The number of buffered slots (buf_len) should rarely exceed 4,
+ * so a BUF_CAP of 8 is a very conservative value.
+ */
+static const int BUF_CAP = 8;
+struct aut_frebo_data {
+  fvec rikx_buf[BUF_CAP];     // per slot: x component of r_i - r_k
+  fvec riky_buf[BUF_CAP];     // per slot: y component of r_i - r_k
+  fvec rikz_buf[BUF_CAP];     // per slot: z component of r_i - r_k
+  fvec rikmag_buf[BUF_CAP];   // per slot: length of r_i - r_k
+  fvec cosjik_buf[BUF_CAP];   // per slot: cosine between rij and rik, clamped to [-1, 1]
+  ivec k_buf[BUF_CAP];        // per slot: neighbor index k of each lane
+  fvec g_buf[BUF_CAP];        // per slot: angular spline value g
+  fvec dgdc_buf[BUF_CAP];     // per slot: dg/dcos
+  fvec ex_lam_buf[BUF_CAP];   // per slot: exp(lamdajik)
+  fvec wik_buf[BUF_CAP];      // per slot: switching value for the i-k distance
+  fvec dwik_buf[BUF_CAP];     // per slot: derivative of wik
+  fvec cutN_buf[BUF_CAP];     // per slot: switch on Nki between Nmin and Nmax
+  fvec dcutN_buf[BUF_CAP];    // per slot: derivative of cutN
+  bvec ktype_buf[BUF_CAP];    // per slot: type mask of k from aut_loadatoms_vec
+  bvec mask_buf[BUF_CAP];     // per slot: lanes that were active when it was stored
+  fvec force_k_x_buf[BUF_CAP];  // per slot: force on k, x component
+  fvec force_k_y_buf[BUF_CAP];  // per slot: force on k, y component
+  fvec force_k_z_buf[BUF_CAP];  // per slot: force on k, z component
+  int buf_len;                // slots in use; aut_frebo_pij_pd_2 sets -1 on overflow
+  fvec x_i;                   // position of atom i (read by aut_frebo_pij_pd_2)
+  fvec y_i;
+  fvec z_i;
+  fvec x_j;                   // presumably the position of atom j - set outside this view
+  fvec y_j;
+  fvec z_j;
+  fvec nCi;                   // read as the nC / nH coordination sums of atom i
+  fvec nHi;
+  fvec force_i_x;             // force on i accumulated by aut_frebo_pij_pd_2
+  fvec force_i_y;
+  fvec force_i_z;
+  fvec force_j_x;             // force on j written by aut_frebo_pij_pd_2
+  fvec force_j_y;
+  fvec force_j_z;
+};
+
+/*
+ * Fill aut_frebo_data for the pairs (vi, vj) and compute p_ij. If more than
+ * BUF_CAP slots are needed, sets data->buf_len = -1 and returns zero.
+ */
+static fvec aut_frebo_pij_pd_2(
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
+    struct aut_frebo_data * _noalias data,
+    int itype, int jtype,
+    ivec vi, ivec vj,
+    fvec rijx, fvec rijy, fvec rijz, fvec rijmag,
+    fvec wij, fvec VA, fvec * sum_N, fvec fij[3]
+) {
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  flt_t * _noalias nC = ka->nC;
+  flt_t * _noalias nH = ka->nH;
+  // Position of atom i per lane; the i-j geometry is passed in as rij*.
+  fvec x_i, y_i, z_i;
+  x_i = data->x_i;
+  y_i = data->y_i;
+  z_i = data->z_i;
+  fvec invrijm = fvec::recip(rijmag);
+  fvec Nmin = fvec::set1(ka->params.Nmin);
+  fvec Nmax = fvec::set1(ka->params.Nmax);
+  // Bit t of map_i_scalar is set when ka->map[t] is nonzero (t >= 1); it is
+  // handed to aut_loadatoms_vec to derive the type mask of each neighbor.
+  int map_i_scalar = 0;
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  // Coordination of i with the contribution of j (wij) removed: in total
+  // (Nij) and split by the mapped type of j (NijC / NijH).
+  fvec nCi = data->nCi;
+  fvec nHi = data->nHi;
+  fvec Nij = nHi +  nCi -  wij;
+  fvec factor_jtype, factor_not_jtype;
+  if (jtype) {
+    factor_jtype = fvec::set1(1);
+    factor_not_jtype = fvec::set1(0);
+  } else {
+    factor_jtype = fvec::set1(0);
+    factor_not_jtype = fvec::set1(1);
+  }
+  fvec NijC = nCi -  wij *  factor_not_jtype;
+  fvec NijH = nHi -  wij *  factor_jtype;
+  fvec sum_pij = fvec::setzero();
+  fvec sum_dpij_dN = fvec::setzero();
+  fvec dN2[2];
+  ivec offseti = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, 
+                                   ka->neigh_rebo.offset, sizeof(int));
+  int buf_len = 0;
+  ivec knum = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, 
+                                ka->neigh_rebo.num, sizeof(int));
+  ivec kk = ivec::setzero();
+  bvec active_mask = ivec::cmplt(kk, knum);
+  ivec c_i1 = ivec::set1(1);
+  fvec rho_j = fvec::set1(ka->params.rho[jtype][1]);
+  fvec rho_k0 = fvec::set1(ka->params.rho[0][1]);
+  fvec rho_k1 = fvec::set1(ka->params.rho[1][1]);
+  fvec c_4 = fvec::set1(4);
+  fvec c_2_0 = fvec::set1(2.0);
+  fvec c_m2_0 = fvec::set1(-2.0);
+  fvec c_4_0 = fvec::set1(4.0);
+  fvec c_0_5 = fvec::set1(0.5);
+  fvec c_m0_5 = fvec::set1(-0.5);
+  fvec c_1 = fvec::set1(1);
+  fvec c_m1 = fvec::set1(-1);
+  fvec factor_itype = itype ? c_1 : fvec::setzero();
+  fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+  fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+  fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+  fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+  fvec result_f_i_x = fvec::setzero();
+  fvec result_f_i_y = fvec::setzero();
+  fvec result_f_i_z = fvec::setzero();
+  fvec result_f_j_x = fvec::setzero();
+  fvec result_f_j_y = fvec::setzero();
+  fvec result_f_j_z = fvec::setzero();
+  *sum_N = fvec::setzero();
+  {
+    // Pass 1: walk the REBO neighbors k of i lane by lane, accumulate the
+    // sums entering p_ij and cache the per-k intermediates in data.
+    while (bvec::test_any_set(active_mask)) {
+      ivec k = ivec::mask_gather(ivec::setzero(), active_mask, kk +  offseti, 
+                                 ka->neigh_rebo.entries, sizeof(int));
+      bvec excluded_mask = ivec::cmpeq(k, vj) & active_mask;
+      if (bvec::test_any_set(excluded_mask)) {
+        // Lanes whose k is j itself advance alone; then the step is retried.
+        kk = ivec::mask_add(kk, excluded_mask, kk, c_i1);
+        active_mask = ivec::cmplt(kk, knum);
+        continue;
+      }
+      fvec x_k, y_k, z_k;
+      bvec ktype_mask;
+      aut_loadatoms_vec(x, k, &x_k, &y_k, &z_k, &ktype_mask, ka->map, map_i, 
+                        c_i1);
+      fvec rikx = x_i -  x_k;
+      fvec riky = y_i -  y_k;
+      fvec rikz = z_i -  z_k;
+      fvec rikmag = fvec::sqrt(rikx *  rikx +  riky *  riky +  rikz *  rikz);
+      fvec rho_k = fvec::mask_blend(ktype_mask, rho_k0, rho_k1);
+      fvec lamdajik = c_4 *  factor_itype * ( rho_k -  rikmag - ( rho_j -  
+                                                                  rijmag));
+      fvec ex_lam = fvec::exp(lamdajik);
+      fvec rcmax = fvec::mask_blend(ktype_mask, rcmax0, rcmax1);
+      fvec rcmin = fvec::mask_blend(ktype_mask, rcmin0, rcmin1);
+      fvec dwik;
+      fvec wik = aut_Sp_deriv(rikmag, rcmin, rcmax, &dwik);
+      fvec Nki = fvec::gather(k, nC, sizeof(flt_t)) +  
+        fvec::gather(k, nH, sizeof(flt_t)) -  wik;
+      fvec cosjik = (rijx *  rikx +  rijy *  riky +  rijz *  rikz) / 
+        ( rijmag *  rikmag);
+      cosjik = fvec::min(c_1, fvec::max(c_m1, cosjik));
+      fvec dgdc, dgdN;
+      fvec g = aut_mask_gSpline_pd_2(ka, active_mask, itype, cosjik, Nij, 
+                                     &dgdc, &dgdN);
+      sum_pij = fvec::mask_add(sum_pij, active_mask, sum_pij, wik * g * ex_lam);
+      sum_dpij_dN = fvec::mask_add(sum_dpij_dN, active_mask, sum_dpij_dN, 
+                                   wik * ex_lam * dgdN);
+      fvec dcutN;
+      fvec cutN = aut_Sp_deriv(Nki, Nmin, Nmax, &dcutN);
+      *sum_N = fvec::mask_add(*sum_N, active_mask, *sum_N, 
+                              fvec::mask_blend(ktype_mask, c_1, 
+                                               fvec::setzero()) * wik * cutN);
+      // All BUF_CAP slots are taken: give up and signal it via buf_len = -1.
+      if (buf_len == BUF_CAP) goto exceed_buffer;
+      data->rikx_buf[buf_len] = rikx;
+      data->riky_buf[buf_len] = riky;
+      data->rikz_buf[buf_len] = rikz;
+      data->rikmag_buf[buf_len] = rikmag;
+      data->cosjik_buf[buf_len] = cosjik;
+      data->ktype_buf[buf_len] = ktype_mask;
+      data->k_buf[buf_len] = k;
+      data->g_buf[buf_len] = g;
+      data->dgdc_buf[buf_len] = dgdc;
+      data->ex_lam_buf[buf_len] = ex_lam;
+      data->wik_buf[buf_len] = wik;
+      data->dwik_buf[buf_len] = dwik;
+      data->mask_buf[buf_len] = active_mask;
+      data->cutN_buf[buf_len] = cutN;
+      data->dcutN_buf[buf_len] = dcutN;
+      buf_len += 1;
+      kk = ivec::mask_add(kk, active_mask, kk, c_i1);
+      active_mask = ivec::cmplt(kk, knum);
+    }
+    data->buf_len = buf_len;
+    // p_ij = (1 + sum_pij + PijS)^(-1/2); tmp is its derivative w.r.t. the sum.
+    fvec PijS = aut_PijSpline(ka, itype, jtype, NijC, NijH, &dN2[0]);
+    fvec pij = fvec::invsqrt(c_1 + sum_pij + PijS);
+    fvec tmp = c_m0_5 * pij * pij * pij;
+    // Pass 2: replay the cached slots and turn them into force contributions.
+    int buf_idx;
+    for (buf_idx = 0; buf_idx < buf_len; buf_idx++) {
+      fvec rikx = data->rikx_buf[buf_idx];
+      fvec riky = data->riky_buf[buf_idx];
+      fvec rikz = data->rikz_buf[buf_idx];
+      fvec rikmag = data->rikmag_buf[buf_idx];
+      bvec ktype_mask = data->ktype_buf[buf_idx];
+      fvec g = data->g_buf[buf_idx];
+      fvec dgdc = data->dgdc_buf[buf_idx];
+      fvec ex_lam = data->ex_lam_buf[buf_idx];
+      fvec wik = data->wik_buf[buf_idx];
+      fvec dwik = data->dwik_buf[buf_idx];
+      bvec mask = data->mask_buf[buf_idx];
+      fvec invrikm = fvec::recip(rikmag);
+      fvec rjkx = rikx -  rijx;
+      fvec rjky = riky -  rijy;
+      fvec rjkz = rikz -  rijz;
+      fvec rjkmag = fvec::sqrt(
+           rjkx *  rjkx +  rjky *  rjky +  rjkz *  rjkz);
+      fvec rijrik = c_2_0 *  rijmag *  rikmag;
+      fvec rr = rijmag *  rijmag -  rikmag *  rikmag;
+      fvec dctdjk = c_m2_0 /  rijrik;
+      fvec dctdik = (rjkmag *  rjkmag -  rr) / ( rijrik *  rikmag *  rikmag);
+      fvec dctdij = (rjkmag *  rjkmag +  rr) / ( rijrik *  rijmag *  rijmag);
+      fvec fi[3], fj[3], fk[3];
+      fvec pref = c_0_5 *  VA *  tmp;
+      fvec tmp20 = pref *  wik *  dgdc *  ex_lam;
+      fj[0] = fj[1] = fj[2] = fvec::setzero();
+      fvec tmpdik = tmp20 *  dctdik;
+      fi[0] = fvec::setzero() -  tmpdik *  rikx;
+      fi[1] = fvec::setzero() -  tmpdik *  riky;
+      fi[2] = fvec::setzero() -  tmpdik *  rikz;
+      fk[0] = tmpdik *  rikx;
+      fk[1] = tmpdik *  riky;
+      fk[2] = tmpdik *  rikz;
+      // Cosine dependence on the i-j edge goes into the caller's fij.
+      fvec tmpdij = tmp20 *  dctdij;
+      fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmpdij *  rijx);
+      fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmpdij *  rijy);
+      fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmpdij *  rijz);
+      // Cosine dependence on the j-k edge.
+      fvec tmpdjk = tmp20 *  dctdjk;
+      fi[0] = fi[0] -  tmpdjk *  rjkx;
+      fi[1] = fi[1] -  tmpdjk *  rjky;
+      fi[2] = fi[2] -  tmpdjk *  rjkz;
+      fk[0] = fk[0] +  tmpdjk *  rjkx;
+      fk[1] = fk[1] +  tmpdjk *  rjky;
+      fk[2] = fk[2] +  tmpdjk *  rjkz;
+      fij[0] = fvec::mask_add(fij[0], mask, fij[0], tmpdjk *  rjkx);
+      fij[1] = fvec::mask_add(fij[1], mask, fij[1], tmpdjk *  rjky);
+      fij[2] = fvec::mask_add(fij[2], mask, fij[2], tmpdjk *  rjkz);
+      // exp(lamdajik) depends on the distances only when itype is nonzero.
+      if (itype) {
+        fvec tmp21 = pref *  wik *  g *  ex_lam *  c_4_0;
+        fvec tmp21ij = tmp21 *  invrijm;
+        fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmp21ij * rijx);
+        fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmp21ij * rijy);
+        fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmp21ij * rijz);
+        fvec tmp21ik = tmp21 * invrikm;
+        fi[0] = fi[0] +  tmp21ik *  rikx;
+        fi[1] = fi[1] +  tmp21ik *  riky;
+        fi[2] = fi[2] +  tmp21ik *  rikz;
+        fk[0] = fk[0] -  tmp21ik *  rikx;
+        fk[1] = fk[1] -  tmp21ik *  riky;
+        fk[2] = fk[2] -  tmp21ik *  rikz;
+      }
+
+      // coordination forces
+
+      // dwik forces
+      fvec tmp22 = pref *  dwik *  g *  ex_lam *  invrikm;
+      fi[0] = fi[0] -  tmp22 *  rikx;
+      fi[1] = fi[1] -  tmp22 *  riky;
+      fi[2] = fi[2] -  tmp22 *  rikz;
+      fk[0] = fk[0] +  tmp22 *  rikx;
+      fk[1] = fk[1] +  tmp22 *  riky;
+      fk[2] = fk[2] +  tmp22 *  rikz;
+
+      // PIJ forces
+      fvec dN2ktype = fvec::mask_blend(ktype_mask, dN2[0], dN2[1]);
+      fvec tmp23 = pref *  dN2ktype *  dwik *  invrikm;
+      fi[0] = fi[0] -  tmp23 *  rikx;
+      fi[1] = fi[1] -  tmp23 *  riky;
+      fi[2] = fi[2] -  tmp23 *  rikz;
+      fk[0] = fk[0] +  tmp23 *  rikx;
+      fk[1] = fk[1] +  tmp23 *  riky;
+      fk[2] = fk[2] +  tmp23 *  rikz;
+
+      // dgdN forces
+      fvec tmp24 = pref *  sum_dpij_dN *  dwik *  invrikm;
+      fi[0] = fi[0] -  tmp24 *  rikx;
+      fi[1] = fi[1] -  tmp24 *  riky;
+      fi[2] = fi[2] -  tmp24 *  rikz;
+      fk[0] = fk[0] +  tmp24 *  rikx;
+      fk[1] = fk[1] +  tmp24 *  riky;
+      fk[2] = fk[2] +  tmp24 *  rikz;
+      // fj is all zero in this routine, so the j accumulators stay zero.
+      result_f_i_x = fvec::mask_add(result_f_i_x, mask, result_f_i_x, fi[0]);
+      result_f_i_y = fvec::mask_add(result_f_i_y, mask, result_f_i_y, fi[1]);
+      result_f_i_z = fvec::mask_add(result_f_i_z, mask, result_f_i_z, fi[2]);
+      result_f_j_x = fvec::mask_add(result_f_j_x, mask, result_f_j_x, fj[0]);
+      result_f_j_y = fvec::mask_add(result_f_j_y, mask, result_f_j_y, fj[1]);
+      result_f_j_z = fvec::mask_add(result_f_j_z, mask, result_f_j_z, fj[2]);
+      // The force on k stays in the slot until aut_frebo_data_writeback runs.
+      data->force_k_x_buf[buf_idx] = fk[0];
+      data->force_k_y_buf[buf_idx] = fk[1];
+      data->force_k_z_buf[buf_idx] = fk[2];
+    }
+    data->force_i_x = result_f_i_x;
+    data->force_i_y = result_f_i_y;
+    data->force_i_z = result_f_i_z;
+    data->force_j_x = result_f_j_x;
+    data->force_j_y = result_f_j_y;
+    data->force_j_z = result_f_j_z;
+    return pij;
+  }
+  exceed_buffer:
+  data->buf_len = -1;
+  return fvec::setzero();
+}
+
+/*
+ * Apply the neighbor (k) force values cached in aut_frebo_data to
+ * ka->result_f, one buffered slot and one vector lane at a time.
+ */
+static void aut_frebo_data_writeback(
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka, 
+    struct aut_frebo_data * _noalias data) {
+  ResultForceT<acc_t> * _noalias forces = ka->result_f;
+  flt_t lane_fx[fvec::VL] __attribute__((aligned(64)));
+  flt_t lane_fy[fvec::VL] __attribute__((aligned(64)));
+  flt_t lane_fz[fvec::VL] __attribute__((aligned(64)));
+  int lane_k[ivec::VL] __attribute__((aligned(64)));
+  // Nothing is written when buf_len is zero or negative (-1 marks an
+  // overflowed buffer).
+  for (int slot = 0; slot < data->buf_len; slot++) {
+    // Spill the cached k forces and indices of this slot to scalar arrays.
+    fvec::store(lane_fx, data->force_k_x_buf[slot]);
+    fvec::store(lane_fy, data->force_k_y_buf[slot]);
+    fvec::store(lane_fz, data->force_k_z_buf[slot]);
+    ivec::store(lane_k, data->k_buf[slot]);
+    bvec live = data->mask_buf[slot];
+    // Scalar accumulation into result_f; lanes whose mask_buf bit is clear
+    // are skipped.
+    for (int lane = 0; lane < fvec::VL; lane++) {
+      if (! bvec::test_at(live, lane)) continue;
+      ResultForceT<acc_t> & target = forces[lane_k[lane]];
+      target.x += lane_fx[lane];
+      target.y += lane_fy[lane];
+      target.z += lane_fz[lane];
+    }
+  }
+}
+
+static void aut_frebo_N_spline_force(
+     KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka, 
+     struct aut_frebo_data * _noalias data, int itype, int jtype, ivec vi, 
+     ivec vj, fvec VA, fvec dN, fvec dNconj, fvec Nconj) {
+  ivec c_i1 = ivec::set1(1);
+  fvec c_2 = fvec::set1(2);
+  fvec c_TOL = fvec::set1(TOL);
+  ResultForceT<acc_t> * _noalias result_f = ka->result_f;
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  int * _noalias map = ka->map;
+  flt_t * _noalias nC = ka->nC;
+  flt_t * _noalias nH = ka->nH;
+  fvec x_i, y_i, z_i;
+  x_i = data->x_i;
+  y_i = data->y_i;
+  z_i = data->z_i;
+  fvec Nmin = fvec::set1(ka->params.Nmin);
+  fvec Nmax = fvec::set1(ka->params.Nmax);
+  int map_i_scalar = 0;
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  fvec dN2[2];
+  ivec kk = ivec::setzero();
+  fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+  fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+  fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+  fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+  fvec result_f_i_x = fvec::setzero();
+  fvec result_f_i_y = fvec::setzero();
+  fvec result_f_i_z = fvec::setzero();
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) {
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+    fvec rikx = data->rikx_buf[buf_idx];
+    fvec riky = data->riky_buf[buf_idx];
+    fvec rikz = data->rikz_buf[buf_idx];
+    fvec rikmag = data->rikmag_buf[buf_idx];
+    bvec ktype_mask = data->ktype_buf[buf_idx];
+
+    fvec dwik = data->dwik_buf[buf_idx];
+    fvec wik = data->wik_buf[buf_idx];
+
+    fvec dNki = data->dcutN_buf[buf_idx];
+    fvec SpN = data->cutN_buf[buf_idx];
+
+    fvec invrikmag = fvec::recip(rikmag);
+    fvec pref = VA *  dwik *  invrikmag;
+    fvec fdN = dN *  pref;
+    fvec fdNconj = pref *  SpN *  c_2 *  dNconj *  Nconj;
+    fvec ffactor = fdN;
+    bvec ktype_is_C = ~ ktype_mask;
+    ffactor = fvec::mask_add(ffactor, ktype_is_C, ffactor,  fdNconj);
+
+    fvec fkx = ffactor *  rikx;
+    fvec fky = ffactor *  riky;
+    fvec fkz = ffactor *  rikz;
+
+    data->force_k_x_buf[buf_idx] = data->force_k_x_buf[buf_idx] +  fkx;
+    data->force_k_y_buf[buf_idx] = data->force_k_y_buf[buf_idx] +  fky;
+    data->force_k_z_buf[buf_idx] = data->force_k_z_buf[buf_idx] +  fkz;
+
+    result_f_i_x = fvec::mask_sub(result_f_i_x, active_mask, result_f_i_x, fkx);
+    result_f_i_y = fvec::mask_sub(result_f_i_y, active_mask, result_f_i_y, fky);
+    result_f_i_z = fvec::mask_sub(result_f_i_z, active_mask, result_f_i_z, fkz);
+
+    bvec need_k_neighs = fvec::mask_cmpnle(active_mask, fvec::abs(dNki), c_TOL)
+      & ktype_is_C;
+    if (bvec::test_any_set(need_k_neighs)) {
+      int lane;
+      for (lane = 0; lane < fvec::VL; lane++) {
+        if (! bvec::test_at(need_k_neighs, lane)) continue;
+        int kk = ivec::at(k, lane);
+        int k = kk;
+        int ktype = map[x[k].w];
+        int i = ivec::at(vi, lane);
+        fvec oldVA = VA;
+        double VA = fvec::at(oldVA, lane);
+        fvec oldwik = wik;
+        double wik = fvec::at(oldwik, lane);
+        fvec olddNconj = dNconj;
+        double dNconj = fvec::at(olddNconj, lane);
+        fvec oldNconj = Nconj;
+        double Nconj = fvec::at(oldNconj, lane);
+        fvec olddNki = dNki;
+        double dNki = fvec::at(olddNki, lane);
+        int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k];
+        int nnum = ka->neigh_rebo.num[k];
+        int nn;
+        for (nn = 0; nn < nnum; nn++) {
+          int n = neighs_k[nn];
+          if (n == i) continue;
+          double rknx = x[k].x - x[n].x;
+          double rkny = x[k].y - x[n].y;
+          double rknz = x[k].z - x[n].z;
+          double rknmag = sqrt(rknx * rknx + rkny * rkny + rknz * rknz);
+          int ntype = map[x[n].w];
+          double rcminkn = ka->params.rcmin[ktype][ntype];
+          double rcmaxkn = ka->params.rcmax[ktype][ntype];
+          double dwkn;
+          Sp(rknmag, rcminkn, rcmaxkn, &dwkn);
+          double ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag;
+          result_f[k].x -= ffactor * rknx;
+          result_f[k].y -= ffactor * rkny;
+          result_f[k].z -= ffactor * rknz;
+          result_f[n].x += ffactor * rknx;
+          result_f[n].y += ffactor * rkny;
+          result_f[n].z += ffactor * rknz;
+        }
+      }
+    }
+  }
+  data->force_i_x = data->force_i_x +  result_f_i_x;
+  data->force_i_y = data->force_i_y +  result_f_i_y;
+  data->force_i_z = data->force_i_z +  result_f_i_z;
+}
+
+// Lane-wise bridge to the scalar frebo_pi_rc(): the scalar routine is called
+// once per vector lane, the per-lane results are staged in aligned arrays and
+// then loaded back as fvec values.  dN3[0..2] receive the three dN3 outputs.
+static fvec aut_frebo_pi_rc_pd(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+                               int jtype, fvec Nij, fvec Nji, fvec Nijconj,
+                               fvec * dN3) {
+  flt_t value_lanes[fvec::VL] __attribute__((aligned(64)));
+  flt_t deriv_lanes[3][fvec::VL] __attribute__((aligned(64)));
+  for (int lane = 0; lane < fvec::VL; ++lane) {
+    flt_t deriv[3];  // scalar dN3 outputs for this lane
+    value_lanes[lane] = frebo_pi_rc(ka, itype, jtype, fvec::at(Nij, lane),
+                                    fvec::at(Nji, lane),
+                                    fvec::at(Nijconj, lane), &deriv[0]);
+    for (int c = 0; c < 3; ++c) deriv_lanes[c][lane] = deriv[c];
+  }
+  // Repack the staged per-lane values into fvec values.
+  for (int c = 0; c < 3; ++c) dN3[c] = fvec::load(&deriv_lanes[c][0]);
+  return fvec::load(&value_lanes[0]);
+}
+
+// Lane-wise bridge to the scalar frebo_Tij(): the scalar routine is called
+// once per vector lane, the per-lane results are staged in aligned arrays and
+// then loaded back as fvec values.  dN3[0..2] receive the three dN3 outputs.
+static fvec aut_frebo_Tij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+                          int jtype, fvec Nij, fvec Nji, fvec Nijconj,
+                          fvec * dN3) {
+  flt_t value_lanes[fvec::VL] __attribute__((aligned(64)));
+  flt_t deriv_lanes[3][fvec::VL] __attribute__((aligned(64)));
+  for (int lane = 0; lane < fvec::VL; ++lane) {
+    flt_t deriv[3];  // scalar dN3 outputs for this lane
+    value_lanes[lane] = frebo_Tij(ka, itype, jtype, fvec::at(Nij, lane),
+                                  fvec::at(Nji, lane),
+                                  fvec::at(Nijconj, lane), &deriv[0]);
+    for (int c = 0; c < 3; ++c) deriv_lanes[c][lane] = deriv[c];
+  }
+  // Repack the staged per-lane values into fvec values.
+  for (int c = 0; c < 3; ++c) dN3[c] = fvec::load(&deriv_lanes[c][0]);
+  return fvec::load(&value_lanes[0]);
+}
+
+static fvec aut_frebo_sum_omega( // returns the masked sum of sum_omega_contrib over all (buf_idx_i, buf_idx_j) pairs
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka, // only params.thmin / params.thmax are read here
+    struct aut_frebo_data * _noalias i_data, // buffered entries around atom 2 (== i); its force fields are updated in place
+    struct aut_frebo_data * _noalias j_data, // buffered entries around atom 3 (== j); its force fields are updated in place
+    int itype, int jtype, // not read in this function
+    ivec vi, ivec vj, // not read in this function
+    fvec r23x, fvec r23y, fvec r23z, fvec r23mag, // vector between atoms 2 and 3 and its length (presumably r_i - r_j; confirm with caller)
+    fvec VA, fvec fij[3] // VA scales every force term; fij[0..2] receive the masked F23 + F24 - F31 contribution
+) {
+  fvec c_1 = fvec::set1(1);
+  fvec c_m1 = fvec::set1(-1);
+  fvec c_2 = fvec::set1(2);
+  fvec c_m2 = fvec::set1(-2);
+  fvec sum_omega = fvec::setzero(); // running result, accumulated under mask_inner_1
+  fvec thmin = fvec::set1(ka->params.thmin);
+  fvec thmax = fvec::set1(ka->params.thmax);
+  // 2 == i, 3 == j
+  fvec r32x = fvec::setzero() -  r23x; // r32 = -r23
+  fvec r32y = fvec::setzero() -  r23y;
+  fvec r32z = fvec::setzero() -  r23z;
+  int buf_idx_i, buf_idx_j;
+  for (buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) { // outer loop: buffered entries of i_data (atom 1)
+    // a1 == k == buf_idx_i
+    bvec mask_start = i_data->mask_buf[buf_idx_i];
+    fvec r21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21mag = i_data->rikmag_buf[buf_idx_i];
+    // TODO use buffered cosjik
+    fvec cos321 = (
+        r23x *  r21x +  r23y *  r21y +  r23z *  r21z) / ( r23mag *  r21mag);
+    cos321 = fvec::min(c_1, fvec::max(c_m1, cos321)); // clamp to [-1, 1] before taking the square root
+    fvec sin321 = fvec::sqrt(c_1 -  cos321 *  cos321);
+    bvec mask_outer = fvec::cmpneq(fvec::setzero(), sin321) & mask_start; // lanes active in mask_start with sin321 != 0
+    // add "continue"
+    fvec sink2i = fvec::mask_recip(fvec::undefined(), mask_outer, 
+				   sin321 * sin321);
+    fvec rik2i = fvec::mask_recip(fvec::undefined(), mask_outer, 
+				  r21mag * r21mag);
+    fvec rr = r23mag *  r23mag -  r21mag *  r21mag;
+    fvec r31x = r21x -  r23x; // r31 = r21 - r23
+    fvec r31y = r21y -  r23y;
+    fvec r31z = r21z -  r23z;
+    fvec r31mag2 = r31x *  r31x +  r31y *  r31y +  r31z *  r31z;
+    fvec rijrik = c_2 *  r23mag *  r21mag;
+    fvec r21mag2 = r21mag *  r21mag;
+    fvec dctik = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 -  rr, 
+				rijrik *  r21mag2);
+    fvec dctij = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 +  rr, 
+				rijrik *  r23mag *  r23mag);
+    fvec dctjk = fvec::mask_div(fvec::undefined(), mask_outer, c_m2, rijrik);
+    fvec dw21 = i_data->dwik_buf[buf_idx_i];
+    fvec w21 = i_data->wik_buf[buf_idx_i];
+    fvec dtsjik;
+    fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik);
+    dtsjik = fvec::setzero() -  dtsjik; // todo replace by appropriate xor.
+    ivec k = i_data->k_buf[buf_idx_i];
+    for (buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) { // inner loop: buffered entries of j_data (atom 4)
+      // check l == k in second loop.
+      // l == a4 == buf_idx_j
+      ivec l = j_data->k_buf[buf_idx_j];
+      bvec mask_inner_0 = ivec::mask_cmpneq(mask_outer, k, l) & 
+	j_data->mask_buf[buf_idx_j]; // presumably: lanes of mask_outer with k != l that are also active in j_data
+      // add "continue"
+      fvec r34x = j_data->rikx_buf[buf_idx_j];
+      fvec r34y = j_data->riky_buf[buf_idx_j];
+      fvec r34z = j_data->rikz_buf[buf_idx_j];
+      fvec r34mag = j_data->rikmag_buf[buf_idx_j];
+      fvec cos234 = fvec::mask_div(fvec::undefined(), mask_inner_0, 
+				   r32x * r34x + r32y * r34y + r32z * r34z, 
+				   r23mag * r34mag);
+      cos234 = fvec::min(c_1, fvec::max(c_m1, cos234)); // clamp to [-1, 1] before taking the square root
+      fvec sin234 = fvec::mask_sqrt(fvec::undefined(), mask_inner_0, 
+				    c_1 - cos234 * cos234);
+      bvec mask_inner_1 = fvec::mask_cmpneq(mask_inner_0, sin234, 
+					    fvec::setzero()); // additionally requires sin234 != 0; governs every accumulation below
+      // add "continue"
+      fvec sinl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, 
+				     sin234 * sin234);
+      fvec rjl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, 
+				    r34mag * r34mag);
+      fvec dw34 = j_data->dwik_buf[buf_idx_j];
+      fvec w34 = j_data->wik_buf[buf_idx_j];
+      fvec rr = r23mag *  r23mag - r34mag * r34mag; // shadows the outer-loop rr for the rest of this iteration
+      fvec r24x = r23x +  r34x; // r24 = r23 + r34
+      fvec r24y = r23y +  r34y;
+      fvec r24z = r23z +  r34z;
+      fvec r242 = r24x *  r24x +  r24y *  r24y +  r24z *  r24z;
+      fvec rijrjl = c_2 *  r23mag *  r34mag;
+      fvec rjl2 = r34mag *  r34mag;
+      fvec dctjl = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 -  rr, 
+				  rijrjl *  rjl2);
+      fvec dctji = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 +  rr, 
+				  rijrjl *  r23mag *  r23mag);
+      fvec dctil = fvec::mask_div(fvec::undefined(), mask_inner_1, c_m2, 
+				  rijrjl);
+      fvec dtsijl;
+      fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl);
+      dtsijl = fvec::setzero() -  dtsijl;
+      fvec prefactor = VA;
+
+      fvec cross321x = r32y *  r21z -  r32z *  r21y; // cross321 = r32 x r21
+      fvec cross321y = r32z *  r21x -  r32x *  r21z;
+      fvec cross321z = r32x *  r21y -  r32y *  r21x;
+      fvec cross234x = r23y *  r34z -  r23z *  r34y; // cross234 = r23 x r34
+      fvec cross234y = r23z *  r34x -  r23x *  r34z;
+      fvec cross234z = r23x *  r34y -  r23y *  r34x;
+
+      fvec cwnum = cross321x * cross234x + cross321y * cross234y + cross321z *
+	cross234z; // dot product of the two cross products
+      fvec cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234; // denominator for om1234
+      fvec om1234 = fvec::mask_div(fvec::undefined(), mask_inner_1, cwnum, 
+				   cwnom);
+      fvec cw = om1234;
+      fvec sum_omega_contrib = (c_1 -  om1234 *  om1234) *  w21 *  w34 *
+	(c_1 -  tspjik) * ( c_1 -  tspijl);
+      sum_omega = fvec::mask_add(sum_omega, mask_inner_1, sum_omega, 
+				 sum_omega_contrib);
+      fvec dt1dik = rik2i -  dctik *  sink2i *  cos321;
+      fvec dt1djk = fvec::setzero() -  dctjk *  sink2i *  cos321;
+      fvec dt1djl = rjl2i -  dctjl *  sinl2i *  cos234;
+      fvec dt1dil = fvec::setzero() -  dctil *  sinl2i *  cos234;
+      fvec dt1dij =   fvec::mask_div(fvec::undefined(), mask_inner_1, c_2, 
+				     r23mag * r23mag) - 
+	dctij * sink2i * cos321 -  dctji *  sinl2i *  cos234;
+
+      fvec dt2dikx = r23y *  cross234z -  r23z *  cross234y; // dt2dik = r23 x cross234
+      fvec dt2diky = r23z *  cross234x -  r23x *  cross234z;
+      fvec dt2dikz = r23x *  cross234y -  r23y *  cross234x;
+
+      fvec dt2djlx = r23z *  cross321y -  r23y *  cross321z; // dt2djl = cross321 x r23
+      fvec dt2djly = r23x *  cross321z -  r23z *  cross321x;
+      fvec dt2djlz = r23y *  cross321x -  r23x *  cross321y;
+
+      fvec dt2dijx = r21z *  cross234y +  r34y *  cross321z -
+	( r34z *  cross321y +  r21y *  cross234z);
+      fvec dt2dijy = r21x *  cross234z +  r34z *  cross321x -
+	( r34x *  cross321z +  r21z *  cross234x);
+      fvec dt2dijz = r21y *  cross234x +  r34x *  cross321y -
+	( r34y *  cross321x +  r21x *  cross234y);
+
+      fvec aa = prefactor *  c_2 *  fvec::mask_div(fvec::undefined(), 
+						   mask_inner_1, cw, cwnom) *
+	w21 *  w34 *  (c_1 -  tspjik) * ( c_1 -  tspijl);
+      fvec aaa1 = (fvec::setzero() - prefactor) * (c_1 - om1234 * om1234) *
+	(c_1 - tspjik) * (c_1 - tspijl); // aaa1 is not read anywhere below
+      fvec aaa2 = (fvec::setzero() -  prefactor) * (c_1 -  om1234 *  om1234) *
+	w21 * w34;
+      fvec at2 = aa * cwnum;
+
+      fvec fcijpc = aaa2 * dtsjik * dctij * (c_1 - tspijl) +  aaa2 * dtsijl * 
+	dctji * (c_1 - tspjik) - dt1dij * at2;
+      fvec fcikpc =  aaa2 * dtsjik * dctik * (c_1 - tspijl) - dt1dik * at2;
+      fvec fcjlpc =  aaa2 * dtsijl * dctjl * (c_1 - tspjik) - dt1djl * at2;
+      fvec fcjkpc =  aaa2 * dtsjik * dctjk * (c_1 - tspijl) - dt1djk * at2;
+      fvec fcilpc =  aaa2 * dtsijl * dctil * (c_1 - tspjik) - dt1dil * at2;
+
+      fvec F23x = fcijpc *  r23x +  aa *  dt2dijx;
+      fvec F23y = fcijpc *  r23y +  aa *  dt2dijy;
+      fvec F23z = fcijpc *  r23z +  aa *  dt2dijz;
+
+      fvec F12x = fcikpc *  r21x +  aa *  dt2dikx;
+      fvec F12y = fcikpc *  r21y +  aa *  dt2diky;
+      fvec F12z = fcikpc *  r21z +  aa *  dt2dikz;
+
+      fvec F34x = fcjlpc *  r34x +  aa *  dt2djlx;
+      fvec F34y = fcjlpc *  r34y +  aa *  dt2djly;
+      fvec F34z = fcjlpc *  r34z +  aa *  dt2djlz;
+
+      fvec F31x = fcjkpc *  r31x;
+      fvec F31y = fcjkpc *  r31y;
+      fvec F31z = fcjkpc *  r31z;
+
+      fvec F24x = fcilpc *  r24x;
+      fvec F24y = fcilpc *  r24y;
+      fvec F24z = fcilpc *  r24z;
+
+      fvec f1x = fvec::setzero() - ( F12x +  F31x); // f1 = -(F12 + F31), f2 = +(F12 + F31)
+      fvec f1y = fvec::setzero() - ( F12y +  F31y);
+      fvec f1z = fvec::setzero() - ( F12z +  F31z);
+      fvec f2x = F12x +  F31x;
+      fvec f2y = F12y +  F31y;
+      fvec f2z = F12z +  F31z;
+      fvec f3x = F34x +  F24x; // f3 = +(F34 + F24), f4 = -(F34 + F24)
+      fvec f3y = F34y +  F24y;
+      fvec f3z = F34z +  F24z;
+      fvec f4x = fvec::setzero() - ( F34x +  F24x);
+      fvec f4y = fvec::setzero() - ( F34y +  F24y);
+      fvec f4z = fvec::setzero() - ( F34z +  F24z);
+
+      fij[0] = fvec::mask_add(fij[0], mask_inner_1, fij[0],
+          F23x +  F24x -  F31x);
+      fij[1] = fvec::mask_add(fij[1], mask_inner_1, fij[1],
+          F23y +  F24y -  F31y);
+      fij[2] = fvec::mask_add(fij[2], mask_inner_1, fij[2],
+          F23z +  F24z -  F31z);
+
+      fvec tmp20 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * 
+	(c_1 - tspijl) * dw21 * w34 * fvec::mask_recip(fvec::undefined(), 
+						       mask_inner_1, r21mag);
+      f2x = f2x -  tmp20 *  r21x; // dw21 term: subtracted from f2, added to f1
+      f2y = f2y -  tmp20 *  r21y;
+      f2z = f2z -  tmp20 *  r21z;
+      f1x = f1x +  tmp20 *  r21x;
+      f1y = f1y +  tmp20 *  r21y;
+      f1z = f1z +  tmp20 *  r21z;
+
+      fvec tmp21 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * 
+	(c_1 - tspijl) * w21 * dw34 * fvec::mask_recip(fvec::undefined(), 
+						       mask_inner_1, r34mag);
+      f3x = f3x -  tmp21 *  r34x; // dw34 term: subtracted from f3, added to f4
+      f3y = f3y -  tmp21 *  r34y;
+      f3z = f3z -  tmp21 *  r34z;
+      f4x = f4x +  tmp21 *  r34x;
+      f4y = f4y +  tmp21 *  r34y;
+      f4z = f4z +  tmp21 *  r34z;
+
+      // 1 == buf_idx_i, 2 == i, 3 == j, 4 == buf_idx_j
+      i_data->force_k_x_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], 
+		       mask_inner_1, i_data->force_k_x_buf[buf_idx_i], f1x);
+      i_data->force_k_y_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_1, 
+		       i_data->force_k_y_buf[buf_idx_i], f1y);
+      i_data->force_k_z_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_1, 
+		       i_data->force_k_z_buf[buf_idx_i], f1z);
+      i_data->force_i_x = 
+	fvec::mask_add(i_data->force_i_x, mask_inner_1, i_data->force_i_x, f2x);
+      i_data->force_i_y = 
+	fvec::mask_add(i_data->force_i_y, mask_inner_1, i_data->force_i_y, f2y);
+      i_data->force_i_z = 
+	fvec::mask_add(i_data->force_i_z, mask_inner_1, i_data->force_i_z, f2z);
+      j_data->force_i_x = 
+	fvec::mask_add(j_data->force_i_x, mask_inner_1, j_data->force_i_x, f3x);
+      j_data->force_i_y = 
+	fvec::mask_add(j_data->force_i_y, mask_inner_1, j_data->force_i_y, f3y);
+      j_data->force_i_z = 
+	fvec::mask_add(j_data->force_i_z, mask_inner_1, j_data->force_i_z, f3z);
+      j_data->force_k_x_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_1, 
+		       j_data->force_k_x_buf[buf_idx_j], f4x);
+      j_data->force_k_y_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_1, 
+		       j_data->force_k_y_buf[buf_idx_j], f4y);
+      j_data->force_k_z_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_1, 
+		       j_data->force_k_z_buf[buf_idx_j], f4z);
+    }
+  }
+  return sum_omega;
+}
+
+// Evaluates Tij for a batch of i-j pairs and, if any lane has |Tij| not <= TOL,
+// the omega sum together with its force contributions.  Returns Tij times the
+// omega sum, where the sum is blended with zero according to the TOL mask.
+static fvec aut_frebo_pi_dh(
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
+    struct aut_frebo_data * _noalias i_data,
+    struct aut_frebo_data * _noalias j_data,
+    int itype, int jtype, ivec vi, ivec vj,
+    fvec r23x, fvec r23y, fvec r23z, fvec r23mag, fvec VA,
+    fvec Nij, fvec Nji, fvec Nijconj, fvec NconjtmpI, fvec NconjtmpJ,
+    fvec fij[3]) {
+  fvec dTij[3];
+  fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dTij[0]);
+  bvec above_tol = fvec::cmpnle(fvec::abs(Tij), fvec::set1(TOL));
+  // Nothing to accumulate when no lane passes the tolerance test.
+  if (! bvec::test_any_set(above_tol)) return Tij * fvec::setzero();
+  fvec omega_sum = aut_frebo_sum_omega(ka, i_data, j_data, itype, jtype, vi, vj,
+                                       r23x, r23y, r23z, r23mag, VA * Tij, fij);
+  omega_sum = fvec::mask_blend(above_tol, fvec::setzero(), omega_sum);
+  // Both spline-force calls use the same VA * omega_sum scale factor.
+  fvec scale = VA * omega_sum;
+  aut_frebo_N_spline_force(ka, i_data, itype, jtype, vi, vj, scale,
+                           dTij[0], dTij[2], NconjtmpI);
+  aut_frebo_N_spline_force(ka, j_data, jtype, itype, vj, vi, scale,
+                           dTij[1], dTij[2], NconjtmpJ);
+  return Tij * omega_sum;
+}
+
+/*
+ Torsion term for a batch of i-j pairs: the masked energy is added to ka->result_eng and the
+ forces to i_data / j_data. Reusing the aut_frebo_data buffers makes this calculation very cheap.
+*/
+static void aut_torsion_vec(
+    KernelArgsAIREBOT<flt_t,acc_t> * ka, // reads params.epsilonT / thmin / thmax; result_eng is incremented
+    struct aut_frebo_data * i_data, // buffers around atom i; force_i_*, force_j_* and force_k_*_buf are updated
+    struct aut_frebo_data * j_data, // buffers around atom j; only its force_k_*_buf entries are updated
+    ivec i, ivec j, fvec wij, fvec dwij // i and j are not read below; wij / dwij become w23 / dw23
+) {
+  AtomAIREBOT<flt_t> * x = ka->x; // not used below
+  int * map = ka->map; // not used below
+  flt_t (*epsilonT)[2] = ka->params.epsilonT;
+  fvec epsilonT00 = fvec::set1(epsilonT[0][0]);
+  fvec epsilonT01 = fvec::set1(epsilonT[0][1]);
+  fvec epsilonT10 = fvec::set1(epsilonT[1][0]);
+  fvec epsilonT11 = fvec::set1(epsilonT[1][1]);
+  fvec thmin = fvec::set1(ka->params.thmin);
+  fvec thmax = fvec::set1(ka->params.thmax);
+
+  const fvec c_1_0 = fvec::set1(1.0);
+  const fvec c_0_5 = fvec::set1(0.5);
+  const fvec c_0_1 = fvec::set1(0.1);
+  const fvec c_2_0 = fvec::set1(2.0);
+  const fvec c_2_5 = fvec::set1(2.5);
+  const fvec c_256_405 = fvec::set1(256.0/405.0);
+
+  fvec del32x = j_data->x_i -  i_data->x_i; // del32 = (j_data position) - (i_data position)
+  fvec del32y = j_data->y_i -  i_data->y_i;
+  fvec del32z = j_data->z_i -  i_data->z_i;
+  fvec rsq = del32x * del32x +  del32y * del32y +  del32z * del32z;
+  fvec r32 = fvec::sqrt(rsq);
+  fvec del23x = fvec::setzero() -  del32x; // del23 = -del32
+  fvec del23y = fvec::setzero() -  del32y;
+  fvec del23z = fvec::setzero() -  del32z;
+  fvec r23 = r32;
+  fvec w23 = wij;
+  fvec dw23 = dwij;
+
+  for (int buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) { // outer loop: buffered entries of i_data (atom k)
+    bvec mask_start = i_data->mask_buf[buf_idx_i];
+    fvec del21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec del21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec del21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21 = i_data->rikmag_buf[buf_idx_i];
+    fvec cos321 = i_data->cosjik_buf[buf_idx_i];
+    fvec sin321 = fvec::sqrt(c_1_0 -  cos321 *  cos321);
+    // strictly equivalent to sin321 < TOL
+    mask_start = fvec::mask_cmpneq(mask_start, fvec::setzero(), sin321); // presumably keeps only active lanes with sin321 != 0
+    if (! bvec::test_any_set(mask_start)) continue; // no lane left for this k entry
+
+    fvec deljkx = del21x -  del23x; // deljk = del21 - del23
+    fvec deljky = del21y -  del23y;
+    fvec deljkz = del21z -  del23z;
+    fvec rjk2 = deljkx * deljkx +  deljky * deljky + deljkz * deljkz;
+    fvec rjk = fvec::sqrt(rjk2);
+    fvec rik2 = r21 *  r21; // not used below (r21sq is recomputed in the inner loop)
+    fvec w21 = i_data->wik_buf[buf_idx_i];
+    fvec dw21 = i_data->dwik_buf[buf_idx_i];
+
+    fvec rij = r32;
+    fvec rik = r21;
+    fvec rij2 = r32 *  r32; // not used below
+    fvec dtsjik;
+    fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik);
+    dtsjik = fvec::setzero() -  dtsjik;
+
+    bvec ktype_mask = i_data->ktype_buf[buf_idx_i];
+    fvec epsilonT0 = fvec::mask_blend(ktype_mask, epsilonT00, epsilonT10); // per-lane choice between epsilonT[0][*] and epsilonT[1][*] by ktype_mask
+    fvec epsilonT1 = fvec::mask_blend(ktype_mask, epsilonT01, epsilonT11);
+
+    ivec k = i_data->k_buf[buf_idx_i];
+    for (int buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) { // inner loop: buffered entries of j_data (atom l)
+      ivec l = j_data->k_buf[buf_idx_j];
+      bvec mask_inner_0 = ivec::mask_cmpneq(mask_start, k, l) & 
+	j_data->mask_buf[buf_idx_j]; // presumably: lanes of mask_start with k != l that are also active in j_data
+      if (! bvec::test_any_set(mask_inner_0)) continue;
+      fvec del34x = j_data->rikx_buf[buf_idx_j];
+      fvec del34y = j_data->riky_buf[buf_idx_j];
+      fvec del34z = j_data->rikz_buf[buf_idx_j];
+      fvec r34 = j_data->rikmag_buf[buf_idx_j];
+      bvec ltype_mask = j_data->ktype_buf[buf_idx_j];
+      fvec cos234 = j_data->cosjik_buf[buf_idx_j];
+      fvec sin234 = fvec::sqrt(c_1_0 -  cos234 *  cos234);
+      // strictly equivalent to sin234 < TOL
+      mask_inner_0 = fvec::mask_cmpneq(mask_inner_0, sin234, fvec::setzero());
+      if (! bvec::test_any_set(mask_inner_0)) continue;
+      fvec dw34 = j_data->dwik_buf[buf_idx_j];
+      fvec w34 = j_data->wik_buf[buf_idx_j];
+      fvec delilx = del23x +  del34x; // delil = del23 + del34
+      fvec delily = del23y +  del34y;
+      fvec delilz = del23z +  del34z;
+      fvec ril2 = delilx * delilx +  delily * delily + delilz * delilz;
+      fvec ril = fvec::sqrt(ril2);
+      fvec rjl2 = r34 *  r34; // not used below (r34sq is recomputed further down)
+
+      fvec rjl = r34;
+      fvec dtsijl;
+      fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl);
+      dtsijl = fvec::setzero() -  dtsijl;
+      fvec cross321x = del32y * del21z - del32z * del21y; // cross321 = del32 x del21
+      fvec cross321y = del32z * del21x - del32x * del21z;
+      fvec cross321z = del32x * del21y - del32y * del21x;
+      fvec cross321mag = fvec::sqrt(cross321x * cross321x + 
+				    cross321y * cross321y + 
+				    cross321z * cross321z);
+      fvec cross234x = del23y * del34z - del23z * del34y; // cross234 = del23 x del34
+      fvec cross234y = del23z * del34x - del23x * del34z;
+      fvec cross234z = del23x * del34y - del23y * del34x;
+      fvec cross234mag = fvec::sqrt(cross234x * cross234x + 
+				    cross234y * cross234y + 
+				    cross234z * cross234z);
+      fvec cwnum = cross321x * cross234x + cross321y * cross234y + 
+	cross321z * cross234z;
+      fvec cwnom = r21 * r34 * r32 * r32 * sin321 * sin234;
+      fvec cw = cwnum /  cwnom;
+
+      fvec cw2 = c_0_5 * ( c_1_0 - cw);
+      fvec ekijl = fvec::mask_blend(ltype_mask, epsilonT0, epsilonT1); // second type selection, by ltype_mask
+      fvec Ec = c_256_405 * ekijl;
+      fvec cw2_5 = cw2 *  cw2 *  cw2 *  cw2 *  cw2; // cw2^5
+      fvec Vtors = Ec *  cw2_5 -  ekijl *  c_0_1;
+
+      fvec evdwl = Vtors * w21 * w23 * w34 * (c_1_0-tspjik) * (c_1_0-tspijl);
+      ka->result_eng += fvec::mask_reduce_add(mask_inner_0, evdwl); // energy is accumulated only for lanes in mask_inner_0
+
+      fvec dndijx  = cross234y * del21z - cross234z * del21y; // cross234 x del21; tmpvec is added just below
+      fvec dndijy  = cross234z * del21x - cross234x * del21z;
+      fvec dndijz  = cross234x * del21y - cross234y * del21x;
+
+      fvec tmpvecx = del34y * cross321z - del34z * cross321y; // tmpvec = del34 x cross321
+      fvec tmpvecy = del34z * cross321x - del34x * cross321z;
+      fvec tmpvecz = del34x * cross321y - del34y * cross321x;
+
+      dndijx = dndijx + tmpvecx;
+      dndijy = dndijy + tmpvecy;
+      dndijz = dndijz + tmpvecz;
+
+      fvec dndikx = del23y * cross234z - del23z * cross234y; // dndik = del23 x cross234
+      fvec dndiky = del23z * cross234x - del23x * cross234z;
+      fvec dndikz = del23x * cross234y - del23y * cross234x;
+
+      fvec dndjlx = cross321y * del23z - cross321z * del23y; // dndjl = cross321 x del23
+      fvec dndjly = cross321z * del23x - cross321x * del23z;
+      fvec dndjlz = cross321x * del23y - cross321y * del23x;
+
+      fvec r23sq = r23 *  r23;
+      fvec r21sq = r21 *  r21;
+      fvec r34sq = r34 *  r34;
+      fvec rjksq = rjk *  rjk;
+      fvec rilsq = ril *  ril;
+      fvec dcidij = (r23sq -  r21sq +  rjksq) / ( c_2_0 *  r23sq *  r21);
+      fvec dcidik = (r21sq -  r23sq +  rjksq) / ( c_2_0 *  r21sq *  r23);
+      fvec dcidjk = fvec::setzero() -  rjk / ( r23 *  r21);
+      fvec dcjdji = (r23sq -  r34sq +  rilsq) / ( c_2_0 *  r23sq *  r34);
+      fvec dcjdjl = (r34sq -  r23sq +  rilsq) / ( c_2_0 *  r34sq *  r23);
+      fvec dcjdil = fvec::setzero() -  ril / ( r23 *  r34);
+
+      fvec dsidij = fvec::setzero() -  cos321 / sin321 * dcidij;
+      fvec dsidik = fvec::setzero() -  cos321 / sin321 * dcidik;
+      fvec dsidjk = fvec::setzero() -  cos321 / sin321 * dcidjk;
+
+      fvec dsjdji = fvec::setzero() -  cos234 / sin234 * dcjdji;
+      fvec dsjdjl = fvec::setzero() -  cos234 / sin234 * dcjdjl;
+      fvec dsjdil = fvec::setzero() -  cos234 / sin234 * dcjdil;
+
+      fvec dxidij = r21 * sin321 + r23 * r21 * dsidij;
+      fvec dxidik = r23 * sin321 + r23 * r21 * dsidik;
+      fvec dxidjk = r23 * r21 * dsidjk;
+
+      fvec dxjdji = r34 * sin234 + r23 * r34 * dsjdji;
+      fvec dxjdjl = r23 * sin234 + r23 * r34 * dsjdjl;
+      fvec dxjdil = r23 * r34 * dsjdil;
+
+      fvec ddndij = dxidij * cross234mag + cross321mag * dxjdji;
+      fvec ddndik = dxidik * cross234mag;
+      fvec ddndjk = dxidjk * cross234mag;
+      fvec ddndjl = cross321mag * dxjdjl;
+      fvec ddndil = cross321mag * dxjdil;
+      fvec dcwddn = fvec::setzero() -  cwnum / ( cwnom * cwnom);
+      fvec dcwdn = fvec::recip(cwnom);
+      fvec cw2_4 = cw2 *  cw2 *  cw2 *  cw2; // cw2^4
+      fvec dvpdcw = c_2_5 * Ec * cw2_4 * w23 * w21 * w34 * (c_1_0 - tspjik) *
+	(c_1_0 - tspijl);
+
+      fvec Ftmpx = dvpdcw * (dcwdn * dndijx + dcwddn * ddndij * del23x / r23); // i-j term: +Ftmp on i, -Ftmp on j
+      fvec Ftmpy = dvpdcw * (dcwdn * dndijy + dcwddn * ddndij * del23y / r23);
+      fvec Ftmpz = dvpdcw * (dcwdn * dndijz + dcwddn * ddndij * del23z / r23);
+      fvec fix = Ftmpx;
+      fvec fiy = Ftmpy;
+      fvec fiz = Ftmpz;
+      fvec fjx = fvec::setzero() - Ftmpx;
+      fvec fjy = fvec::setzero() - Ftmpy;
+      fvec fjz = fvec::setzero() - Ftmpz;
+
+      Ftmpx = dvpdcw * (dcwdn * dndikx + dcwddn * ddndik * del21x / r21); // i-k term: +Ftmp on i, -Ftmp on k
+      Ftmpy = dvpdcw * (dcwdn * dndiky + dcwddn * ddndik * del21y / r21);
+      Ftmpz = dvpdcw * (dcwdn * dndikz + dcwddn * ddndik * del21z / r21);
+      fix = fix +  Ftmpx;
+      fiy = fiy +  Ftmpy;
+      fiz = fiz +  Ftmpz;
+      fvec fkx = fvec::setzero() -  Ftmpx;
+      fvec fky = fvec::setzero() -  Ftmpy;
+      fvec fkz = fvec::setzero() -  Ftmpz;
+
+      Ftmpx = dvpdcw * dcwddn * ddndjk * deljkx / rjk; // j-k term: +Ftmp on j, -Ftmp on k
+      Ftmpy = dvpdcw * dcwddn * ddndjk * deljky / rjk;
+      Ftmpz = dvpdcw * dcwddn * ddndjk * deljkz / rjk;
+      fjx = fjx +  Ftmpx;
+      fjy = fjy +  Ftmpy;
+      fjz = fjz +  Ftmpz;
+      fkx = fkx -  Ftmpx;
+      fky = fky -  Ftmpy;
+      fkz = fkz -  Ftmpz;
+
+      Ftmpx = dvpdcw * (dcwdn * dndjlx + dcwddn * ddndjl * del34x / r34); // j-l term: +Ftmp on j, -Ftmp on l
+      Ftmpy = dvpdcw * (dcwdn * dndjly + dcwddn * ddndjl * del34y / r34);
+      Ftmpz = dvpdcw * (dcwdn * dndjlz + dcwddn * ddndjl * del34z / r34);
+      fjx = fjx +  Ftmpx;
+      fjy = fjy +  Ftmpy;
+      fjz = fjz +  Ftmpz;
+      fvec flx = fvec::setzero() -  Ftmpx;
+      fvec fly = fvec::setzero() -  Ftmpy;
+      fvec flz = fvec::setzero() -  Ftmpz;
+
+      Ftmpx = dvpdcw * dcwddn * ddndil * delilx / ril; // i-l term: +Ftmp on i, -Ftmp on l
+      Ftmpy = dvpdcw * dcwddn * ddndil * delily / ril;
+      Ftmpz = dvpdcw * dcwddn * ddndil * delilz / ril;
+      fix = fix +  Ftmpx;
+      fiy = fiy +  Ftmpy;
+      fiz = fiz +  Ftmpz;
+      flx = flx -  Ftmpx;
+      fly = fly -  Ftmpy;
+      flz = flz -  Ftmpz;
+
+      // coordination forces
+
+      fvec fpair = Vtors * dw21 * w23 * w34 * (c_1_0 - tspjik) * 
+	(c_1_0 - tspijl) /  r21; // term using dw21 in place of w21
+      fix = fix -  del21x * fpair;
+      fiy = fiy -  del21y * fpair;
+      fiz = fiz -  del21z * fpair;
+      fkx = fkx +  del21x * fpair;
+      fky = fky +  del21y * fpair;
+      fkz = fkz +  del21z * fpair;
+
+      fpair = Vtors * w21 * dw23 * w34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) /
+	r23; // term using dw23 in place of w23
+      fix = fix -  del23x * fpair;
+      fiy = fiy -  del23y * fpair;
+      fiz = fiz -  del23z * fpair;
+      fjx = fjx +  del23x * fpair;
+      fjy = fjy +  del23y * fpair;
+      fjz = fjz +  del23z * fpair;
+
+      fpair = Vtors * w21 * w23 * dw34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) /
+	r34; // term using dw34 in place of w34
+      fjx = fjx -  del34x * fpair;
+      fjy = fjy -  del34y * fpair;
+      fjz = fjz -  del34z * fpair;
+      flx = flx +  del34x * fpair;
+      fly = fly +  del34y * fpair;
+      flz = flz +  del34z * fpair;
+
+      // additional cut off function forces
+
+      fvec fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * dtsjik * (c_1_0 -
+									tspijl);
+      fpair = fcpc * dcidij / rij;
+      fix = fix +  fpair * del23x;
+      fiy = fiy +  fpair * del23y;
+      fiz = fiz +  fpair * del23z;
+      fjx = fjx -  fpair * del23x;
+      fjy = fjy -  fpair * del23y;
+      fjz = fjz -  fpair * del23z;
+
+      fpair = fcpc * dcidik / rik;
+      fix = fix +  fpair * del21x;
+      fiy = fiy +  fpair * del21y;
+      fiz = fiz +  fpair * del21z;
+      fkx = fkx -  fpair * del21x;
+      fky = fky -  fpair * del21y;
+      fkz = fkz -  fpair * del21z;
+
+      fpair = fcpc * dcidjk / rjk;
+      fjx = fjx +  fpair * deljkx;
+      fjy = fjy +  fpair * deljky;
+      fjz = fjz +  fpair * deljkz;
+      fkx = fkx -  fpair * deljkx;
+      fky = fky -  fpair * deljky;
+      fkz = fkz -  fpair * deljkz;
+
+      fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * (c_1_0 - tspjik) * 
+	dtsijl; // same construction as above, now with dtsijl
+      fpair = fcpc * dcjdji / rij;
+      fix = fix +  fpair * del23x;
+      fiy = fiy +  fpair * del23y;
+      fiz = fiz +  fpair * del23z;
+      fjx = fjx -  fpair * del23x;
+      fjy = fjy -  fpair * del23y;
+      fjz = fjz -  fpair * del23z;
+
+      fpair = fcpc * dcjdjl / rjl;
+      fjx = fjx +  fpair * del34x;
+      fjy = fjy +  fpair * del34y;
+      fjz = fjz +  fpair * del34z;
+      flx = flx -  fpair * del34x;
+      fly = fly -  fpair * del34y;
+      flz = flz -  fpair * del34z;
+
+      fpair = fcpc * dcjdil / ril;
+      fix = fix +  fpair * delilx;
+      fiy = fiy +  fpair * delily;
+      fiz = fiz +  fpair * delilz;
+      flx = flx -  fpair * delilx;
+      fly = fly -  fpair * delily;
+      flz = flz -  fpair * delilz;
+
+      // sum per-atom forces into atom force array
+
+      i_data->force_i_x = fvec::mask_add(i_data->force_i_x, mask_inner_0, 
+					 i_data->force_i_x, fix);
+      i_data->force_i_y = fvec::mask_add(i_data->force_i_y, mask_inner_0, 
+					 i_data->force_i_y, fiy);
+      i_data->force_i_z = fvec::mask_add(i_data->force_i_z, mask_inner_0, 
+					 i_data->force_i_z, fiz);
+      i_data->force_j_x = fvec::mask_add(i_data->force_j_x, mask_inner_0, 
+					 i_data->force_j_x, fjx);
+      i_data->force_j_y = fvec::mask_add(i_data->force_j_y, mask_inner_0, 
+					 i_data->force_j_y, fjy);
+      i_data->force_j_z = fvec::mask_add(i_data->force_j_z, mask_inner_0, 
+					 i_data->force_j_z, fjz);
+      i_data->force_k_x_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], mask_inner_0, 
+		       i_data->force_k_x_buf[buf_idx_i], fkx);
+      i_data->force_k_y_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_0, 
+		       i_data->force_k_y_buf[buf_idx_i], fky);
+      i_data->force_k_z_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_0, 
+		       i_data->force_k_z_buf[buf_idx_i], fkz);
+      j_data->force_k_x_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_0, 
+		       j_data->force_k_x_buf[buf_idx_j], flx);
+      j_data->force_k_y_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_0, 
+		       j_data->force_k_y_buf[buf_idx_j], fly);
+      j_data->force_k_z_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_0, 
+		       j_data->force_k_z_buf[buf_idx_j], flz);
+    }
+  }
+}
+
+/*
+ * Processes VL elements of the same type itype/jtype for REBO and TORSION
+ * interactions. This allows us to reuse the aut_frebo_data buffers in the
+ * torsion calculation.
+ */
+static void aut_frebo_batch_of_kind(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+				    int torflag, int itype, int jtype, 
+				    int * i_buf, int * j_buf) {
+ { // jump-scope for exceed_limits
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * tag = ka->tag;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  flt_t rcminij = ka->params.rcmin[itype][jtype];
+  flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+  flt_t Qij = ka->params.Q[itype][jtype];
+  flt_t Aij = ka->params.A[itype][jtype];
+  flt_t alphaij = ka->params.alpha[itype][jtype];
+  fvec vrcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec vrcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
+  fvec vQij = fvec::set1(ka->params.Q[itype][jtype]);
+  fvec vAij = fvec::set1(ka->params.A[itype][jtype]);
+  fvec malphaij = fvec::set1(-ka->params.alpha[itype][jtype]);
+  fvec c_1_0 = fvec::set1(1);
+  fvec c_0_5 = fvec::set1(0.5);
+  fvec c_TOL = fvec::set1(1e-9);
+  struct aut_frebo_data i_data, j_data;
+
+  fvec evdwl_vacc = fvec::setzero();
+  ivec vi = ivec::maskz_loadu(bvec::full(), i_buf);
+  int tmp;
+  ivec vj = ivec::maskz_loadu(bvec::full(), j_buf);
+  fvec x_i, y_i, z_i;
+  fvec x_j, y_j, z_j;
+  aut_loadatoms_vec_notype(x, vi, &x_i, &y_i, &z_i);
+  aut_loadatoms_vec_notype(x, vj, &x_j, &y_j, &z_j);
+  i_data.x_i = x_i;
+  i_data.y_i = y_i;
+  i_data.z_i = z_i;
+  i_data.x_j = x_j;
+  i_data.y_j = y_j;
+  i_data.z_j = z_j;
+  j_data.x_i = x_j;
+  j_data.y_i = y_j;
+  j_data.z_i = z_j;
+  j_data.x_j = x_i;
+  j_data.y_j = y_i;
+  j_data.z_j = z_i;
+  fvec delx = x_i -  x_j;
+  fvec dely = y_i -  y_j;
+  fvec delz = z_i -  z_j;
+  fvec rsq = delx *  delx +  dely *  dely +  delz *  delz;
+  fvec rij = fvec::sqrt(rsq);
+  fvec dwij;
+  fvec wij = aut_Sp_deriv(rij, vrcminij, vrcmaxij, &dwij);
+
+  fvec exp_alphar = fvec::exp(malphaij *  rij);
+  fvec Qij_over_rij = vQij /  rij;
+  fvec Qij_over_rsq = vQij /  rsq;
+  fvec VR_by_wij = ( c_1_0 +  Qij_over_rij) *  vAij *  exp_alphar;
+  fvec VR = wij * VR_by_wij;
+  fvec pre = wij *  vAij *  exp_alphar;
+  fvec dVRdi = pre * ( malphaij +  malphaij *  Qij_over_rij -  Qij_over_rsq);
+  dVRdi = dVRdi + VR_by_wij *  dwij;
+
+  fvec VA_by_wij = fvec::setzero();
+  fvec dVA = fvec::setzero();
+
+  int k;
+  for (k = 0; k < 3; k++) {
+    fvec mBIJc = fvec::set1(-ka->params.BIJc[itype][jtype][k]);
+    fvec mBetaij = fvec::set1(-ka->params.Beta[itype][jtype][k]);
+    fvec term = mBIJc *  fvec::exp(mBetaij *  rij);
+    VA_by_wij = VA_by_wij +  term;
+    dVA = dVA +  mBetaij * wij * term;
+  }
+
+  dVA = dVA +  dwij *  VA_by_wij;
+  fvec VA = wij * VA_by_wij;
+
+  bvec tol_check = fvec::cmplt(wij, c_TOL);
+  VA = fvec::mask_blend(tol_check, VA, fvec::setzero());
+  dVA = fvec::mask_blend(tol_check, dVA, fvec::setzero());
+  VR = fvec::mask_blend(tol_check, VR, fvec::setzero());
+  dVRdi = fvec::mask_blend(tol_check, dVRdi, fvec::setzero());
+
+  fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
+  fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
+  fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
+  fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
+  fvec Nij = (nHi +  nCi) -  wij;
+  fvec Nji = (nHj +  nCj) -  wij;
+  i_data.nHi = nHi;
+  i_data.nCi = nCi;
+  j_data.nHi = nHj;
+  j_data.nCi = nCj;
+  fvec fij[3], fji[3];
+  fij[0] = fvec::setzero(); fij[1] = fvec::setzero();
+  fij[2] = fvec::setzero();
+  fji[0] = fvec::setzero(); fji[1] = fvec::setzero();
+  fji[2] = fvec::setzero();
+
+  fvec NconjtmpI;
+  fvec pij = aut_frebo_pij_pd_2(
+      ka, &i_data, itype, jtype, vi, vj,
+      delx, dely, delz, rij, wij, VA, &NconjtmpI, fij);
+
+  if (i_data.buf_len < 0) goto exceed_limits;
+
+  fvec NconjtmpJ;
+  fvec rjix = fvec::setzero() -  delx;
+  fvec rjiy = fvec::setzero() -  dely;
+  fvec rjiz = fvec::setzero() -  delz;
+  fvec pji = aut_frebo_pij_pd_2(
+      ka, &j_data, jtype, itype, vj, vi,
+      rjix, rjiy, rjiz, rij, wij, VA, &NconjtmpJ, fji);
+  fij[0] = fij[0] -  fji[0];
+  fij[1] = fij[1] -  fji[1];
+  fij[2] = fij[2] -  fji[2];
+
+  if (j_data.buf_len < 0) goto exceed_limits;
+
+  if (torflag && itype == 0 && jtype == 0)
+    aut_torsion_vec(ka, &i_data, &j_data, vi, vj, wij, dwij);
+
+  fvec Nijconj = c_1_0 +  NconjtmpI *  NconjtmpI +  NconjtmpJ *  NconjtmpJ;
+  fvec dN3[3];
+  fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+  aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, VA, dN3[0], 
+			   dN3[2], NconjtmpI);
+  aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, VA, dN3[1], 
+			   dN3[2], NconjtmpJ);
+  fvec pi_dh = aut_frebo_pi_dh(ka, &i_data, &j_data, itype, jtype, vi, vj, 
+			       delx, dely, delz, rij, VA, Nij, Nji, Nijconj,
+			       NconjtmpI, NconjtmpJ, fij);
+
+  fvec bij = c_0_5 * ( pij +  pji) +  pi_rc +  pi_dh;
+  fvec dVAdi = bij *  dVA;
+  fvec fpair = (dVAdi +  dVRdi) *  fvec::recip(rij);
+  fvec result_f_j_x = fpair *  delx -  fij[0];
+  fvec result_f_j_y = fpair *  dely -  fij[1];
+  fvec result_f_j_z = fpair *  delz -  fij[2];
+  fvec result_f_i_x = fvec::setzero() -  result_f_j_x;
+  fvec result_f_i_y = fvec::setzero() -  result_f_j_y;
+  fvec result_f_i_z = fvec::setzero() -  result_f_j_z;
+  fvec evdwl = VR +  bij *  VA;
+  evdwl_vacc = evdwl_vacc +  evdwl;
+
+  aut_frebo_data_writeback(ka, &i_data);
+  aut_frebo_data_writeback(ka, &j_data);
+
+  flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
+
+  result_f_i_x = i_data.force_i_x +  result_f_i_x;
+  result_f_i_y = i_data.force_i_y +  result_f_i_y;
+  result_f_i_z = i_data.force_i_z +  result_f_i_z;
+  result_f_j_x = i_data.force_j_x +  result_f_j_x;
+  result_f_j_y = i_data.force_j_y +  result_f_j_y;
+  result_f_j_z = i_data.force_j_z +  result_f_j_z;
+
+  result_f_i_x = j_data.force_j_x +  result_f_i_x;
+  result_f_i_y = j_data.force_j_y +  result_f_i_y;
+  result_f_i_z = j_data.force_j_z +  result_f_i_z;
+  result_f_j_x = j_data.force_i_x +  result_f_j_x;
+  result_f_j_y = j_data.force_i_y +  result_f_j_y;
+  result_f_j_z = j_data.force_i_z +  result_f_j_z;
+
+  fvec::store(fi_x_buf, result_f_i_x);
+  fvec::store(fi_y_buf, result_f_i_y);
+  fvec::store(fi_z_buf, result_f_i_z);
+  ivec::store(fi_i_buf, vi);
+  fvec::store(fj_x_buf, result_f_j_x);
+  fvec::store(fj_y_buf, result_f_j_y);
+  fvec::store(fj_z_buf, result_f_j_z);
+  ivec::store(fj_j_buf, vj);
+  fvec::store(evdwl_buf, evdwl);
+
+  int lane;
+  for (lane = 0; lane < fvec::VL; lane++) {
+    int ii = fi_i_buf[lane];
+    result_f[ii].x += fi_x_buf[lane];
+    result_f[ii].y += fi_y_buf[lane];
+    result_f[ii].z += fi_z_buf[lane];
+    result_f[ii].w += 0.5 * evdwl_buf[lane];
+    int jj = fj_j_buf[lane];
+    result_f[jj].x += fj_x_buf[lane];
+    result_f[jj].y += fj_y_buf[lane];
+    result_f[jj].z += fj_z_buf[lane];
+    result_f[jj].w += 0.5 * evdwl_buf[lane];
+  }
+  ka->result_eng += fvec::reduce_add(evdwl_vacc);
+  return;
+ }
+exceed_limits:
+  for (int l = 0; l < fvec::VL; l++) {
+    int i = i_buf[l];
+    int j = j_buf[l];
+    ref_frebo_single_interaction(ka, i, j);
+    if (torflag && itype == 0 && jtype == 0) 
+      ref_torsion_single_interaction(ka, i, j);
+  }
+}
+
+/*
+ Sorts the accepted (i, j) pairs into buckets keyed by itype and jtype and
+ runs aut_frebo_batch_of_kind on a bucket once it holds fvec::VL pairs.
+*/
+static void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torflag) {
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  int * _noalias tag = ka->tag;
+  int * _noalias map = ka->map;
+  // One pending-pair bucket per (itype, jtype) combination.
+  int pair_i[2][2][fvec::VL];
+  int pair_j[2][2][fvec::VL];
+  int fill[2][2] = {0};
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
+    const int itag = tag[i];
+    const int itype = map[x[i].w];
+    const flt_t x_i = x[i].x, y_i = x[i].y, z_i = x[i].z;
+    int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+    const int jnum = ka->neigh_rebo.num[i];
+    for (int jj = 0; jj < jnum; jj++) {
+      const int j = neighs[jj];
+      const int jtag = tag[j];
+      if (itag != jtag) {
+        // Differing tags: an odd tag sum keeps the pair only when itag > jtag,
+        // an even tag sum only when itag < jtag.
+        const bool odd_sum = ((itag + jtag) & 1) == 1;
+        if ((itag > jtag) != odd_sum) continue;
+      } else {
+        // Equal tags: keep the pair unless j sorts before i by (z, y, x).
+        if (x[j].z < z_i) continue;
+        if (x[j].z == z_i) {
+          if (x[j].y < y_i) continue;
+          if (x[j].y == y_i && x[j].x < x_i) continue;
+        }
+      }
+      const int jtype = map[x[j].w];
+      int & count = fill[itype][jtype];
+      pair_i[itype][jtype][count] = i;
+      pair_j[itype][jtype][count] = j;
+      count++;
+      if (count != fvec::VL) continue;
+      // Bucket is full: process it with the vector kernel and start over.
+      aut_frebo_batch_of_kind(ka, torflag, itype, jtype,
+          pair_i[itype][jtype], pair_j[itype][jtype]);
+      count = 0;
+    }
+  }
+  // Pairs left in partially filled buckets take the scalar reference path.
+  for (int itype = 0; itype < 2; itype++) {
+    for (int jtype = 0; jtype < 2; jtype++) {
+      const bool with_torsion = torflag && itype == 0 && jtype == 0;
+      const int leftover = fill[itype][jtype];
+      for (int l = 0; l < leftover; l++) {
+        int i = pair_i[itype][jtype][l];
+        int j = pair_j[itype][jtype][l];
+        ref_frebo_single_interaction(ka, i, j);
+        if (with_torsion) ref_torsion_single_interaction(ka, i, j);
+      }
+    }
+  }
+}
+
+/*
+ * Apply paths in scalar fashion, not crucial for performance.
+ */
+static void aut_airebo_lj_force_path(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+   bvec mask, fvec dC, LennardJonesPathAIREBOT<flt_t> path[fvec::VL]) {
+  for (int lane = 0; lane < fvec::VL; lane++) {
+    if (!bvec::test_at(mask, lane)) continue;  // lane not selected by mask
+    LennardJonesPathAIREBOT<flt_t> * lane_path = path + lane;
+    ref_lennard_jones_force_path(ka, fvec::at(dC, lane), lane_path);
+  }
+}
+
+/*
+ * Hash-Map for efficient calculation of C_ij.
+ * Has OPT_TEST_PATH_SIZE slots; at most OPT_TEST_PATH_ITEMS of them can have
+ * an associated path. Open addressing, invalidation by using a different i.
+ * Only needs to be reset once per timestep.
+ */
+static const int OPT_TEST_PATH_SIZE = 1024; // slot count; the vector hash masks with SIZE - 1
+static const int OPT_TEST_PATH_ITEMS = 128; // capacity of testpath[]
+struct aut_airebo_lj_test_path_result_data {
+  LennardJonesPathAIREBOT<flt_t> testpath[OPT_TEST_PATH_ITEMS]; // stored paths
+  int i[OPT_TEST_PATH_SIZE]; // atom i the slot was last written for
+  int j[OPT_TEST_PATH_SIZE]; // target atom stored in the slot
+  flt_t cij[OPT_TEST_PATH_SIZE]; // 1 - product of the weights along the path
+  int testpath_idx[OPT_TEST_PATH_SIZE]; // index into testpath[], set only if a path was stored
+};
+static const unsigned int OPT_TEST_PATH_HASH = 2654435761; // hash multiplier
+
+static int aut_lj_tap_hash_fn(int j, int attempt) {
+  // Multiplicative hash in 32-bit unsigned arithmetic; attempt is the probe.
+  const uint32_t scrambled = (uint32_t) j * (uint32_t) OPT_TEST_PATH_HASH;
+  const uint32_t probed = scrambled + (uint32_t) attempt;
+  const uint32_t slot = probed % (uint32_t) OPT_TEST_PATH_SIZE;
+  return slot;
+}
+
+static ivec aut_airebo_lj_tap_hash_fn_vec(ivec val, ivec attempt) {
+  // Lane-wise multiply, add the probe count, AND with OPT_TEST_PATH_SIZE - 1.
+  const ivec multiplier = ivec::set1(OPT_TEST_PATH_HASH);
+  const ivec slot_mask = ivec::set1(OPT_TEST_PATH_SIZE - 1);
+  ivec scrambled = ivec::mullo(multiplier, val);
+  ivec probed = scrambled +  attempt;
+  return ivec::the_and(probed, slot_mask);
+}
+
+/*
+ * Enters all (potential) neighbors of i, including 2nd and 3rd degree, into
+ * the hash-map, keeping the smallest cij = 1 - w product per target. Scalar
+ * code: hard to vectorize, not time-critical. Returns false on overflow.
+ */
+static bool aut_airebo_lj_test_all_paths(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    int i, struct aut_airebo_lj_test_path_result_data * result) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  flt_t (*rcmin)[2] = &ka->params.rcmin[0];
+  flt_t (*rcmax)[2] = &ka->params.rcmax[0];
+  flt_t rcminsq[2][2]; // squared rcmin, so the test below needs no sqrt
+  rcminsq[0][0] = rcmin[0][0] * rcmin[0][0];
+  rcminsq[0][1] = rcmin[0][1] * rcmin[0][1];
+  rcminsq[1][0] = rcmin[1][0] * rcmin[1][0];
+  rcminsq[1][1] = rcmin[1][1] * rcmin[1][1];
+  int * neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
+  int itype = map[x[i].w];
+  int path_insert_pos = 0; // next free entry of result->testpath
+  for (int jj = 0; jj < ka->neigh_rebo.num[i]; jj++) { // paths i-j
+    int j = neighs_i[jj];
+    int jtype = map[x[j].w];
+    flt_t dijx = x[j].x - x[i].x;
+    flt_t dijy = x[j].y - x[i].y;
+    flt_t dijz = x[j].z - x[i].z;
+    flt_t rijsq = dijx * dijx + dijy * dijy + dijz * dijz;
+    flt_t wj = 1, dwj = 0; // values used when rij is below rcmin
+    flt_t rij = 0; // 0 marks "not computed yet"
+    if (rijsq >= rcminsq[itype][jtype]) {
+      rij = overloaded::sqrt(rijsq);
+      wj = Sp(rij, rcmin[itype][jtype], rcmax[itype][jtype], &dwj);
+    }
+    int attempt = 0;
+    int start_hash_slot = aut_lj_tap_hash_fn(j, attempt);
+    int hash_slot = start_hash_slot;
+    while (result->i[hash_slot] == i && result->j[hash_slot] != j && 
+	   attempt < OPT_TEST_PATH_SIZE) { // probe past slots of this i holding another atom
+      hash_slot = aut_lj_tap_hash_fn(j, ++attempt);
+    }
+    if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits; // no usable slot found
+    bool init_slot = result->i[hash_slot] != i; // slot was last written for a different i
+    if (init_slot || (1 - wj < result->cij[hash_slot])) { // new entry, or smaller cij
+      result->i[hash_slot] = i;
+      result->j[hash_slot] = j;
+      result->cij[hash_slot] = 1 - wj;
+      if (wj != 1.0) { // a path is stored only when the weight is not exactly 1
+        if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
+        result->testpath_idx[hash_slot] = path_insert_pos;
+        LennardJonesPathAIREBOT<flt_t> *path = 
+	  &result->testpath[path_insert_pos++];
+        path->num = 2;
+        path->del[0].x = dijx;
+        path->del[0].y = dijy;
+        path->del[0].z = dijz;
+        if (rij == 0) rij = sqrt(rijsq);
+        path->r[0] = rij;
+        path->w[0] = wj;
+        path->dw[0] = dwj;
+        path->idx[0] = i;
+        path->idx[1] = j;
+      }
+    }
+    int * neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]];
+    for (int kk = 0; kk < ka->neigh_rebo.num[j]; kk++) { // paths i-j-k
+      int k = neighs_j[kk];
+      if (k == i) continue;
+      int ktype = map[x[k].w];
+      flt_t djkx = x[k].x - x[j].x;
+      flt_t djky = x[k].y - x[j].y;
+      flt_t djkz = x[k].z - x[j].z;
+      flt_t rjksq = djkx * djkx + djky * djky + djkz * djkz;
+      flt_t wk = 1, dwk = 0;
+      flt_t rjk = 0;
+      if (rjksq >= rcminsq[jtype][ktype]) {
+        rjk = overloaded::sqrt(rjksq);
+        wk = Sp(rjk, rcmin[jtype][ktype], rcmax[jtype][ktype], &dwk);
+      }
+      int attempt = 0;
+      int start_hash_slot = aut_lj_tap_hash_fn(k, attempt);
+      int hash_slot = start_hash_slot;
+      while (result->i[hash_slot] == i && result->j[hash_slot] != k && 
+	     attempt < OPT_TEST_PATH_SIZE) {
+        hash_slot = aut_lj_tap_hash_fn(k, ++attempt);
+      }
+      if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
+      bool init_slot = result->i[hash_slot] != i;
+      if (init_slot || (1 - wj * wk < result->cij[hash_slot])) {
+        result->i[hash_slot] = i;
+        result->j[hash_slot] = k;
+        result->cij[hash_slot] = 1 - wj * wk;
+        if (wj * wk != 1.0) {
+          if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
+          result->testpath_idx[hash_slot] = path_insert_pos;
+          LennardJonesPathAIREBOT<flt_t> *path = 
+	    &result->testpath[path_insert_pos++];
+          path->num = 3;
+          path->del[0].x = dijx;
+          path->del[0].y = dijy;
+          path->del[0].z = dijz;
+          if (rij == 0) rij = sqrt(rijsq); // still unset if rij was below rcmin
+          path->r[0] = rij;
+          path->del[1].x = djkx;
+          path->del[1].y = djky;
+          path->del[1].z = djkz;
+          if (rjk == 0) rjk = sqrt(rjksq);
+          path->r[1] = rjk;
+          path->w[0] = wj;
+          path->dw[0] = dwj;
+          path->w[1] = wk;
+          path->dw[1] = dwk;
+          path->idx[0] = i;
+          path->idx[1] = j;
+          path->idx[2] = k;
+        }
+      }
+      int * neighs_k = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[k]];
+      for (int ll = 0; ll < ka->neigh_rebo.num[k]; ll++) { // paths i-j-k-l
+        int l = neighs_k[ll];
+        if ((l == i) || (l == j)) continue;
+        int ltype = map[x[l].w];
+        flt_t dklx = x[l].x - x[k].x;
+        flt_t dkly = x[l].y - x[k].y;
+        flt_t dklz = x[l].z - x[k].z;
+        flt_t rklsq = dklx * dklx + dkly * dkly + dklz * dklz;
+        flt_t wl = 1, dwl = 0;
+        flt_t rkl = 0;
+        if (rklsq >= rcminsq[ktype][ltype]) {
+          rkl = overloaded::sqrt(rklsq);
+          wl = Sp(rkl, rcmin[ktype][ltype], rcmax[ktype][ltype], &dwl);
+        }
+        int attempt = 0;
+        int start_hash_slot = aut_lj_tap_hash_fn(l, attempt);
+        int hash_slot = start_hash_slot;
+        while (result->i[hash_slot] == i && result->j[hash_slot] != l && 
+	       attempt < OPT_TEST_PATH_SIZE) {
+          hash_slot = aut_lj_tap_hash_fn(l, ++attempt);
+        }
+        if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
+        bool init_slot = result->i[hash_slot] != i;
+        if (init_slot || (1 - wj * wk * wl < result->cij[hash_slot])) {
+          result->i[hash_slot] = i;
+          result->j[hash_slot] = l;
+          result->cij[hash_slot] = 1 - wj * wk * wl;
+          if (wj * wk * wl != 1.0) {
+            if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
+            result->testpath_idx[hash_slot] = path_insert_pos;
+            LennardJonesPathAIREBOT<flt_t> *path = 
+	      &result->testpath[path_insert_pos++];
+            path->num = 4;
+            path->del[0].x = dijx;
+            path->del[0].y = dijy;
+            path->del[0].z = dijz;
+            if (rij == 0) rij = sqrt(rijsq);
+            path->r[0] = rij;
+            path->del[1].x = djkx;
+            path->del[1].y = djky;
+            path->del[1].z = djkz;
+            if (rjk == 0) rjk = sqrt(rjksq);
+            path->r[1] = rjk;
+            path->del[2].x = dklx;
+            path->del[2].y = dkly;
+            path->del[2].z = dklz;
+            if (rkl == 0) rkl = sqrt(rklsq);
+            path->r[2] = rkl;
+            path->w[0] = wj;
+            path->dw[0] = dwj;
+            path->w[1] = wk;
+            path->dw[1] = dwk;
+            path->w[2] = wl;
+            path->dw[2] = dwl;
+            path->idx[0] = i;
+            path->idx[1] = j;
+            path->idx[2] = k;
+            path->idx[3] = l;
+          }
+        }
+      }
+    }
+  }
+  return true;
+exceed_limits: // hash-map probing or the testpath store ran out of room
+  return false;
+}
+
+/*
+ * Vectorized hash-map lookup of the pairs (i_bc, j) in lanes of need_search.
+ */
+static fvec aut_airebo_lj_tap_test_path(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+  struct aut_airebo_lj_test_path_result_data * test_path_result,
+  bvec need_search, ivec i_bc, ivec j, 
+  LennardJonesPathAIREBOT<flt_t> path[fvec::VL]
+) {
+  const ivec c_i1 = ivec::set1(1);
+  fvec cij = fvec::set1(1.0);
+  // Result: per-lane cij read from the hash-map; intended to stay 1.0 in
+  // lanes outside need_search and in lanes without an entry for (i_bc, j).
+  // For found lanes with cij > 0, path[lane] receives the stored path.
+  // Probing, per lane:
+  //   slot holds i and j          -> found
+  //   slot holds i, a different j -> bump attempt, rehash, look again
+  //   slot holds a different i    -> not present, stop
+
+  // NOTE(review): the retry loop has no attempt limit; it ends only when
+  // every lane is found or reaches a slot written for a different i.
+  // find all the correct hash slots, and a mask of where found.
+  ivec attempt = ivec::setzero();
+  ivec hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
+  ivec lookup_i = ivec::mask_gather(ivec::undefined(), need_search, hash_slot,
+      &test_path_result->i[0], sizeof(int));
+  bvec correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
+  ivec lookup_j = ivec::mask_gather(ivec::undefined(), correct_i, hash_slot,
+      &test_path_result->j[0], sizeof(int));
+  bvec found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
+  bvec another_attempt = correct_i & ~ found_items; // right i, wrong j: probe on
+  while (bvec::test_any_set(another_attempt)) {
+    attempt = ivec::mask_add(attempt, another_attempt, attempt, c_i1);
+    hash_slot = aut_airebo_lj_tap_hash_fn_vec(j, attempt);
+    ivec lookup_i_2 = ivec::mask_gather(lookup_i, another_attempt, hash_slot,
+        &test_path_result->i[0], sizeof(int));
+    lookup_i = lookup_i_2;
+    correct_i = ivec::mask_cmpeq(need_search, lookup_i, i_bc);
+    lookup_j = ivec::mask_gather(lookup_j, another_attempt, hash_slot,
+        &test_path_result->j[0], sizeof(int));
+    found_items = ivec::mask_cmpeq(correct_i, lookup_j, j);
+    another_attempt = correct_i & ~ found_items;
+  }
+  cij = fvec::mask_gather(cij, found_items, hash_slot, 
+			  &test_path_result->cij[0], sizeof(flt_t));
+  bvec need_testpath = fvec::mask_cmplt(found_items, fvec::setzero(), cij); // 0 < cij
+  if (bvec::test_any_set(need_testpath)) {
+    for (int i = 0; i < fvec::VL; i++) { // i is a lane index here, not an atom
+      if (bvec::test_at(need_testpath, i)) {
+        int testpath_idx = 
+          test_path_result->testpath_idx[ivec::at(hash_slot, i)];
+        path[i] = test_path_result->testpath[testpath_idx];
+      }
+    }
+  }
+  return cij;
+}
+
+/*
+ * Calculates the Lennard-Jones interaction (Morse form if MORSEFLAG) for
+ * fvec::VL pairs (i, j) of types itype/jtype that need a bond-order
+ * calculation; cij/testpath are the per-lane C_ij value and its path.
+ * Structured like aut_frebo_batch_of_kind: the bond-order forces are
+ * computed speculatively and later scaled by the outer derivative dStb.
+ */
+template<int MORSEFLAG>
+static void aut_lj_with_bo(
+    KernelArgsAIREBOT<flt_t,acc_t> * ka,
+    int itype, int jtype,
+    ivec i, ivec j,
+    fvec cij, LennardJonesPathAIREBOT<flt_t> testpath[fvec::VL]
+) {
+ { // jump-scope for exceed_limits
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+
+  ivec c_i4 = ivec::set1(4); // NOTE(review): not used in this function
+  fvec c_1_0 = fvec::set1(1.0);
+  fvec c_2_0 = fvec::set1(2.0);
+  fvec c_0_5 = fvec::set1(0.5);
+
+  fvec x_i, y_i, z_i;
+  aut_loadatoms_vec_notype(x, i, &x_i, &y_i, &z_i);
+  fvec x_j, y_j, z_j;
+  aut_loadatoms_vec_notype(x, j, &x_j, &y_j, &z_j);
+  fvec delx = x_i -  x_j;
+  fvec dely = y_i -  y_j;
+  fvec delz = z_i -  z_j;
+  fvec rsq = delx *  delx +  dely *  dely +  delz *  delz;
+
+  fvec rij = fvec::sqrt(rsq);
+  bvec need_path_force = fvec::cmplt(cij, c_1_0); // lanes with cij < 1
+  flt_t sigcut = ka->params.sigcut;
+  flt_t sigmin = ka->params.sigmin;
+  flt_t sigma = ka->params.sigma[itype][jtype];
+  flt_t rljmax = sigcut * sigma;
+  flt_t rljmin = sigmin * sigma;
+  fvec p_rljmin = fvec::set1(rljmin);
+  fvec p_rljmax = fvec::set1(rljmax);
+
+  fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw);
+
+  fvec p_lj1 = fvec::set1(ka->params.lj1[itype][jtype]);
+  fvec p_lj2 = fvec::set1(ka->params.lj2[itype][jtype]);
+  fvec p_lj3 = fvec::set1(ka->params.lj3[itype][jtype]);
+  fvec p_lj4 = fvec::set1(ka->params.lj4[itype][jtype]);
+
+  fvec r2inv = fvec::recip(rsq);
+
+  fvec vdw, dvdw;
+  if (MORSEFLAG) { // the lj1..lj4 parameters are used differently per branch
+    fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4);
+    vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0);
+    dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr);
+  } else {
+    fvec r6inv = r2inv *  r2inv *  r2inv;
+
+    vdw = r6inv * ( p_lj3 *  r6inv -  p_lj4);
+    fvec r7inv = r6inv *  rij *  r2inv;
+    dvdw = r7inv * ( p_lj2 -  p_lj1 *  r6inv);
+  }
+
+  fvec VLJ = vdw *  slw;
+  fvec dVLJ = dvdw *  slw +  vdw *  dslw; // product rule on vdw * slw
+
+  fvec p_rcLJmin = fvec::set1(ka->params.rcLJmin[itype][jtype]);
+  fvec p_rcLJmax = fvec::set1(ka->params.rcLJmax[itype][jtype]);
+  fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr);
+  fvec VA = cij *  VLJ *  Str;
+
+  fvec fij[3], fji[3];
+  fij[0] = fvec::setzero(); fij[1] = fvec::setzero();
+  fij[2] = fvec::setzero();
+  fji[0] = fvec::setzero(); fji[1] = fvec::setzero();
+  fji[2] = fvec::setzero();
+
+  ivec vi = i;
+  ivec vj = j;
+
+  struct aut_frebo_data i_data, j_data;
+  i_data.x_i = x_i;
+  i_data.y_i = y_i;
+  i_data.z_i = z_i;
+  i_data.x_j = x_j;
+  i_data.y_j = y_j;
+  i_data.z_j = z_j;
+  j_data.x_i = x_j; // j_data holds the same pair with the roles of i and j swapped
+  j_data.y_i = y_j;
+  j_data.z_i = z_j;
+  j_data.x_j = x_i;
+  j_data.y_j = y_i;
+  j_data.z_j = z_i;
+
+  fvec p_rcmin = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec p_rcmax = fvec::set1(ka->params.rcmax[itype][jtype]);
+  fvec dwij;
+  fvec wij = aut_Sp_deriv(rij, p_rcmin, p_rcmax, &dwij);
+
+  fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
+  fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
+  fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
+  fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
+  fvec Nij = nHi +  nCi -  wij;
+  fvec Nji = nHj +  nCj -  wij;
+  i_data.nHi = nHi;
+  i_data.nCi = nCi;
+  j_data.nHi = nHj;
+  j_data.nCi = nCj;
+
+  fvec the_r = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec scale = the_r / rij; // rescales the i-j vector to length rcmin
+
+  fvec NconjtmpI;
+  fvec pij = aut_frebo_pij_pd_2(ka, &i_data, itype, jtype, vi, vj, 
+				delx * scale, dely * scale, delz * scale, 
+				the_r, wij, VA, &NconjtmpI, fij);
+
+  if (i_data.buf_len < 0) goto exceed_limits; // redo the whole batch in scalar code
+
+  fvec NconjtmpJ;
+  fvec rjix = fvec::setzero() -  delx;
+  fvec rjiy = fvec::setzero() -  dely;
+  fvec rjiz = fvec::setzero() -  delz;
+  fvec pji = aut_frebo_pij_pd_2(ka, &j_data, jtype, itype, vj, vi, 
+				rjix * scale, rjiy * scale, rjiz * scale, 
+				the_r, wij, VA, &NconjtmpJ, fji);
+  fij[0] = fij[0] -  fji[0];
+  fij[1] = fij[1] -  fji[1];
+  fij[2] = fij[2] -  fji[2];
+
+  if (j_data.buf_len < 0) goto exceed_limits;
+
+  fvec Nijconj = c_1_0 +  NconjtmpI *  NconjtmpI +  NconjtmpJ *  NconjtmpJ;
+  fvec dN3[3];
+  fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+
+  fvec c_TOL = fvec::set1(TOL);
+  fvec dN3_dh[3];
+  fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3_dh[0]);
+  bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL); // presumably |Tij| > TOL
+  fvec sum_omega = fvec::setzero();
+  if (bvec::test_any_set(TijgtTOLmask)) { // skipped when no lane is selected
+    sum_omega = aut_frebo_sum_omega(
+        ka, &i_data, &j_data, itype, jtype, vi, vj,
+        delx * scale, dely * scale, delz * scale, the_r, VA *  Tij, fij);
+    sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega);
+  }
+  fvec pi_dh = Tij *  sum_omega;
+
+  fvec bij = c_0_5 * ( pij +  pji) + pi_rc +  pi_dh; // bij = (pij + pji)/2 + pi_rc + pi_dh
+
+  fvec p_bLJmin = fvec::set1(ka->params.bLJmin[itype][jtype]);
+  fvec p_bLJmax = fvec::set1(ka->params.bLJmax[itype][jtype]);
+  fvec dStb, Stb = aut_Sp2_deriv(bij, p_bLJmin, p_bLJmax, &dStb);
+
+  bvec need_bo_deriv = fvec::cmpneq(dStb, fvec::setzero()); // lanes with dStb != 0
+  // fix up j_data, i_data, fij:
+  // multiply each by dStb
+  if (bvec::test_any_set(need_bo_deriv)) {
+    i_data.force_i_x = dStb * i_data.force_i_x;
+    i_data.force_i_y = dStb * i_data.force_i_y;
+    i_data.force_i_z = dStb * i_data.force_i_z;
+    i_data.force_j_x = dStb * i_data.force_j_x;
+    i_data.force_j_y = dStb * i_data.force_j_y;
+    i_data.force_j_z = dStb * i_data.force_j_z;
+    j_data.force_i_x = dStb * j_data.force_i_x;
+    j_data.force_i_y = dStb * j_data.force_i_y;
+    j_data.force_i_z = dStb * j_data.force_i_z;
+    j_data.force_j_x = dStb * j_data.force_j_x;
+    j_data.force_j_y = dStb * j_data.force_j_y;
+    j_data.force_j_z = dStb * j_data.force_j_z;
+    for (int k = 0; k < i_data.buf_len; k++) {
+      i_data.force_k_x_buf[k] = dStb * i_data.force_k_x_buf[k];
+      i_data.force_k_y_buf[k] = dStb * i_data.force_k_y_buf[k];
+      i_data.force_k_z_buf[k] = dStb * i_data.force_k_z_buf[k];
+    }
+    for (int k = 0; k < j_data.buf_len; k++) {
+      j_data.force_k_x_buf[k] = dStb * j_data.force_k_x_buf[k];
+      j_data.force_k_y_buf[k] = dStb * j_data.force_k_y_buf[k];
+      j_data.force_k_z_buf[k] = dStb * j_data.force_k_z_buf[k];
+    }
+    fvec fijc[3];
+    fijc[0] = dStb * fij[0];
+    fijc[1] = dStb * fij[1];
+    fijc[2] = dStb * fij[2];
+    fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * 
+				 fijc[1] + delz * delx * fijc[2]) / rsq);
+    fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * 
+				 fijc[1] + delz * dely * fijc[2]) / rsq);
+    fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * 
+				 fijc[1] + delz * delz * fijc[2]) / rsq);
+
+    aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, dStb * VA, 
+			     dN3[0], dN3[2], NconjtmpI);
+    aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, dStb * VA, 
+			     dN3[1], dN3[2], NconjtmpJ);
+    if (bvec::test_any_set(TijgtTOLmask)) {
+      aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, 
+			       dStb * VA * sum_omega, dN3_dh[0], dN3_dh[2], 
+			       NconjtmpI);
+      aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, 
+			       dStb * VA * sum_omega, dN3_dh[1], dN3_dh[2], 
+			       NconjtmpJ);
+    }
+
+    aut_frebo_data_writeback(ka, &i_data);
+    aut_frebo_data_writeback(ka, &j_data);
+  } else { // dStb is zero in every lane: drop the speculative bond-order force
+    fij[0] = fvec::setzero();
+    fij[1] = fvec::setzero();
+    fij[2] = fvec::setzero();
+  }
+
+  fvec fpdVLJ = cij *  dVLJ * ( c_1_0 +  Str * ( Stb -  c_1_0));
+  fvec fpdStr = dStr *  cij * ( Stb *  VLJ -  VLJ);
+  fvec fpair = r2inv *  rij * ( fvec::setzero() - ( fpdVLJ +  fpdStr)); // r2inv * rij = 1/rij
+  fvec evdwl = VA *  Stb +  cij *  VLJ * ( c_1_0 -  Str);
+
+  fvec result_f_i_x = fpair *  delx +  fij[0];
+  fvec result_f_i_y = fpair *  dely +  fij[1];
+  fvec result_f_i_z = fpair *  delz +  fij[2];
+  fvec result_f_j_x = fvec::setzero() -  result_f_i_x; // j receives the negated i force
+  fvec result_f_j_y = fvec::setzero() -  result_f_i_y;
+  fvec result_f_j_z = fvec::setzero() -  result_f_i_z;
+
+  flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64))); // staging for the lane loop below
+  flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
+
+  if (bvec::test_any_set(need_bo_deriv)) { // accumulators are only valid in this case
+    result_f_i_x = i_data.force_i_x +  result_f_i_x;
+    result_f_i_y = i_data.force_i_y +  result_f_i_y;
+    result_f_i_z = i_data.force_i_z +  result_f_i_z;
+    result_f_j_x = i_data.force_j_x +  result_f_j_x;
+    result_f_j_y = i_data.force_j_y +  result_f_j_y;
+    result_f_j_z = i_data.force_j_z +  result_f_j_z;
+
+    result_f_i_x = j_data.force_j_x +  result_f_i_x; // j_data's "j" is atom i, and vice versa
+    result_f_i_y = j_data.force_j_y +  result_f_i_y;
+    result_f_i_z = j_data.force_j_z +  result_f_i_z;
+    result_f_j_x = j_data.force_i_x +  result_f_j_x;
+    result_f_j_y = j_data.force_i_y +  result_f_j_y;
+    result_f_j_z = j_data.force_i_z +  result_f_j_z;
+  }
+
+  fvec::store(fi_x_buf, result_f_i_x);
+  fvec::store(fi_y_buf, result_f_i_y);
+  fvec::store(fi_z_buf, result_f_i_z);
+  ivec::store(fi_i_buf, vi);
+  fvec::store(fj_x_buf, result_f_j_x);
+  fvec::store(fj_y_buf, result_f_j_y);
+  fvec::store(fj_z_buf, result_f_j_z);
+  ivec::store(fj_j_buf, vj);
+  fvec::store(evdwl_buf, evdwl);
+
+  int lane;
+  for (lane = 0; lane < fvec::VL; lane++) { // accumulate into result_f one lane at a time
+    int ii = fi_i_buf[lane];
+    result_f[ii].x += fi_x_buf[lane];
+    result_f[ii].y += fi_y_buf[lane];
+    result_f[ii].z += fi_z_buf[lane];
+    result_f[ii].w += 0.5 * evdwl_buf[lane]; // .w takes half of the pair energy
+    int jj = fj_j_buf[lane];
+    result_f[jj].x += fj_x_buf[lane];
+    result_f[jj].y += fj_y_buf[lane];
+    result_f[jj].z += fj_z_buf[lane];
+    result_f[jj].w += 0.5 * evdwl_buf[lane]; // the other half goes to j
+  }
+  ka->result_eng += fvec::reduce_add(evdwl);
+
+  if (bvec::test_any_set(need_path_force)) { // path forces, applied in scalar code
+    fvec dC = VLJ * ( Str *  Stb +  c_1_0 -  Str);
+    aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
+  }
+  return;
+ }
+exceed_limits: // scalar fallback: every pair of the batch goes through ref_*
+  for (int l = 0; l < fvec::VL; l++) {
+    ref_lennard_jones_single_interaction(ka, ivec::at(i, l), ivec::at(j, l), 
+					 MORSEFLAG);
+  }
+  return;
+}
+
+/*
+ * Calculate the Lennard-Jones interaction.
+ * Uses the above hash-map, and outlines the calculation if the bondorder is
+ *  needed.
+ * Aggressively compresses to get the most values calculated.
+ */
+template<int MORSEFLAG>
+static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * tag = ka->tag;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  ivec c_i1 = ivec::set1(1);
+  ivec c_i4 = ivec::set1(4);
+  fvec c_1_0 = fvec::set1(1.0);
+  fvec c_2_0 = fvec::set1(2.0);
+  fvec c_0_0 = fvec::set1(0.0);
+  int map_i_scalar = 0;
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  fvec result_eng = fvec::setzero();
+
+  struct aut_airebo_lj_test_path_result_data test_path_result;
+  for (int i = 0; i < OPT_TEST_PATH_SIZE; i++) {
+    test_path_result.i[i] = -1;
+  }
+
+  ivec i_bo[2][2];
+  ivec j_bo[2][2];
+  fvec cij_bo[2][2];
+  LennardJonesPathAIREBOT<flt_t> testpath_bo[2][2][fvec::VL];
+  int num_bo[2][2] = {0};
+
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
+    ivec itag_bc = ivec::set1(tag[i]);
+    int itype = map[x[i].w];
+    fvec x_i = fvec::set1(x[i].x);
+    fvec y_i = fvec::set1(x[i].y);
+    fvec z_i = fvec::set1(x[i].z);
+    ivec i_bc = ivec::set1(i);
+
+    fvec cutljsq0 = fvec::set1(ka->params.cutljsq[itype][0]);
+    fvec cutljsq1 = fvec::set1(ka->params.cutljsq[itype][1]);
+    fvec p_rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+    fvec p_rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+    flt_t sigcut = ka->params.sigcut;
+    flt_t sigmin = ka->params.sigmin;
+    flt_t sigma0 = ka->params.sigma[itype][0];
+    flt_t rljmax0 = sigcut * sigma0;
+    flt_t rljmin0 = sigmin * sigma0;
+    flt_t sigma1 = ka->params.sigma[itype][1];
+    flt_t rljmax1 = sigcut * sigma1;
+    flt_t rljmin1 = sigmin * sigma1;
+    fvec p_rljmax0 = fvec::set1(rljmax0);
+    fvec p_rljmax1 = fvec::set1(rljmax1);
+    fvec p_rljmin0 = fvec::set1(rljmin0);
+    fvec p_rljmin1 = fvec::set1(rljmin1);
+    fvec p_rcLJmax0 = fvec::set1(ka->params.rcLJmax[itype][0]);
+    fvec p_rcLJmax1 = fvec::set1(ka->params.rcLJmax[itype][1]);
+    fvec p_rcLJmin0 = fvec::set1(ka->params.rcLJmin[itype][0]);
+    fvec p_rcLJmin1 = fvec::set1(ka->params.rcLJmin[itype][1]);
+    fvec p_lj10 = fvec::set1(ka->params.lj1[itype][0]);
+    fvec p_lj11 = fvec::set1(ka->params.lj1[itype][1]);
+    fvec p_lj20 = fvec::set1(ka->params.lj2[itype][0]);
+    fvec p_lj21 = fvec::set1(ka->params.lj2[itype][1]);
+    fvec p_lj30 = fvec::set1(ka->params.lj3[itype][0]);
+    fvec p_lj31 = fvec::set1(ka->params.lj3[itype][1]);
+    fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]);
+    fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]);
+
+    int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
+    int jnum = ka->neigh_lmp.num_half[i];
+
+    bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result);
+    if (! tap_success) {
+      for (int jj = 0; jj < jnum; jj++) {
+        ref_lennard_jones_single_interaction(ka, i, neighs[jj], MORSEFLAG);
+      }
+      continue;
+    }
+
+    ivec j_2;
+    fvec delx_2, dely_2, delz_2, rsq_2;
+    bvec jtype_mask_2;
+    int num_2 = 0;
+
+    fvec result_f_i_x = fvec::setzero();
+    fvec result_f_i_y = fvec::setzero();
+    fvec result_f_i_z = fvec::setzero();
+
+    int jj = 0;
+    bool rest_j = jj < jnum;
+    bool rest_2 = fvec::fast_compress();
+    #pragma forceinline recursive
+    while (rest_j || rest_2) {
+      fvec delx, dely, delz, rsq;
+      bvec jtype_mask, within_cutoff;
+      ivec j;
+      if (rest_j) {
+        bvec mask_0 = bvec::full();
+	//0xFF >> (8 - (jnum - jj));
+        if (jj + (fvec::VL - 1) >= jnum) mask_0 = bvec::only(jnum - jj);
+        j = ivec::maskz_loadu(mask_0, &neighs[jj]);
+        fvec x_j, y_j, z_j;
+        aut_loadatoms_vec(x, j, &x_j, &y_j, &z_j, &jtype_mask, map, map_i, 
+			  c_i1);
+        fvec::gather_prefetch0(ivec::mullo(c_i4, 
+	  ivec::maskz_loadu(bvec::full(), &neighs[jj + fvec::VL])), x);
+        _mm_prefetch((const char*)&neighs[jj + 2 * fvec::VL], _MM_HINT_T0);
+        delx = x_i -  x_j;
+        dely = y_i -  y_j;
+        delz = z_i -  z_j;
+        rsq = delx *  delx +  dely *  dely +  delz *  delz;
+        fvec cutoff_sq = fvec::mask_blend(jtype_mask, cutljsq0, cutljsq1);
+        within_cutoff = fvec::mask_cmplt(mask_0, rsq, cutoff_sq);
+
+        if (fvec::fast_compress()) {
+          j = ivec::masku_compress(within_cutoff, j);
+          delx = fvec::masku_compress(within_cutoff, delx);
+          dely = fvec::masku_compress(within_cutoff, dely);
+          delz = fvec::masku_compress(within_cutoff, delz);
+          rsq = fvec::masku_compress(within_cutoff, rsq);
+          jtype_mask = bvec::masku_compress(within_cutoff, jtype_mask);
+          //within_cutoff = 0xFF >> (8 - _cc_popcnt(within_cutoff));
+
+          bvec mask_2 = bvec::after(num_2);//0xFF << num_2;
+          j_2 = ivec::mask_expand(j_2, mask_2, j);
+          delx_2 = fvec::mask_expand(delx_2, mask_2, delx);
+          dely_2 = fvec::mask_expand(dely_2, mask_2, dely);
+          delz_2 = fvec::mask_expand(delz_2, mask_2, delz);
+          rsq_2 = fvec::mask_expand(rsq_2, mask_2, rsq);
+          jtype_mask_2 = bvec::mask_expand(jtype_mask_2, mask_2, jtype_mask);
+          num_2 = num_2 + bvec::popcnt(within_cutoff);
+          if (num_2 < fvec::VL) {
+            jj += fvec::VL;
+            rest_j = jj < jnum;
+            continue;
+          }
+
+          num_2 -= fvec::VL;
+	  //(0xFF >> (8 - num_2)) << (_cc_popcnt(within_cutoff) - num_2);
+          mask_2 = bvec::onlyafter(num_2, bvec::popcnt(within_cutoff) - num_2);
+          {
+            ivec tmp_j = j_2;
+            j_2 = ivec::masku_compress(mask_2, j);
+            j = tmp_j;
+            fvec tmp_delx = delx_2;
+            delx_2 = fvec::masku_compress(mask_2, delx);
+            delx = tmp_delx;
+            fvec tmp_dely = dely_2;
+            dely_2 = fvec::masku_compress(mask_2, dely);
+            dely = tmp_dely;
+            fvec tmp_delz = delz_2;
+            delz_2 = fvec::masku_compress(mask_2, delz);
+            delz = tmp_delz;
+            fvec tmp_rsq = rsq_2;
+            rsq_2 = fvec::masku_compress(mask_2, rsq);
+            rsq = tmp_rsq;
+            bvec tmp_jtype_mask = jtype_mask_2;
+            jtype_mask_2 = bvec::masku_compress(mask_2, jtype_mask);
+            jtype_mask = tmp_jtype_mask;
+            within_cutoff = bvec::full();
+          }
+        }
+      } else if (rest_2) {
+        rest_2 = false;
+        j = j_2;
+        delx = delx_2;
+        dely = dely_2;
+        delz = delz_2;
+        rsq = rsq_2;
+        jtype_mask = jtype_mask_2;
+        within_cutoff = bvec::only(num_2);
+        num_2 = 0;
+      }
+
+      bvec current_mask = within_cutoff;
+      if (bvec::test_all_unset(current_mask)) {
+        jj += fvec::VL;
+        rest_j = jj < jnum;
+        continue;
+      }
+
+      fvec rij = fvec::sqrt(rsq);
+      LennardJonesPathAIREBOT<flt_t> testpath[fvec::VL];
+      fvec cij = c_1_0;
+      fvec p_cut3rebo = fvec::set1(ka->params.cut3rebo);
+      bvec need_search = fvec::mask_cmplt(current_mask, rij, p_cut3rebo);
+      if (bvec::test_any_set(need_search)) {
+        fvec p_rcmax = fvec::mask_blend(jtype_mask, p_rcmax0, p_rcmax1);
+        #pragma noinline
+        cij = aut_airebo_lj_tap_test_path(ka, &test_path_result, need_search, 
+					  i_bc, j, testpath);
+      }
+      current_mask = fvec::mask_cmplt(current_mask, c_0_0, cij);
+      if (bvec::test_all_unset(current_mask)) {
+        jj += fvec::VL;
+        rest_j = jj < jnum;
+        continue;
+      }
+      bvec need_path_force = fvec::mask_cmplt(current_mask, cij, c_1_0);
+
+      fvec p_rljmax = fvec::mask_blend(jtype_mask, p_rljmax0, p_rljmax1);
+      fvec p_rljmin = fvec::mask_blend(jtype_mask, p_rljmin0, p_rljmin1);
+
+      fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw);
+
+      fvec p_lj1 = fvec::mask_blend(jtype_mask, p_lj10, p_lj11);
+      fvec p_lj2 = fvec::mask_blend(jtype_mask, p_lj20, p_lj21);
+      fvec p_lj3 = fvec::mask_blend(jtype_mask, p_lj30, p_lj31);
+      fvec p_lj4 = fvec::mask_blend(jtype_mask, p_lj40, p_lj41);
+
+      fvec vdw, dvdw;
+
+      fvec r2inv = fvec::recip(rsq);
+
+      if (MORSEFLAG) {
+        fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4);
+        vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0);
+        dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr);
+      } else {
+        fvec r6inv = r2inv *  r2inv *  r2inv;
+
+        vdw = r6inv * ( p_lj3 *  r6inv -  p_lj4);
+        fvec r7inv = r6inv *  rij *  r2inv;
+        dvdw = r7inv * ( p_lj2 -  p_lj1 *  r6inv);
+      }
+
+      fvec VLJ = vdw *  slw;
+      fvec dVLJ = dvdw *  slw +  vdw *  dslw;
+
+      fvec p_rcLJmin = fvec::mask_blend(jtype_mask, p_rcLJmin0, p_rcLJmin1);
+      fvec p_rcLJmax = fvec::mask_blend(jtype_mask, p_rcLJmax0, p_rcLJmax1);
+      fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr);
+      fvec VA = cij *  VLJ *  Str;
+      bvec need_bondorder = fvec::mask_cmplt(current_mask, c_0_0, Str);
+      fvec Stb = fvec::setzero();
+      fvec fij[3];
+      fij[0] = fvec::setzero();
+      fij[1] = fvec::setzero();
+      fij[2] = fvec::setzero();
+      if (bvec::test_any_set(need_bondorder)) {
+        for (int jtype = 0; jtype < 2; jtype++) {
+          bvec need_bo_with_jtype = need_bondorder;
+          if (jtype) need_bo_with_jtype = need_bo_with_jtype & jtype_mask;
+          else need_bo_with_jtype = need_bo_with_jtype & ~ jtype_mask;
+          ivec jtmp = ivec::masku_compress(need_bo_with_jtype, j);
+          ivec itmp = ivec::masku_compress(need_bo_with_jtype, ivec::set1(i));
+          fvec cijtmp = fvec::masku_compress(need_bo_with_jtype, cij);
+          bvec insert_mask = bvec::after(num_bo[itype][jtype]);
+          i_bo[itype][jtype] = ivec::mask_expand(i_bo[itype][jtype], 
+						 insert_mask, itmp);
+          j_bo[itype][jtype] = ivec::mask_expand(j_bo[itype][jtype], 
+						 insert_mask, jtmp);
+          cij_bo[itype][jtype] = fvec::mask_expand(cij_bo[itype][jtype], 
+						   insert_mask, cijtmp);
+          bvec need_path_force_with_jtype = need_bo_with_jtype & 
+	    need_path_force;
+          int testpath_end = fvec::VL;
+          if (bvec::test_any_set(need_path_force_with_jtype)) {
+            int pos = num_bo[itype][jtype];
+            for (int l = 0; l < fvec::VL; l++) {
+              if (pos >= fvec::VL) {
+                testpath_end = l;
+                break;
+              }
+              if (bvec::test_at(need_path_force_with_jtype, l)) {
+                testpath_bo[itype][jtype][pos] = testpath[l];
+              }
+              if (bvec::test_at(need_bo_with_jtype, l)) {
+                pos += 1;
+              }
+            }
+          }
+          num_bo[itype][jtype] = num_bo[itype][jtype] + 
+	    bvec::popcnt(need_bo_with_jtype);
+          if (num_bo[itype][jtype] >= fvec::VL) {
+            #pragma noinline
+            aut_lj_with_bo<MORSEFLAG>(ka, itype, jtype, i_bo[itype][jtype], 
+				      j_bo[itype][jtype], cij_bo[itype][jtype],
+				      testpath_bo[itype][jtype]);
+            num_bo[itype][jtype] -= fvec::VL;
+            insert_mask = bvec::onlyafter(num_bo[itype][jtype], 
+					  bvec::popcnt(need_bo_with_jtype) - 
+					  num_bo[itype][jtype]);
+            i_bo[itype][jtype] = ivec::masku_compress(insert_mask, itmp);
+            j_bo[itype][jtype] = ivec::masku_compress(insert_mask, jtmp);
+            cij_bo[itype][jtype] = fvec::masku_compress(insert_mask, cijtmp);
+            if (bvec::test_any_set(need_path_force_with_jtype)) {
+              int pos = 0;
+              for (int l = testpath_end; l < fvec::VL; l++) {
+                if (bvec::test_at(need_path_force_with_jtype, l)) {
+                  testpath_bo[itype][jtype][pos] = testpath[l];
+                }
+                if (bvec::test_at(need_bo_with_jtype, l)) {
+                  pos += 1;
+                }
+              }
+            }
+          }
+        }
+        current_mask = current_mask & ~ need_bondorder;
+        need_path_force = need_path_force & ~ need_bondorder;
+      }
+
+      fvec fpdVLJ = cij *  dVLJ * ( c_1_0 +  Str * ( Stb -  c_1_0));
+      fvec fpdStr = dStr *  cij * ( Stb *  VLJ -  VLJ);
+      fvec fpair = r2inv *  rij * ( fvec::setzero() - ( fpdVLJ +  fpdStr));
+      fvec evdwl = VA *  Stb +  cij *  VLJ * ( c_1_0 -  Str);
+
+      fvec fix = fpair *  delx +  fij[0];
+      fvec fiy = fpair *  dely +  fij[1];
+      fvec fiz = fpair *  delz +  fij[2];
+      result_f_i_x = fvec::mask_add(result_f_i_x, current_mask, result_f_i_x, 
+				    fix);
+      result_f_i_y = fvec::mask_add(result_f_i_y, current_mask, result_f_i_y, 
+				    fiy);
+      result_f_i_z = fvec::mask_add(result_f_i_z, current_mask, result_f_i_z, 
+				    fiz);
+      result_eng = fvec::mask_add(result_eng, current_mask, result_eng, evdwl);
+
+      ivec j_dbl_idx = ivec::mullo(j, c_i4);
+      avec fjx = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, 
+				   &ka->result_f[0].x, sizeof(acc_t));
+      avec fjy = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, 
+				   &ka->result_f[0].y, sizeof(acc_t));
+      avec fjz = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, 
+				   &ka->result_f[0].z, sizeof(acc_t));
+
+      fjx = fjx -  fix;
+      fjy = fjy -  fiy;
+      fjz = fjz -  fiz;
+      avec::mask_i32loscatter(&ka->result_f[0].x, current_mask, j_dbl_idx, fjx, 
+			      sizeof(acc_t));
+      avec::mask_i32loscatter(&ka->result_f[0].y, current_mask, j_dbl_idx, fjy, 
+			      sizeof(acc_t));
+      avec::mask_i32loscatter(&ka->result_f[0].z, current_mask, j_dbl_idx, fjz, 
+			      sizeof(acc_t));
+
+      if (bvec::test_any_set(need_path_force)) {
+        fvec dC = VLJ * ( Str *  Stb +  c_1_0 -  Str);
+        #pragma noinline
+        aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
+      }
+      jj += fvec::VL;
+      rest_j = jj < jnum;
+    }
+    ka->result_f[i].x += fvec::reduce_add(result_f_i_x);
+    ka->result_f[i].y += fvec::reduce_add(result_f_i_y);
+    ka->result_f[i].z += fvec::reduce_add(result_f_i_z);
+  }
+  for (int itype = 0; itype < 2; itype++) {
+    for (int jtype = 0; jtype < 2; jtype++) {
+      for (int l = 0; l < num_bo[itype][jtype]; l++) {
+        ref_lennard_jones_single_interaction(ka,ivec::at(i_bo[itype][jtype],l),
+					     ivec::at(j_bo[itype][jtype], l),
+					     MORSEFLAG);
+      }
+    }
+  }
+  ka->result_eng += fvec::reduce_add(result_eng);
+}
+
+};
+
+/*
+ * Public entry point of the Lennard-Jones kernel.
+ * With LMP_INTEL_AIREBO_REF defined the scalar reference implementation is
+ *  called; otherwise the runtime morseflag is turned into the compile-time
+ *  MORSEFLAG of the vectorized kernel in aut_wrap.
+ */
+template<typename flt_t, typename acc_t>
+void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag) {
+#ifndef LMP_INTEL_AIREBO_REF
+  typedef aut_wrap<flt_t,acc_t> kernels;
+  if (morseflag == 0) {
+    kernels::template aut_lennard_jones<0>(ka);
+    return;
+  }
+  kernels::template aut_lennard_jones<1>(ka);
+#else
+  ref_lennard_jones(ka, morseflag);
+#endif
+}
+
+/*
+ * Public entry point for building the REBO neighbor data.
+ * Forwards to the reference implementation when LMP_INTEL_AIREBO_REF is
+ *  defined and to the vectorized one in aut_wrap otherwise.
+ */
+template<typename flt_t, typename acc_t>
+void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+#ifndef LMP_INTEL_AIREBO_REF
+  typedef aut_wrap<flt_t,acc_t> kernels;
+  kernels::aut_rebo_neigh(ka);
+#else
+  ref_rebo_neigh(ka);
+#endif
+}
+
+/*
+ * Public entry point of the frebo kernel; torsion_flag is passed through
+ *  unchanged.
+ * Forwards to the reference implementation when LMP_INTEL_AIREBO_REF is
+ *  defined and to the vectorized one in aut_wrap otherwise.
+ */
+template<typename flt_t, typename acc_t>
+void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag) {
+#ifndef LMP_INTEL_AIREBO_REF
+  typedef aut_wrap<flt_t,acc_t> kernels;
+  kernels::aut_frebo(ka, torsion_flag);
+#else
+  ref_frebo(ka, torsion_flag);
+#endif
+}
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+}
+
diff --git a/src/USER-INTEL/pair_airebo_intel.h b/src/USER-INTEL/pair_airebo_intel.h
new file mode 100644
index 000000000..d3179c09f
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_intel.h
@@ -0,0 +1,110 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/intel,PairAIREBOIntel)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_INTEL_H
+#define LMP_PAIR_AIREBO_INTEL_H
+
+#include "pair.h"
+#include "fix_intel.h"
+#include "pair_airebo.h"
+//#include "airebo_common.h"
+
+namespace LAMMPS_NS {
+
+template<class flt_t, class acc_t>
+struct PairAIREBOIntelParam;
+
+// USER-INTEL variant of the AIREBO pair style; registered as pair style
+// "airebo/intel" by the PairStyle() entry at the top of this header.
+// Derives from PairAIREBO and adds templated entry points that are
+// parameterized on the precision types flt_t / acc_t of the IntelBuffers.
+class PairAIREBOIntel : public PairAIREBO {
+ public:
+  PairAIREBOIntel(class LAMMPS *);
+  virtual ~PairAIREBOIntel();
+  virtual void compute(int, int);
+  virtual void init_style();
+ protected:
+
+  // compute() overload that works on buffers of a given precision
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers);
+
+  // evaluation for the atom range [astart, aend) - presumably half-open,
+  // confirm against the definition; EVFLAG / EFLAG are compile-time switches
+  template <int EVFLAG, int EFLAG, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+	    IntelBuffers<flt_t,acc_t> * buffers,
+	    const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(IntelBuffers<flt_t,acc_t> * buffers);
+
+  // returns the potential parameters as a PairAIREBOIntelParam by value
+  template <class flt_t, class acc_t>
+  PairAIREBOIntelParam<flt_t,acc_t> get_param();
+
+  // NOTE(review): presumably the fix intel instance and the coprocessor id
+  // it reports - confirm in the .cpp file
+  FixIntel * fix;
+  int _cop;
+
+  // NOTE(review): raw int arrays for the REBO neighbor data; ownership and
+  // sizes are not visible in this header
+  int * REBO_cnumneigh;
+  int * REBO_num_skin;
+  int * REBO_list_data;
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair style AIREBO requires atom IDs
+
+This is a requirement to use the AIREBO potential.
+
+E: Pair style AIREBO requires newton pair on
+
+See the newton command.  This is a restriction to use the AIREBO
+potential.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Neighbor list overflow, boost neigh_modify one
+
+There are too many neighbors of a single atom.  Use the neigh_modify
+command to increase the max number of neighbors allowed for one atom.
+You may also want to boost the page size.
+
+E: Cannot open AIREBO potential file %s
+
+The specified AIREBO potential file cannot be opened.  Check that the
+path and name are correct.
+
+*/
diff --git a/src/USER-INTEL/pair_airebo_morse_intel.cpp b/src/USER-INTEL/pair_airebo_morse_intel.cpp
new file mode 100644
index 000000000..9c0f3b8ed
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_morse_intel.cpp
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#include "pair_airebo_morse_intel.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// all construction work is delegated to the PairAIREBOIntel base class
+PairAIREBOMorseIntel::PairAIREBOMorseIntel(LAMMPS *lmp) :
+  PairAIREBOIntel(lmp)
+{
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+   arguments are processed by the parent style, then morseflag is set
+------------------------------------------------------------------------- */
+
+void PairAIREBOMorseIntel::settings(int narg, char **arg)
+{
+  // the parent style interprets the arguments first
+  this->PairAIREBOIntel::settings(narg, arg);
+
+  // mark this style as the Morse variant
+  this->morseflag = 1;
+}
diff --git a/src/USER-INTEL/pair_airebo_morse_intel.h b/src/USER-INTEL/pair_airebo_morse_intel.h
new file mode 100644
index 000000000..5210ea80e
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_morse_intel.h
@@ -0,0 +1,40 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/morse/intel,PairAIREBOMorseIntel)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_MORSE_INTEL_H
+#define LMP_PAIR_AIREBO_MORSE_INTEL_H
+
+#include "pair_airebo_intel.h"
+
+namespace LAMMPS_NS {
+
+// Morse variant of PairAIREBOIntel; registered as pair style
+// "airebo/morse/intel" by the PairStyle() entry at the top of this header.
+// Only settings() is overridden, everything else comes from the base class.
+class PairAIREBOMorseIntel : public PairAIREBOIntel {
+ public:
+  PairAIREBOMorseIntel(class LAMMPS *);
+  // forwards to PairAIREBOIntel::settings() and then sets morseflag to 1
+  virtual void settings(int, char **);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_eam_alloy_intel.cpp b/src/USER-INTEL/pair_eam_alloy_intel.cpp
new file mode 100644
index 000000000..4f47c7ee2
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_alloy_intel.cpp
@@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_alloy_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMAlloyIntel::PairEAMAlloyIntel(LAMMPS *lmp) :
+  PairEAMIntel(lmp)
+{
+  // coeff() below only accepts the "* *" form, so flag the style accordingly
+  this->one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for all type pairs at once (pair_coeff * * file elem1 ...)
+   reads the DYNAMO setfl file and maps every atom type to an element
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // expect: * * filename plus one element name (or NULL) per atom type
+
+  const int ntypes = atom->ntypes;
+  if (narg != 3 + ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // ensure I,J args are * *
+
+  const bool wildcards =
+    (strcmp(arg[0],"*") == 0) && (strcmp(arg[1],"*") == 0);
+  if (!wildcards)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // discard data from a previous call, then read the EAM setfl file
+
+  if (setfl) {
+    for (int ielem = 0; ielem < setfl->nelements; ++ielem)
+      delete [] setfl->elements[ielem];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // arg[3..] name the element of atom type 1,2,...
+  // map[type] = index of that element in the potential file, -1 if NULL
+
+  for (int iarg = 3; iarg < narg; ++iarg) {
+    const int itype = iarg - 2;
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+    int ielem = 0;
+    while (ielem < setfl->nelements &&
+           strcmp(arg[iarg],setfl->elements[ielem]) != 0)
+      ++ielem;
+    if (ielem >= setfl->nelements)
+      error->all(FLERR,"No matching element in EAM potential file");
+    else map[itype] = ielem;
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  for (int itype = 1; itype <= ntypes; ++itype)
+    for (int jtype = itype; jtype <= ntypes; ++jtype)
+      setflag[itype][jtype] = 0;
+
+  // flag every type pair whose two types are both mapped to an element
+  // the mass of a mapped type is taken from the potential file
+
+  int nset = 0;
+  for (int itype = 1; itype <= ntypes; ++itype) {
+    for (int jtype = itype; jtype <= ntypes; ++jtype) {
+      const bool both_mapped = (map[itype] >= 0) && (map[jtype] >= 0);
+      if (both_mapped) {
+        setflag[itype][jtype] = 1;
+        if (itype == jtype)
+          atom->set_mass(FLERR,itype,setfl->mass[map[itype]]);
+        ++nset;
+      }
+      scale[itype][jtype] = 1.0;
+    }
+  }
+
+  if (nset == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+   rank 0 reads the file, every value is broadcast to all other ranks
+   the data is stored in the Setfl struct that setfl points to
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::read_file(char *filename)
+{
+  Setfl *file = setfl;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = force->open_potential(filename);
+    if (fptr == NULL) {
+      // filename comes from the input script and can be longer than the
+      // message buffer, so the write has to be bounded
+      char str[128];
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read and broadcast header
+  // the first three lines are skipped, the 4th holds nelements + names
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  // split the line in place: the first token (the count) is dropped,
+  // words[] gets the nelements names plus the terminating NULL from strtok
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  // grid description: nrho drho nr dr cutoff
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  // tables are filled starting at index 1, hence the +1 in the sizes
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+                 "pair:z2r");
+
+  // per element: one line with the mass, then the frho and rhor tables
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  // z2r tables are read for the lower triangle (j <= i) only
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   convert the setfl data read from file into the frho, rhor and z2r
+   arrays and fill the type2frho, type2rhor and type2z2r lookup tables
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelements = setfl->nelements;
+
+  // grid parameters are taken over unchanged from the setfl file
+
+  nrho = setfl->nrho;
+  nr = setfl->nr;
+  drho = setfl->drho;
+  dr = setfl->dr;
+  rhomax = (nrho-1) * drho;
+
+  // ------------------------------------------------------------------
+  // frho: one array per setfl element plus one trailing array of zeroes
+  // ------------------------------------------------------------------
+
+  nfrho = nelements + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  for (int ielem = 0; ielem < nelements; ++ielem)
+    for (int k = 1; k <= nrho; ++k)
+      frho[ielem][k] = setfl->frho[ielem][k];
+
+  // the extra array of zeroes is what non-EAM types point to (pair hybrid)
+  // it is needed b/c fp is still computed for non-EAM atoms
+
+  for (int k = 1; k <= nrho; ++k) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
+  // types without an element use the last frho array of zeroes
+
+  for (int itype = 1; itype <= ntypes; ++itype)
+    type2frho[itype] = (map[itype] >= 0) ? map[itype] : nfrho-1;
+
+  // ------------------------------------------------------------------
+  // rhor: one array per setfl element
+  // ------------------------------------------------------------------
+
+  nrhor = nelements;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  for (int ielem = 0; ielem < nelements; ++ielem)
+    for (int k = 1; k <= nr; ++k)
+      rhor[ielem][k] = setfl->rhor[ielem][k];
+
+  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+  // for setfl files, I,J mapping only depends on I
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (int itype = 1; itype <= ntypes; ++itype) {
+    const int ielem = map[itype];
+    for (int jtype = 1; jtype <= ntypes; ++jtype)
+      type2rhor[itype][jtype] = ielem;
+  }
+
+  // ------------------------------------------------------------------
+  // z2r: N*(N+1)/2 arrays where N = # of setfl elements
+  // ------------------------------------------------------------------
+
+  nz2r = nelements * (nelements+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // copy each element pair z2r to global z2r, only for I >= J
+
+  int ipair = 0;
+  for (int ielem = 0; ielem < nelements; ++ielem)
+    for (int jelem = 0; jelem <= ielem; ++jelem, ++ipair)
+      for (int k = 1; k <= nr; ++k)
+        z2r[ipair][k] = setfl->z2r[ielem][jelem][k];
+
+  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
+  // set of z2r arrays only fill lower triangular Nelement matrix
+  // value = sum over rows of lower-triangular matrix until reach irow,icol
+  // swap indices when irow < icol to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int itype = 1; itype <= ntypes; ++itype) {
+    for (int jtype = 1; jtype <= ntypes; ++jtype) {
+      int irow = map[itype];
+      int icol = map[jtype];
+      if (irow == -1 || icol == -1) {
+        type2z2r[itype][jtype] = 0;
+        continue;
+      }
+      if (irow < icol) {
+        const int swap_tmp = irow;
+        irow = icol;
+        icol = swap_tmp;
+      }
+      // rows 0..irow-1 of the triangle hold 1,2,...,irow entries
+      int offset = icol;
+      for (int rowlen = 1; rowlen <= irow; ++rowlen) offset += rowlen;
+      type2z2r[itype][jtype] = offset;
+    }
+  }
+}
diff --git a/src/USER-INTEL/pair_eam_alloy_intel.h b/src/USER-INTEL/pair_eam_alloy_intel.h
new file mode 100644
index 000000000..4967c3709
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_alloy_intel.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/intel,PairEAMAlloyIntel)
+
+#else
+
+#ifndef LMP_PAIR_EAM_ALLOY_INTEL_H
+#define LMP_PAIR_EAM_ALLOY_INTEL_H
+
+#include "pair_eam_intel.h"
+
+namespace LAMMPS_NS {
+
+// USER-INTEL variant of the eam/alloy pair style; registered as pair style
+// "eam/alloy/intel" by the PairStyle() entry at the top of this header.
+// need virtual public b/c of how eam/alloy/opt inherits from it
+
+class PairEAMAlloyIntel : virtual public PairEAMIntel {
+ public:
+  PairEAMAlloyIntel(class LAMMPS *);
+  virtual ~PairEAMAlloyIntel() {}
+  // handles "pair_coeff * * file elem1 elem2 ..." and reads the setfl file
+  void coeff(int, char **);
+
+ protected:
+  // reads a multi-element DYNAMO setfl file into the setfl struct
+  void read_file(char *);
+  // copies the setfl data into the frho/rhor/z2r arrays and type2* tables
+  void file2array();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_eam_fs_intel.cpp b/src/USER-INTEL/pair_eam_fs_intel.cpp
new file mode 100644
index 000000000..cfcc8200c
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_fs_intel.cpp
@@ -0,0 +1,335 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Tim Lau (MIT)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_fs_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMFSIntel::PairEAMFSIntel(LAMMPS *lmp) : PairEAMIntel(lmp)  // all setup is delegated to PairEAMIntel
+{
+  one_coeff = 1;  // NOTE(review): presumably marks the style as taking a single "* *" pair_coeff line (cf. coeff()) - confirm
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read EAM Finnis-Sinclair file
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::coeff(int narg, char **arg)
+{
+  int i,j;
+
+  if (!allocated) allocate();
+  // expected args: * * <potential file> <one element name or NULL per atom type>
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // ensure I,J args are * *
+
+  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // read EAM Finnis-Sinclair file
+  // tables left over from an earlier coeff() call are released before re-reading
+  if (fs) {
+    for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i];
+    delete [] fs->elements;
+    delete [] fs->mass;
+    memory->destroy(fs->frho);
+    memory->destroy(fs->rhor);
+    memory->destroy(fs->z2r);
+    delete fs;
+  }
+  fs = new Fs();
+  read_file(arg[2]);  // fills the fresh Fs from the file named by the third argument
+
+  // read args that map atom types to elements in potential file
+  // map[i] = which element the Ith atom type is, -1 if NULL
+  // arg[3] belongs to atom type 1, hence the i-2 index into map
+  for (i = 3; i < narg; i++) {
+    if (strcmp(arg[i],"NULL") == 0) {
+      map[i-2] = -1;
+      continue;
+    }
+    for (j = 0; j < fs->nelements; j++)  // linear search over the element names read from the file
+      if (strcmp(arg[i],fs->elements[j]) == 0) break;
+    if (j < fs->nelements) map[i-2] = j;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (i = 1; i <= n; i++)
+    for (j = i; j <= n; j++)  // only the i <= j half of setflag is touched
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+  // set mass of atom type if i = j
+
+  int count = 0;  // number of type pairs that ended up flagged
+  for (i = 1; i <= n; i++) {
+    for (j = i; j <= n; j++) {
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        if (i == j) atom->set_mass(FLERR,i,fs->mass[map[i]]);  // mass comes from the potential file
+        count++;
+      }
+      scale[i][j] = 1.0;  // written for every i <= j pair, mapped or not
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");  // no pair was mapped to elements
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element Finnis-Sinclair potential file (DYNAMO setfl-style layout with one rhor table per element pair)
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::read_file(char *filename)
+{
+  Fs *file = fs;  // coeff() creates a fresh Fs right before calling; it is filled in here
+  // rank 0 parses the file; each value is then broadcast so every rank holds the same data
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = force->open_potential(filename);
+    if (fptr == NULL) {
+      char str[128];  // bounded write below: an over-long filename is truncated, not overflowed
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read and broadcast header
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);  // the first three lines are read and then overwritten
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);  // 4th line is kept: element count followed by element names
+    n = strlen(line) + 1;      // +1 so the terminating NUL is broadcast too
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)  // the count itself plus one name per element
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  char **words = new char*[file->nelements+1];  // nelements names + the NULL that ends the strtok loop
+  nwords = 0;
+  strtok(line," \t\n\r\f");  // first token is the element count; its value was already read by sscanf
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);  // own copy: words[] points into the local line buffer
+  }
+  delete [] words;
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,
+                                              "pair:frho");
+  memory->create(file->rhor,file->nelements,file->nelements,
+                 file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,
+                 file->nr+1,"pair:z2r");
+  // tables have one extra slot: data is stored starting at index 1
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);  // leading integer goes to tmp and is not used
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);  // presumably reads nrho values - see grab()
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+
+    for (j = 0; j < file->nelements; j++) {  // one rhor table per (i,j) element pair
+      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
+      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {  // only the j <= i half of z2r is read
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::file2array()
+{
+  const int nelem = fs->nelements;
+  const int ntypes = atom->ntypes;
+
+  // tabulation parameters are taken unchanged from the parsed fs file
+
+  nrho = fs->nrho;
+  drho = fs->drho;
+  nr = fs->nr;
+  dr = fs->dr;
+  rhomax = (nrho-1) * drho;
+
+  // ------------------------------------------------------------------
+  // frho arrays
+  // ------------------------------------------------------------------
+
+  // (re)allocate frho
+  // nfrho = one array per fs element, plus one array of zeroes
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // element frho tables go into the global frho, entries 1..nrho
+
+  for (int ie = 0; ie < nelem; ie++)
+    for (int k = 1; k <= nrho; k++) frho[ie][k] = fs->frho[ie][k];
+
+  // final frho array is all zeroes, for non-EAM types (pair hybrid) to use
+  // required since fp is still computed for non-EAM atoms
+
+  for (int k = 1; k <= nrho; k++) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[t] = index (0 to nfrho-1) of the frho array for atom type t
+  // a type that maps to no element (non-EAM atom in pair hybrid)
+  // gets the final array of zeroes
+
+  for (int t = 1; t <= ntypes; t++) {
+    type2frho[t] = (map[t] < 0) ? nfrho-1 : map[t];
+  }
+
+  // ------------------------------------------------------------------
+  // rhor arrays
+  // ------------------------------------------------------------------
+
+  // (re)allocate rhor
+  // nrhor = (# of fs elements) squared
+
+  nrhor = nelem * nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // element pair (ie,je) goes to global rhor array ie*nelem + je
+
+  // entries 1..nr of each table are copied
+  for (int ie = 0; ie < nelem; ie++)
+    for (int je = 0; je < nelem; je++) {
+      const int slot = ie * nelem + je;
+      for (int k = 1; k <= nr; k++) rhor[slot][k] = fs->rhor[ie][je][k];
+    }
+
+  // type2rhor[ti][tj] = index (0 to nrhor-1) of the rhor array for a type pair
+  // fs files carry the full NxN set of rhor arrays
+  // map = -1 (non-EAM atom in pair hybrid) is OK b/c type2rhor is not used then
+
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = 1; tj <= ntypes; tj++)
+      type2rhor[ti][tj] = map[ti] * nelem + map[tj];
+
+  // ------------------------------------------------------------------
+  // z2r arrays
+  // ------------------------------------------------------------------
+
+  // (re)allocate z2r
+  // nz2r = N*(N+1)/2 with N = # of fs elements
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // element pair z2r tables go into the global z2r, only for ie >= je
+
+  // row ie of the lower triangle starts at slot ie*(ie+1)/2
+  for (int ie = 0; ie < nelem; ie++)
+    for (int je = 0; je <= ie; je++) {
+      const int slot = ie * (ie+1) / 2 + je;
+      for (int k = 1; k <= nr; k++) z2r[slot][k] = fs->z2r[ie][je][k];
+    }
+
+  // type2z2r[ti][tj] = index (0 to nz2r-1) of the z2r array for a type pair
+  // the z2r arrays fill only the lower triangle of the Nelement matrix
+  // index = number of entries in the rows before hi, plus the column lo
+  // the element indices are exchanged when hi < lo to stay lower triangular
+  // when map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but it is set to 0 since it is accessed by opt
+
+  // hi = row (larger element index), lo = column (smaller element index)
+  for (int ti = 1; ti <= ntypes; ti++) {
+    for (int tj = 1; tj <= ntypes; tj++) {
+      int hi = map[ti];
+      int lo = map[tj];
+      if (hi != -1 && lo != -1) {
+        // keep hi >= lo so the pair lands in the lower triangle
+        if (hi < lo) {
+          const int tmp = hi;
+          hi = lo;
+          lo = tmp;
+        }
+        // hi*(hi+1)/2 entries precede row hi
+        type2z2r[ti][tj] = hi * (hi+1) / 2 + lo;
+      } else {
+        type2z2r[ti][tj] = 0;
+      }
+    }
+  }
+}
diff --git a/src/USER-INTEL/pair_eam_fs_intel.h b/src/USER-INTEL/pair_eam_fs_intel.h
new file mode 100644
index 000000000..da2ab9d2d
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_fs_intel.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/intel,PairEAMFSIntel)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_INTEL_H
+#define LMP_PAIR_EAM_FS_INTEL_H
+
+#include "pair_eam_intel.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/fs/opt inherits from it
+
+class PairEAMFSIntel : virtual public PairEAMIntel {  // registered above as pair style eam/fs/intel
+ public:
+  PairEAMFSIntel(class LAMMPS *);
+  virtual ~PairEAMFSIntel() {}  // empty body: no cleanup is done at this level
+  void coeff(int, char **);     // (narg, arg): checks the "* *" args, reads the potential file, fills map/setflag/scale
+
+ protected:
+  void read_file(char *);       // parses the named potential file on rank 0 and broadcasts its contents
+  void file2array();            // copies the parsed tables into frho/rhor/z2r and builds the type2* index maps
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index ed7dd424a..3fbb58308 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -1,1079 +1,1079 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include "pair_gayberne_intel.h"
 #include "math_extra_intel.h"
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
 #include <cmath>
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
 #include "atom.h"
 #include "comm.h"
 #include "atom_vec_ellipsoid.h"
 #include "force.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 #define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
 #define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
 #define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
 
 /* ---------------------------------------------------------------------- */
 
 PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) :
   PairGayBerne(lmp)
 {
   suffix_flag |= Suffix::INTEL;
   respa_enable = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGayBerneIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 template <class flt_t, class acc_t>
 void PairGayBerneIntel::compute(int eflag, int vflag,
                                 IntelBuffers<flt_t,acc_t> *buffers,
                                 const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag, vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nall = atom->nlocal + atom->nghost;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   if (fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
     const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
     const int * const ellipsoid = atom->ellipsoid;
     QUAT_T * _noalias const quat = buffers->get_quat();
 
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
                                 sizeof(ATOM_T));
       if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
 
       for (int i = ifrom; i < ito; i++) {
         int qi = ellipsoid[i];
         if (qi > -1) {
           quat[i].w = bonus[qi].quat[0];
           quat[i].i = bonus[qi].quat[1];
           quat[i].j = bonus[qi].quat[2];
           quat[i].k = bonus[qi].quat[3];
         }
       }
     }
     quat[nall].w = (flt_t)1.0;
     quat[nall].i = (flt_t)0.0;
     quat[nall].j = (flt_t)0.0;
     quat[nall].k = (flt_t)0.0;
     fix->stop_watch(TIME_PACK);
   }
 
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   if (eflag) {
     if (force->newton_pair) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairGayBerneIntel::eval(const int offload, const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc,
                              const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   ATOM_T * _noalias const x = buffers->get_x(offload);
   QUAT_T * _noalias const quat = buffers->get_quat(offload);
   const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   const int *ellipsoid = atom->ellipsoid;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (fix->separate_buffers()) {
     fix->start_watch(TIME_PACK);
     if (offload) {
       #pragma omp parallel
       {
         int ifrom, ito, tid;
         int nthreads = comm->nthreads;
         IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
                                   nthreads, sizeof(ATOM_T));
         if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
         for (int i = ifrom; i < ito; i++) {
           int qi = ellipsoid[i];
           if (qi > -1) {
             quat[i].w = bonus[qi].quat[0];
             quat[i].i = bonus[qi].quat[1];
             quat[i].j = bonus[qi].quat[2];
             quat[i].k = bonus[qi].quat[3];
           }
         }
         int nghost = nall - nlocal;
         if (nghost) {
           IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
                                  nthreads, sizeof(ATOM_T));
           int offset = 0;
           ifrom += nlocal;
           ito += nlocal;
           if (ago != 0) {
             offset = fix->offload_min_ghost() - nlocal;
             buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
           }
           for (int i = ifrom; i < ito; i++) {
             int qi = ellipsoid[i + offset];
             if (qi > -1) {
               quat[i].w = bonus[qi].quat[0];
               quat[i].i = bonus[qi].quat[1];
               quat[i].j = bonus[qi].quat[2];
               quat[i].k = bonus[qi].quat[3];
             }
           }
         }
       }
     } else {
       if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
       for (int i = fix->host_min_local(); i < nlocal; i++) {
         int qi = ellipsoid[i];
         if (qi > -1) {
           quat[i].w = bonus[qi].quat[0];
           quat[i].i = bonus[qi].quat[1];
           quat[i].j = bonus[qi].quat[2];
           quat[i].k = bonus[qi].quat[3];
         }
       }
       int offset = fix->host_min_ghost() - nlocal;
       if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
       for (int i = nlocal; i < nall; i++) {
         int qi = ellipsoid[i + offset];
         if (qi > -1) {
           quat[i].w = bonus[qi].quat[0];
           quat[i].i = bonus[qi].quat[1];
           quat[i].j = bonus[qi].quat[2];
           quat[i].k = bonus[qi].quat[3];
         }
       }
     }
     fix->stop_watch(TIME_PACK);
   }
   #endif
 
   //  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
   const flt_t * _noalias const special_lj = fc.special_lj;
 
   const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
   const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
   const FC_PACKED3_T * _noalias const ic = fc.ic;
   const flt_t mu = fc.mu;
   const flt_t gamma = fc.gamma;
   const flt_t upsilon = fc.upsilon;
 
   flt_t * const rsq_formi = fc.rsq_form[0];
   flt_t * const delx_formi = fc.delx_form[0];
   flt_t * const dely_formi = fc.dely_form[0];
   flt_t * const delz_formi = fc.delz_form[0];
   int * const jtype_formi = fc.jtype_form[0];
   int * const jlist_formi = fc.jlist_form[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int max_nbors = _max_nbors;
   const int nthreads = tc;
 
   int pad = 1;
   if (offload) {
     if (INTEL_MIC_NBOR_PAD > 1)
       pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
   } else {
     if (INTEL_NBOR_PAD > 1)
       pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
   }
   const int pad_width = pad;
 
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(special_lj:length(0) alloc_if(0) free_if(0)) \
     in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
     in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
     in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(quat:length(nall+1) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
     in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute=MIC_Wtime();
     #endif
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (separate_flag) {
       if (separate_flag < 3) {
         int all_local = nlocal;
         int ghost_min = overflow[LMP_GHOST_MIN];
         nlocal = overflow[LMP_LOCAL_MAX] + 1;
         int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
         if (nghost < 0) nghost = 0;
         nall = nlocal + nghost;
         separate_flag--;
         int flength;
         if (NEWTON_PAIR) flength = nall;
         else flength = nlocal;
         IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
                              separate_flag);
         if (nghost) {
           if (nlocal < all_local || ghost_min > all_local) {
             memmove(x + nlocal, x + ghost_min,
                     (nall - nlocal) * sizeof(ATOM_T));
             memmove(quat + nlocal, quat + ghost_min,
                     (nall - nlocal) * sizeof(QUAT_T));
           }
         }
       }
       x[nall].x = (flt_t)INTEL_BIGP;
       x[nall].y = (flt_t)INTEL_BIGP;
       x[nall].z = (flt_t)INTEL_BIGP;
       x[nall].w = 1;
       quat[nall].w = (flt_t)1.0;
       quat[nall].i = (flt_t)0.0;
       quat[nall].j = (flt_t)0.0;
       quat[nall].k = (flt_t)0.0;
     }
     #endif
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0.0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     if (NEWTON_PAIR == 0) f_start[1].w = 0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2;
       else foff = minlocal*-2;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
 
       flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
       flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
       flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
       flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
       int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
       int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
 
       int ierror = 0;
       for (int i = iifrom; i < iito; i += iip) {
         // const int i = ilist[ii];
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
         const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
 
         flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
         flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
         flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
 
         if (ijci[itype].form == ELLIPSE_ELLIPSE) {
           flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
           ME_quat_to_mat_trans(quat[i],a1);
           ME_diag_times3(ic[itype].well,a1,temp);
           ME_transpose_times3(a1,temp,b1);
           ME_diag_times3(ic[itype].shape2,a1,temp);
           ME_transpose_times3(a1,temp,g1);
         }
 
         acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
         fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
 
         if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
 
         bool multiple_forms = false;
         int packed_j = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           int jm = jlist[jj];
           int j = jm & NEIGHMASK;
           const int jtype = x[j].w;
 
           if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
             flt_t delx = x[j].x-xtmp;
             flt_t dely = x[j].y-ytmp;
             flt_t delz = x[j].z-ztmp;
             flt_t rsq = delx * delx + dely * dely + delz * delz;
 
             if (rsq < ijci[jtype].cutsq) {
               rsq_form[packed_j] = rsq;
               delx_form[packed_j] = delx;
               dely_form[packed_j] = dely;
               delz_form[packed_j] = delz;
               jtype_form[packed_j] = jtype;
               jlist_form[packed_j] = jm;
               packed_j++;
             }
           } else
             multiple_forms = true;
         }
-        const int edge = (packed_j % pad_width);
+        const int edge = packed_j & (pad_width - 1);
         if (edge) {
           const int packed_end = packed_j + (pad_width - edge);
           #if defined(LMP_SIMD_COMPILER)
           #pragma loop_count min=1, max=15, avg=8
           #endif
           for ( ; packed_j < packed_end; packed_j++)
             jlist_form[packed_j] = nall;
         }
 
         // -------------------------------------------------------------
 
         #ifdef INTEL_V512
         __assume(packed_j % INTEL_VECTOR_WIDTH == 0);
         __assume(packed_j % 8 == 0);
         __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
         #endif
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
                                  sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
         #endif
         for (int jj = 0; jj < packed_j; jj++) {
           flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
           flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
           flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
           flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
           flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
           flt_t rtor_0, rtor_1, rtor_2;
 
           const int sbindex = jlist_form[jj] >> SBBITS & 3;
           const int j = jlist_form[jj] & NEIGHMASK;
           flt_t factor_lj = special_lj[sbindex];
           const int jtype = jtype_form[jj];
           const flt_t sigma = ijci[jtype].sigma;
           const flt_t epsilon = ijci[jtype].epsilon;
           const flt_t shape2_0 = ic[jtype].shape2[0];
           const flt_t shape2_1 = ic[jtype].shape2[1];
           const flt_t shape2_2 = ic[jtype].shape2[2];
           flt_t one_eng, evdwl;
 
           ME_quat_to_mat_trans(quat[j], a2);
           ME_diag_times3(ic[jtype].well, a2, temp);
           ME_transpose_times3(a2, temp, b2);
           ME_diag_times3a(shape2, a2, temp);
           ME_transpose_times3(a2, temp, g2);
 
           flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
           flt_t temp1, temp2, temp3;
 
           flt_t r12hat_0, r12hat_1, r12hat_2;
           ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
           flt_t r = sqrt(rsq_form[jj]);
 
           // compute distance of closest approach
 
           flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
           ME_plus3(g1, g2, g12);
           flt_t kappa_0, kappa_1, kappa_2;
           ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
                        kappa, ierror);
 
           // tempv = G12^-1*r12hat
 
           flt_t inv_r = (flt_t)1.0 / r;
           tempv_0 = kappa_0 * inv_r;
           tempv_1 = kappa_1 * inv_r;
           tempv_2 = kappa_2 * inv_r;
           flt_t sigma12 = ME_dot3(r12hat, tempv);
           sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
           flt_t h12 = r - sigma12;
 
           // energy
           // compute u_r
 
           flt_t varrho = sigma / (h12 + gamma * sigma);
           flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
           flt_t varrho12 = varrho6 * varrho6;
           flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
 
           // compute eta_12
 
           flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
           flt_t det_g12 = ME_det3(g12);
           eta = std::pow(eta / det_g12, upsilon);
 
           // compute chi_12
 
           flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
           flt_t iota_0, iota_1, iota_2;
           ME_plus3(b1, b2, b12);
           ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
                        iota, ierror);
 
           // tempv = G12^-1*r12hat
 
           tempv_0 = iota_0 * inv_r;
           tempv_1 = iota_1 * inv_r;
           tempv_2 = iota_2 * inv_r;
           flt_t chi = ME_dot3(r12hat, tempv);
           chi = std::pow(chi * (flt_t)2.0, mu);
 
           // force
           // compute dUr/dr
 
           temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
             sigma;
           temp1 = temp1 * (flt_t)24.0 * epsilon;
           flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
           flt_t dUr_0, dUr_1, dUr_2;
           temp2 = ME_dot3(kappa, r12hat);
           flt_t uslj_rsq = u_slj / rsq_form[jj];
           dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
           dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
           dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
 
           // compute dChi_12/dr
 
           flt_t dchi_0, dchi_1, dchi_2;
           temp1 = ME_dot3(iota, r12hat);
           temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
             std::pow(chi, (mu - (flt_t)1.0) / mu);
           dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
           dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
           dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
 
           temp1 = -eta * u_r;
           temp2 = eta * chi;
           fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
           fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
           fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
 
           // torque for particle 1 and 2
           // compute dUr
 
           tempv_0 = -uslj_rsq * kappa_0;
           tempv_1 = -uslj_rsq * kappa_1;
           tempv_2 = -uslj_rsq * kappa_2;
           ME_vecmat(kappa, g1, tempv2);
           ME_cross3(tempv, tempv2, dUr);
           flt_t dUr2_0, dUr2_1, dUr2_2;
 
           if (NEWTON_PAIR) {
             ME_vecmat(kappa, g2, tempv2);
             ME_cross3(tempv, tempv2, dUr2);
           }
 
           // compute d_chi
 
           ME_vecmat(iota, b1, tempv);
           ME_cross3(tempv, iota, dchi);
           temp1 = (flt_t)-4.0 / rsq_form[jj];
           dchi_0 *= temp1;
           dchi_1 *= temp1;
           dchi_2 *= temp1;
           flt_t dchi2_0, dchi2_1, dchi2_2;
 
           if (NEWTON_PAIR) {
             ME_vecmat(iota, b2, tempv);
             ME_cross3(tempv, iota, dchi2);
             dchi2_0 *= temp1;
             dchi2_1 *= temp1;
             dchi2_2 *= temp1;
           }
 
           // compute d_eta
 
           flt_t deta_0, deta_1, deta_2;
           deta_0 = deta_1 = deta_2 = (flt_t)0.0;
           ME_compute_eta_torque(g12, a1, shape2, temp);
           temp1 = -eta * upsilon;
 
           tempv_0 = temp1 * temp_0;
           tempv_1 = temp1 * temp_1;
           tempv_2 = temp1 * temp_2;
           ME_mv0_cross3(a1, tempv, tempv2);
           deta_0 += tempv2_0;
           deta_1 += tempv2_1;
           deta_2 += tempv2_2;
 
           tempv_0 = temp1 * temp_3;
           tempv_1 = temp1 * temp_4;
           tempv_2 = temp1 * temp_5;
           ME_mv1_cross3(a1, tempv, tempv2);
           deta_0 += tempv2_0;
           deta_1 += tempv2_1;
           deta_2 += tempv2_2;
 
           tempv_0 = temp1 * temp_6;
           tempv_1 = temp1 * temp_7;
           tempv_2 = temp1 * temp_8;
           ME_mv2_cross3(a1, tempv, tempv2);
           deta_0 += tempv2_0;
           deta_1 += tempv2_1;
           deta_2 += tempv2_2;
 
           // compute d_eta for particle 2
 
           flt_t deta2_0, deta2_1, deta2_2;
           if (NEWTON_PAIR) {
             deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
             ME_compute_eta_torque(g12, a2, shape2, temp);
 
             tempv_0 = temp1 * temp_0;
             tempv_1 = temp1 * temp_1;
             tempv_2 = temp1 * temp_2;
             ME_mv0_cross3(a2, tempv, tempv2);
             deta2_0 += tempv2_0;
             deta2_1 += tempv2_1;
             deta2_2 += tempv2_2;
 
             tempv_0 = temp1 * temp_3;
             tempv_1 = temp1 * temp_4;
             tempv_2 = temp1 * temp_5;
             ME_mv1_cross3(a2, tempv, tempv2);
             deta2_0 += tempv2_0;
             deta2_1 += tempv2_1;
             deta2_2 += tempv2_2;
 
             tempv_0 = temp1 * temp_6;
             tempv_1 = temp1 * temp_7;
             tempv_2 = temp1 * temp_8;
             ME_mv2_cross3(a2, tempv, tempv2);
             deta2_0 += tempv2_0;
             deta2_1 += tempv2_1;
             deta2_2 += tempv2_2;
           }
 
           // torque
 
           temp1 = u_r * eta;
           temp2 = u_r * chi;
           temp3 = chi * eta;
 
           ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
             (flt_t)-1.0;
           ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
             (flt_t)-1.0;
           ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
             (flt_t)-1.0;
 
           if (NEWTON_PAIR) {
             rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
               (flt_t)-1.0;
             rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
               (flt_t)-1.0;
             rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
               (flt_t)-1.0;
           }
 
           one_eng = temp1 * chi;
           #ifndef INTEL_VMASK
           if (jlist_form[jj] == nall) {
             one_eng = (flt_t)0.0;
             fforce_0 = 0.0;
             fforce_1 = 0.0;
             fforce_2 = 0.0;
             ttor_0 = 0.0;
             ttor_1 = 0.0;
             ttor_2 = 0.0;
             rtor_0 = 0.0;
             rtor_1 = 0.0;
             rtor_2 = 0.0;
           }
           #endif
 
           fforce_0 *= factor_lj;
           fforce_1 *= factor_lj;
           fforce_2 *= factor_lj;
           ttor_0 *= factor_lj;
           ttor_1 *= factor_lj;
           ttor_2 *= factor_lj;
 
           #ifdef INTEL_VMASK
           if (jlist_form[jj] < nall) {
           #endif
             fxtmp += fforce_0;
             fytmp += fforce_1;
             fztmp += fforce_2;
             t1tmp += ttor_0;
             t2tmp += ttor_1;
             t3tmp += ttor_2;
 
             if (NEWTON_PAIR) {
               rtor_0 *= factor_lj;
               rtor_1 *= factor_lj;
               rtor_2 *= factor_lj;
               int jp = j * 2;
               f[jp].x -= fforce_0;
               f[jp].y -= fforce_1;
               f[jp].z -= fforce_2;
               jp++;
               f[jp].x += rtor_0;
               f[jp].y += rtor_1;
               f[jp].z += rtor_2;
             }
 
             if (EFLAG) {
               evdwl = factor_lj * one_eng;
               sevdwl += evdwl;
               if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl;
                 if (NEWTON_PAIR)
                   f[j*2].w += (flt_t)0.5 * evdwl;
               }
             }
 
             if (NEWTON_PAIR == 0) {
               if (vflag == 1) {
                 sv0 += delx_form[jj] * fforce_0;
                 sv1 += dely_form[jj] * fforce_1;
                 sv2 += delz_form[jj] * fforce_2;
                 sv3 += delx_form[jj] * fforce_1;
                 sv4 += delx_form[jj] * fforce_2;
                 sv5 += dely_form[jj] * fforce_2;
               }
             } // EVFLAG
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
 
         // -------------------------------------------------------------
 
         if (multiple_forms)
           ierror = 2;
 
         int ip = i * 2;
         if (NEWTON_PAIR) {
           f[ip].x += fxtmp;
           f[ip].y += fytmp;
           f[ip].z += fztmp;
           ip++;
           f[ip].x += t1tmp;
           f[ip].y += t2tmp;
           f[ip].z += t3tmp;
         } else {
           f[ip].x = fxtmp;
           f[ip].y = fytmp;
           f[ip].z = fztmp;
           ip++;
           f[ip].x = t1tmp;
           f[ip].y = t2tmp;
           f[ip].z = t3tmp;
         }
 
         if (EFLAG) {
           oevdwl += sevdwl;
           if (eatom) f[i * 2].w += fwtmp;
         }
         if (NEWTON_PAIR == 0) {
           if (vflag == 1) {
             ov0 += sv0;
             ov1 += sv1;
             ov2 += sv2;
             ov3 += sv3;
             ov4 += sv4;
             ov5 += sv5;
           }
         }
       } // for i
       int o_range;
       if (NEWTON_PAIR) {
         o_range = nall;
         if (offload == 0) o_range -= minlocal;
         IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
                                sizeof(FORCE_T));
         const int sto = iito * 8;
         const int fst4 = f_stride * 4;
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
         acc_t *f_scalar = &f_start[0].x;
         acc_t *f_scalar2 = f_scalar + fst4;
         for (int t = 1; t < nthreads; t++) {
           #if defined(LMP_SIMD_COMPILER)
           #pragma vector aligned
           #pragma simd
           #endif
           for (int n = iifrom * 8; n < sto; n++)
             f_scalar[n] += f_scalar2[n];
           f_scalar2 += fst4;
         }
 
         if (vflag==2) {
           const ATOM_T * _noalias const xo = x + minlocal;
           #if defined(LMP_SIMD_COMPILER)
           #pragma novector
           #endif
           for (int n = iifrom; n < iito; n++) {
             const int nt2 = n * 2;
             ov0 += f_start[nt2].x * xo[n].x;
             ov1 += f_start[nt2].y * xo[n].y;
             ov2 += f_start[nt2].z * xo[n].z;
             ov3 += f_start[nt2].y * xo[n].x;
             ov4 += f_start[nt2].z * xo[n].x;
             ov5 += f_start[nt2].z * xo[n].y;
           }
         }
       }
 
       if (ierror)
         f_start[1].w = ierror;
     } // omp
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)-0.5;
         ov1 *= (acc_t)-0.5;
         ov2 *= (acc_t)-0.5;
         ov3 *= (acc_t)-0.5;
         ov4 *= (acc_t)-0.5;
         ov5 *= (acc_t)-0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
 
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // offload
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
   else
     fix->add_result_array(f_start, 0, offload, 0, 0, 2);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGayBerneIntel::init_style()
 {
   PairGayBerne::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   if (force->newton_pair) fix->set_offload_noghost(1);
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
                                          IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Tables are sized with atom->ntypes + 1 entries per dimension; the
   // per-thread scratch is sized for the larger of the host thread count
   // and the buffers' offload thread count.
   const int tp1 = atom->ntypes + 1;
   _max_nbors = buffers->get_max_nbors();
   const int off_threads = buffers->get_off_threads();
   const int mthreads =
     (comm->nthreads < off_threads) ? off_threads : comm->nthreads;
   fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();

   // Repeat cutsq calculation because done after call to init_style
   for (int itype = 1; itype <= atom->ntypes; itype++) {
     for (int jtype = itype; jtype <= atom->ntypes; jtype++) {
       // Skip pairs that are neither set explicitly nor derivable from
       // both diagonal entries.
       if (setflag[itype][jtype] == 0 &&
           (setflag[itype][itype] == 0 || setflag[jtype][jtype] == 0))
         continue;

       const double cut = init_one(itype,jtype);
       const double cutneigh = cut + neighbor->skin;
       cutsq[jtype][itype] = cut*cut;
       cutsq[itype][jtype] = cutsq[jtype][itype];
       cutneighsq[jtype][itype] = cutneigh * cutneigh;
       cutneighsq[itype][jtype] = cutneighsq[jtype][itype];
     }
   }

   // Entry 0 is always 1.0; entries 1-3 mirror force->special_lj.
   fc.special_lj[0] = 1.0;
   for (int k = 1; k < 4; k++)
     fc.special_lj[k] = force->special_lj[k];

   fc.gamma = gamma;
   fc.upsilon = upsilon;
   fc.mu = mu;

   // Per-type-pair coefficients.
   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
       fc.ijc[i][j].lj1 = lj1[i][j];
       fc.ijc[i][j].lj2 = lj2[i][j];
       fc.lj34[i][j].lj3 = lj3[i][j];
       fc.lj34[i][j].lj4 = lj4[i][j];
       fc.ijc[i][j].cutsq = cutsq[i][j];
       fc.ijc[i][j].offset = offset[i][j];
       fc.ijc[i][j].sigma = sigma[i][j];
       fc.ijc[i][j].epsilon = epsilon[i][j];
       fc.ijc[i][j].form = form[i][j];
       fc.ijc[i][j].lshape = lshape[i] * lshape[j];
     }
   }

   // Per-type shape and well values.
   // NOTE(review): four entries are copied per type; confirm that each row
   // of shape2 and well really holds four values.
   for (int i = 0; i < tp1; i++) {
     for (int k = 0; k < 4; k++) {
       fc.ic[i].shape2[k] = shape2[i][k];
       fc.ic[i].well[k] = well[i][k];
     }
   }

   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   flt_t * special_lj = fc.special_lj;
   FC_PACKED1_T *oijc = fc.ijc[0];
   FC_PACKED2_T *olj34 = fc.lj34[0];
   FC_PACKED3_T *oic = fc.ic;
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   if (oijc != NULL && oic != NULL) {
     #pragma offload_transfer target(mic:_cop) \
       in(special_lj: length(4) alloc_if(0) free_if(0)) \
       in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
       in(oic: length(tp1) alloc_if(0) free_if(0)) \
       in(ocutneighsq: length(tp1sq))
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* Resize the constant tables and the per-thread scratch arrays.

    ntypes     - extent of the per-type tables: ijc and lj34 become
                 ntypes x ntypes, ic gets ntypes entries; with 0 the old
                 arrays are freed and nothing new is allocated
    one_length - number of entries per thread in each *_form scratch array
    nthreads   - number of rows (one per thread) in each *_form array
    memory     - allocator used for the new arrays; stored in _memory and
                 used to free them on a later call
    cop        - coprocessor index for the offload allocation; stored in _cop

    Nothing happens when ntypes equals the stored _ntypes.
    NOTE(review): a change in one_length or nthreads alone therefore does
    not trigger a reallocation - confirm callers never rely on that. */
 template <class flt_t>
 void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                       const int one_length,
                                                       const int nthreads,
                                                       Memory *memory,
                                                       const int cop) {
   if (ntypes != _ntypes) {
     // Release the arrays of the previous size, using the allocator and
     // coprocessor index remembered from the call that created them.
     if (_ntypes > 0) {
       fc_packed3 *oic = ic;
 
       #ifdef _LMP_INTEL_OFFLOAD
       // Local aliases of the array base pointers for the offload clauses.
       flt_t * ospecial_lj = special_lj;
       fc_packed1 *oijc = ijc[0];
       fc_packed2 *olj34 = lj34[0];
       flt_t * orsq_form = rsq_form[0];
       flt_t * odelx_form = delx_form[0];
       flt_t * odely_form = dely_form[0];
       flt_t * odelz_form = delz_form[0];
       int * ojtype_form = jtype_form[0];
       int * ojlist_form = jlist_form[0];
 
       // NOTE(review): oic is freed on the coprocessor below but, unlike in
       // the allocation branch, is not part of this NULL check.
       if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
           orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
           odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
           _cop >= 0) {
         // free_if(1) with nocopy: release the coprocessor copies, no transfer.
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
           nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
           nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
       }
       #endif
 
       // ic is destroyed through the local alias oic; the other arrays are
       // destroyed through the members themselves.
       // NOTE(review): the member ic is not passed to destroy() here - check
       // whether it is left holding the stale address when ntypes == 0.
       _memory->destroy(oic);
       _memory->destroy(ijc);
       _memory->destroy(lj34);
       _memory->destroy(rsq_form);
       _memory->destroy(delx_form);
       _memory->destroy(dely_form);
       _memory->destroy(delz_form);
       _memory->destroy(jtype_form);
       _memory->destroy(jlist_form);
     }
 
     // Allocate arrays of the new size with the allocator passed in.
     if (ntypes > 0) {
       _cop = cop;
       memory->create(ijc, ntypes, ntypes, "fc.ijc");
       memory->create(lj34, ntypes, ntypes, "fc.lj34");
       memory->create(ic, ntypes, "fc.ic");
       memory->create(rsq_form, nthreads, one_length, "rsq_form");
       memory->create(delx_form, nthreads, one_length, "delx_form");
       memory->create(dely_form, nthreads, one_length, "dely_form");
       memory->create(delz_form, nthreads, one_length, "delz_form");
       memory->create(jtype_form, nthreads, one_length, "jtype_form");
       memory->create(jlist_form, nthreads, one_length, "jlist_form");
 
       // Give every scratch entry a defined starting value.
       // NOTE(review): presumably chosen so unused (padding) entries are
       // harmless when read by the force kernel - confirm.
       for (int zn = 0; zn < nthreads; zn++)
         for (int zo = 0; zo < one_length; zo++) {
           rsq_form[zn][zo] = 10.0;
           delx_form[zn][zo] = 10.0;
           dely_form[zn][zo] = 10.0;
           delz_form[zn][zo] = 10.0;
           jtype_form[zn][zo] = 1;
           jlist_form[zn][zo] = 0;
         }
 
       #ifdef _LMP_INTEL_OFFLOAD
       // Local aliases of the array base pointers for the offload clauses.
       flt_t * ospecial_lj = special_lj;
       fc_packed1 *oijc = ijc[0];
       fc_packed2 *olj34 = lj34[0];
       fc_packed3 *oic = ic;
       flt_t * orsq_form = rsq_form[0];
       flt_t * odelx_form = delx_form[0];
       flt_t * odely_form = dely_form[0];
       flt_t * odelz_form = delz_form[0];
       int * ojtype_form = jtype_form[0];
       int * ojlist_form = jlist_form[0];
       // Total element count of one scratch array across all threads.
       int off_onel = one_length * nthreads;
 
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
           oic != NULL && orsq_form != NULL && odelx_form != NULL &&
           odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
           ojlist_form !=NULL && cop >= 0) {
         // Constant tables are only allocated (nocopy); the scratch arrays
         // are allocated and their initial contents copied in.
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
           in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   // Remember the size and allocator for the next call.
   _ntypes = ntypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
new file mode 100644
index 000000000..0dc2c275e
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@@ -0,0 +1,595 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_charmm_coul_charmm_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t // element type of the fc.lj table; eval() reads its .x/.y/.z/.w members
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp) // lmp is forwarded to the base pair style
+{
+  suffix_flag |= Suffix::INTEL; // add the INTEL bit to this style's suffix flags
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel()
+{ // empty body: this destructor performs no cleanup of its own
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
+{ // Entry point: run the templated compute() that matches the fix's precision mode.
+  if (fix->precision()==FixIntel::PREC_MODE_MIXED) // mixed: flt_t = float, acc_t = double
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) // double: flt_t = acc_t = double
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else // any other mode: flt_t = acc_t = float
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0; // always cleared once the templated compute() has returned
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
+                                        IntelBuffers<flt_t,acc_t> *buffers,
+                                        const ForceConst<flt_t> &fc)
+{ // Set up tallies, pack atom data when required, then call eval<EFLAG,NEWTON_PAIR> twice.
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0; // neither energy nor virial requested
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair(); // first atom index of the host eval() call
+  const int offload_end = fix->offload_end_pair(); // end atom index of the offload eval() call
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) { // skipped when ago == 0 or separate_buffers() is nonzero
+    fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads; // pack in parallel only above INTEL_HTHREADS threads
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago); // each thread packs its own [ifrom, ito) slice
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // -------------------- Regular version
+  int ovflag = 0; // virial mode handed to eval(): 0 = none, 1 = vflag set, 2 = vflag_fdotr set
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) { // template arguments are <EFLAG, NEWTON_PAIR>
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); // offload = 1: atoms [0, offload_end)
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum); // offload = 0: atoms [host_start, inum)
+    } else {
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc,
+                                     const int astart, const int aend)
+{ // Force/energy/virial kernel for atoms [astart, aend); offload selects which buffer set is used.
+  const int inum = aend - astart;
+  if (inum == 0) return; // empty atom range: nothing to do
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload); // packed atoms; .w is read as the atom type below
+  flt_t * _noalias const q = buffers->get_q(offload); // per-atom charges
+
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list); // offset of atom i's neighbors within firstneigh
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj; // reciprocals, so the inner loop multiplies instead of dividing
+  const flt_t inv_denom_coul = (flt_t)1.0/denom_coul;
+
+  const flt_t * _noalias const cutsq = fc.cutsq[0];
+  const LJ_T * _noalias const lj = fc.lj[0];
+  const flt_t cut_ljsq = fc.cut_ljsq;
+  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
+  const flt_t cut_coul_innersq = fc.cut_coul_innersq;
+  const flt_t cut_coulsq = fc.cut_coulsq;
+
+  const int ntypes = atom->ntypes + 1; // row length used to index the flattened cutsq/lj tables
+  const int eatom = this->eflag_atom;
+
+  flt_t * _noalias const ccachex = buffers->get_ccachex(); // scratch arrays, sliced per thread below
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride; // number of scratch entries per thread
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc; // thread count as reported by IP_PRE_get_buffers
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
+    in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \
+    in(vflag,eatom,f_stride,separate_flag,offload) \
+    in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
+    in(inv_denom_coul,cut_coul_innersq) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, q);
+
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; // totals over all atoms handled by this call
+    if (EFLAG) oevdwl = oecoul = (acc_t)0; // NOTE(review): left uninitialized when EFLAG == 0, yet named in the reduction below
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // NOTE(review): same for vflag == 0
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
+      iifrom += astart; // shift the thread's range so it starts at astart
+      iito += astart;
+
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal; // each thread gets its own f_stride-sized force slice
+      else foff = -minlocal; // all threads share one force array
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); // zero this thread's slice
+      flt_t cutboth = cut_coulsq; // NOTE(review): cutboth is never read in this function
+
+      const int toffs = tid * ccache_stride; // start of this thread's scratch slice
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+
+      for (int i = iifrom; i < iito; i += iip) {
+        //        const int i = ilist[ii];
+        const int itype = x[i].w; // atom type is stored in the w component
+
+        const int ptr_off = itype * ntypes; // row of type itype in the flattened tables
+        const flt_t * _noalias const cutsqi = cutsq + ptr_off;
+        const LJ_T * _noalias const lji = lj + ptr_off;
+
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp; // force on atom i and its per-atom energy share
+        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; // per-atom energy and virial sums
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+
+        int ej = 0; // number of neighbors kept by pass 1
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) { // pass 1: compact neighbors inside the Coulomb cutoff into the scratch slice
+          const int j = jlist[jj] & NEIGHMASK;
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          if (rsq < cut_coulsq) {
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            tjtype[ej]=x[j].w;
+            tj[ej]=jlist[jj]; // unmasked entry: the special-bond bits are decoded in pass 2
+            ej++;
+          }
+        }
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < ej; jj++) { // pass 2: evaluate the kept pairs
+          flt_t forcecoul, forcelj, evdwl;
+          forcecoul = forcelj = evdwl = (flt_t)0.0;
+
+          const int j = tj[jj] & NEIGHMASK;
+          const int sbindex = tj[jj] >> SBBITS & 3; // parses as (tj[jj] >> SBBITS) & 3; indexes special_coul/special_lj
+          const flt_t rsq = trsq[jj];
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+	  const flt_t r_inv = (flt_t)1.0 / sqrt(rsq);
+	  forcecoul = qqrd2e * qtmp * q[j] * r_inv;
+	  if (rsq > cut_coul_innersq) { // beyond the inner Coulomb cutoff: scale by the switching factor
+	    const flt_t ccr = cut_coulsq - rsq;
+	    const flt_t switch1 = ccr * ccr * inv_denom_coul *
+              (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq);
+            forcecoul *= switch1; 
+          }
+
+          #ifdef INTEL_VMASK
+          if (rsq < cut_ljsq) {
+          #endif
+	    const int jtype = tjtype[jj];
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
+            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
+
+            #ifdef INTEL_VMASK
+            if (rsq > cut_lj_innersq) {
+            #endif
+              const flt_t drsq = cut_ljsq - rsq;
+              const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
+              const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
+                  inv_denom_lj;
+              const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
+              if (EFLAG) {
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq) { // without INTEL_VMASK the switch terms are always computed but applied only here
+                #endif
+                  forcelj = forcelj * switch1 + evdwl * switch2;
+                  evdwl *= switch1;
+                #ifndef INTEL_VMASK
+                }
+                #endif
+              } else {
+                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
+                    lji[jtype].w); // same expression as evdwl above, needed for the force when EFLAG == 0
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq)
+                #endif
+                  forcelj =  forcelj * switch1 + philj * switch2;
+              }
+            #ifdef INTEL_VMASK
+            }
+            #endif
+
+          #ifdef INTEL_VMASK
+          }
+          #else
+          if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } // without INTEL_VMASK: discard LJ terms beyond the LJ cutoff
+          #endif
+	  if (sbindex) { // index 0 applies no special scaling
+  	    const flt_t factor_coul = special_coul[sbindex];
+	    forcecoul *= factor_coul;
+	    const flt_t factor_lj = special_lj[sbindex];
+	    forcelj *= factor_lj;
+	    if (EFLAG) evdwl *= factor_lj;
+          }
+
+          const flt_t fpair = (forcecoul + forcelj) * r2inv;
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx; // equal and opposite force on neighbor j
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+          if (EFLAG) {
+            sevdwl += evdwl;
+            secoul += forcecoul; // forcecoul (before the r2inv factor) is also used as the Coulomb energy term
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul; // half of the pair energy goes to atom i
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul; // the other half goes to atom j
+            }
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
+        } // for jj
+        if (NEWTON_PAIR) { // accumulate: f[i] may already hold contributions written as some other atom's neighbor
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else { // overwrite: with NEWTON_PAIR == 0 only this iteration writes f[i]
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+      } // for ii
+
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+    } // end of omp parallel region
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) { // energy totals are halved when NEWTON_PAIR == 0
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
+      }
+      ev_global[0] = oevdwl; // ev_global layout: [0] evdwl, [1] ecoul, [2..7] virial terms
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) { // virial totals are halved when NEWTON_PAIR == 0
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag) // pass ev_global on only when energy or virial was tallied
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::init_style()
+{
+  PairLJCharmmCoulCharmm::init_style();
+  const int ireq = neighbor->nrequest - 1;   // index of the last request
+  neighbor->requests[ireq]->intel = 1;
+  if (force->newton_pair == 0) {
+    neighbor->requests[ireq]->full = 1;
+    neighbor->requests[ireq]->half = 0;
+  }
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // Fills fc with the squared cutoffs, special factors and lj1-lj4 tables and
+  // buffers' cutneighsq table; offload builds also send them to mic:_cop.
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
+  const int tp1 = atom->ntypes + 1;
+  fc.set_ntypes(tp1, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+       "Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic");
+  // Repeat cutsq calculation because done after call to init_style
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        const double cut = init_one(i, j);
+        const double cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.cut_coulsq = cut_coulsq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_coul_innersq = cut_coul_innersq;
+  fc.cut_lj_innersq = cut_lj_innersq;
+
+  for (int i = 0; i < 4; i++) {
+    fc.special_lj[i] = force->special_lj[i];
+    fc.special_coul[i] = force->special_coul[i];
+  }
+  fc.special_coul[0] = 1.0;   // index 0 is always forced to 1.0
+  fc.special_lj[0] = 1.0;
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.lj[i][j].x = lj1[i][j];
+      fc.lj[i][j].y = lj2[i][j];
+      fc.lj[i][j].z = lj3[i][j];
+      fc.lj[i][j].w = lj4[i][j];
+      fc.cutsq[i][j] = cutsq[i][j];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+// (Re)allocates cutsq and lj as ntypes x ntypes tables; ntypes == 0 only frees.
+template <class flt_t>
+void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
+  const int ntypes, Memory *memory, const int cop) {
+  if (ntypes != _ntypes) {   // nothing is reallocated when the size is unchanged
+    if (_ntypes > 0) {       // tables of the previous size exist: release them
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+          nocopy(ocutsq, olj: alloc_if(0) free_if(1))
+      }
+      #endif
+      // host tables are freed through the Memory pointer saved by the last call
+      _memory->destroy(cutsq);
+      _memory->destroy(lj);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+      memory->create(lj,ntypes,ntypes,"fc.lj");
+      // offload builds: nocopy/alloc_if(1) requests for the same arrays on mic:cop
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      int tp1sq = ntypes*ntypes;
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;   // recorded even when no reallocation happened
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
new file mode 100644
index 000000000..64d607747
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/intel,PairLJCharmmCoulCharmmIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmIntel : public PairLJCharmmCoulCharmm {
+  // Intel-package variant; registered above as lj/charmm/coul/charmm/intel.
+ public:
+  PairLJCharmmCoulCharmmIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulCharmmIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;              // the "package_intel" fix, set in init_style()
+  int _cop, _ccache_stride;   // coprocessor id (no transfer if < 0), ccache stride
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+  template <class flt_t>
+  class ForceConst {   // constants packed in flt_t precision, passed to eval()
+   public:
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
+    flt_t **cutsq;
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_coul_innersq, cut_lj_innersq;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
+    // _cop/_memory start defined so ~ForceConst never reads an indeterminate value
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic
+
+The intel accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
diff --git a/src/USER-INTEL/pair_rebo_intel.cpp b/src/USER-INTEL/pair_rebo_intel.cpp
new file mode 100644
index 000000000..006830a5f
--- /dev/null
+++ b/src/USER-INTEL/pair_rebo_intel.cpp
@@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#include "pair_rebo_intel.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+// Forwards lmp to the PairAIREBOIntel constructor; the body adds nothing.
+PairREBOIntel::PairREBOIntel(LAMMPS *lmp) : PairAIREBOIntel(lmp) {}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairREBOIntel::settings(int narg, char **arg)
+{
+  // any argument to this pair_style is rejected
+  if (narg != 0)
+    error->all(FLERR,"Illegal pair_style command");
+  torflag = 0;
+  ljflag = 0;
+  cutlj = 0.0;
+  // this one parameter for C-C interactions is different in REBO vs AIREBO
+  // see Favata, Micheletti, Ryu, Pugno, Comp Phys Comm (2016)
+  PCCf_2_0 = 0.0;
+}
diff --git a/src/USER-INTEL/pair_rebo_intel.h b/src/USER-INTEL/pair_rebo_intel.h
new file mode 100644
index 000000000..e76279a24
--- /dev/null
+++ b/src/USER-INTEL/pair_rebo_intel.h
@@ -0,0 +1,40 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(rebo/intel,PairREBOIntel)
+
+#else
+
+#ifndef LMP_PAIR_REBO_INTEL_H
+#define LMP_PAIR_REBO_INTEL_H
+
+#include "pair_airebo_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairREBOIntel : public PairAIREBOIntel {   // pair style rebo/intel
+ public:
+  PairREBOIntel(class LAMMPS *);
+  virtual void settings(int, char **);   // errors out unless narg == 0
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index 7a6b7afd9..fff104f39 100644
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -1,1315 +1,1316 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include "pair_sw_intel.h"
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
 #include <cmath>
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "atom.h"
 #include "neighbor.h"
 #include "neigh_request.h"
 #include "force.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "memory.h"
 #include "error.h"
 #include "modify.h"
 #include "suffix.h"
 
 #ifdef LMP_USE_AVXCD
 #define OUTER_CHUNK 1
 #include "intel_simd.h"
 using namespace ip_simd;
 #endif
 
 using namespace LAMMPS_NS;
 
 #define FC_PACKED0_T typename ForceConst<flt_t>::fc_packed0
 #define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
 #define FC_PACKED1p2_T typename ForceConst<flt_t>::fc_packed1p2
 #define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
 #define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
 
 #define MAXLINE 1024
 #define DELTA 4
 
 /* ---------------------------------------------------------------------- */
 
 PairSWIntel::PairSWIntel(LAMMPS *lmp) : PairSW(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairSWIntel::~PairSWIntel()
 {
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairSWIntel::compute(int eflag, int vflag)
 {
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void PairSWIntel::compute(int eflag, int vflag,
                           IntelBuffers<flt_t,acc_t> *buffers,
                           const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag, vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom, ito, ago);
     }
 
     fix->stop_watch(TIME_PACK);
   }
 
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   if (_onetype) {
     if (_spq) {
       if (eflag) {
         eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
         eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
       if (eflag) {
         eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
         eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     }
   } else {
     if (_spq) {
       if (eflag) {
         eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
         eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
       if (eflag) {
         eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
         eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
         eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 #ifndef LMP_USE_AVXCD
 
 template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
                        const int aend, const int pad_width)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
   const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
   const FC_PACKED1p2_T * _noalias const p2f2 = fc.p2f2[0];
   const FC_PACKED2_T * _noalias const p2e = fc.p2e[0];
   const FC_PACKED3_T * _noalias const p3 = fc.p3[0][0];
 
   flt_t * _noalias const ccachex = buffers->get_ccachex();
   flt_t * _noalias const ccachey = buffers->get_ccachey();
   flt_t * _noalias const ccachez = buffers->get_ccachez();
   flt_t * _noalias const ccachew = buffers->get_ccachew();
   int * _noalias const ccachei = buffers->get_ccachei();
   int * _noalias const ccachej = buffers->get_ccachej();
   const int ccache_stride = _ccache_stride;
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   #ifdef _LMP_INTEL_OFFLOAD
   double *timer_compute = fix->off_watch_pair();
   int *overflow = fix->get_off_overflow_flag();
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(p2,p2f,p2f2,p2e,p3:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
     in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       const int toffs = tid * ccache_stride;
       flt_t * _noalias const tdelx = ccachex + toffs;
       flt_t * _noalias const tdely = ccachey + toffs;
       flt_t * _noalias const tdelz = ccachez + toffs;
       flt_t * _noalias const trsq = ccachew + toffs;
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
       // loop over full neighbor list of my atoms
       flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
       flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
       if (ONETYPE) {
         cutsq = p2[3].cutsq;
         cut = p2f[3].cut;
         sigma = p2f[3].sigma;
         c1 = p2f2[3].c1;
         c2 = p2f2[3].c2;
         c3 = p2f2[3].c3;
         c4 = p2f2[3].c4;
         sigma_gamma = p2[3].sigma_gamma;
         costheta = p3[7].costheta;
         lambda_epsilon = p3[7].lambda_epsilon;
         lambda_epsilon2 = p3[7].lambda_epsilon2;
         if (SPQ == 0) {
           powerp = p2f[3].powerp;
           powerq = p2f[3].powerq;
         }
         if (EFLAG) {
           c5 = p2e[3].c5;
           c6 = p2e[3].c6;
         }
       }
 
       for (int i = iifrom; i < iito; i += iip) {
         int itype, itype_offset;
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
 
         if (!ONETYPE) {
           itype = x[i].w;
           itype_offset = itype * ntypes;
         }
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
         const int jnumhalf = numneighhalf[i];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl;
         fxtmp = fytmp = fztmp = (acc_t)0.0;
         if (EFLAG) fwtmp = sevdwl = (acc_t)0;
 
         int ejnum = 0, ejnumhalf = 0;
         #pragma vector aligned
         #pragma ivdep
         for (int jj = 0; jj < jnum; jj++) {
           int j = jlist[jj];
           j &= NEIGHMASK;
           const flt_t delx = x[j].x - xtmp;
           const flt_t dely = x[j].y - ytmp;
           const flt_t delz = x[j].z - ztmp;
           int jtype, ijtype;
           if (!ONETYPE) {
             jtype = x[j].w;
             ijtype = itype_offset + jtype;
             cutsq = p2[ijtype].cutsq;
           }
           const flt_t rsq1 = delx * delx + dely * dely + delz * delz;
           if (rsq1 < cutsq) {
             tdelx[ejnum] = delx;
             tdely[ejnum] = dely;
             tdelz[ejnum] = delz;
             trsq[ejnum] = rsq1;
             tj[ejnum] = j;
             if (!ONETYPE) tjtype[ejnum] = jtype;
             ejnum++;
             if (jj < jnumhalf) ejnumhalf++;
           }
         }
-        int ejnum_pad = ejnum;
-
-        while ( (ejnum_pad % pad_width) != 0) {
-          tdelx[ejnum_pad] = (flt_t)0.0;
-          tdely[ejnum_pad] = (flt_t)0.0;
-          tdelz[ejnum_pad] = (flt_t)0.0;
-          trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
-          tj[ejnum_pad] = nall;
-          if (!ONETYPE) tjtype[ejnum_pad] = 0;
-          ejnum_pad++;
+        // pad to a multiple of pad_width (mask assumes power of two - confirm)
+	int ejrem = ejnum & (pad_width - 1);
+	if (ejrem) ejrem = pad_width - ejrem;
+	const int ejnum_pad = ejnum + ejrem;
+	for (int jj = ejnum; jj < ejnum_pad; jj++) {
+          tdelx[jj] = (flt_t)0.0;
+          tdely[jj] = (flt_t)0.0;
+          tdelz[jj] = (flt_t)0.0;
+          trsq[jj] = p2[3].cutsq + (flt_t)1.0;   // filler rsq above p2[3].cutsq
+          tj[jj] = nall;
+          if (!ONETYPE) tjtype[jj] = 0;
         }
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
         #endif
         for (int jj = 0; jj < ejnum_pad; jj++) {
           acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
           fjxtmp = fjytmp = fjztmp = (acc_t)0.0;
           if (EFLAG) fjtmp = (acc_t)0.0;
           int ijtype;
 
           if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
           const flt_t rsq1 = trsq[jj];
 
           const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
           const flt_t r1 = (flt_t)1.0/sqrt(rinvsq1);
           if (!ONETYPE) cut = p2f[ijtype].cut;
           const flt_t rainv1 = (flt_t)1.0 / (r1 - cut);
 
           // two-body interactions, skip half of them
           flt_t rp, rq;
           if (SPQ == 1) {
             rp = r1 * r1;
             rp *= rp;
             rp = (flt_t)1.0 / rp;
             rq = (flt_t)1.0;
           } else {
             if (!ONETYPE) {
               powerp = p2f[ijtype].powerp;
               powerq = p2f[ijtype].powerq;
             }
             rp = std::pow(r1, powerp);
             rq = std::pow(r1, powerq);
           }
 
           if (!ONETYPE) {
             sigma = p2f[ijtype].sigma;
             c1 = p2f2[ijtype].c1;
             c2 = p2f2[ijtype].c2;
             c3 = p2f2[ijtype].c3;
             c4 = p2f2[ijtype].c4;
           }
 
           const flt_t rainvsq = rainv1 * rainv1 * r1;
           flt_t expsrainv = exp(sigma * rainv1);
           if (jj >= ejnumhalf) expsrainv = (flt_t)0.0;
           const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
                                rainvsq) * expsrainv * rinvsq1;
 
           const flt_t delx = tdelx[jj];
           const flt_t dely = tdely[jj];
           const flt_t delz = tdelz[jj];
           const flt_t fpx = fpair * delx;
           fxtmp -= fpx;
           fjxtmp += fpx;
           const flt_t fpy = fpair * dely;
           fytmp -= fpy;
           fjytmp += fpy;
           const flt_t fpz = fpair * delz;
           fztmp -= fpz;
           fjztmp += fpz;
 
           if (EFLAG) {
             flt_t evdwl;
             if (!ONETYPE) {
               c5 = p2e[ijtype].c5;
               c6 = p2e[ijtype].c6;
             }
             evdwl = (c5 * rp - c6 * rq) * expsrainv;
             sevdwl += evdwl;
             if (eatom) {
               fwtmp += (flt_t)0.5 * evdwl;
               fjtmp += (flt_t)0.5 * evdwl;
             }
           }
 
           /*---------------------------------------------*/
 
           int ijkoff;
           if (!ONETYPE) {
             sigma_gamma = p2[ijtype].sigma_gamma;
             ijkoff = ijtype * ntypes;
           }
 
           flt_t gsrainv1 = sigma_gamma * rainv1;
           flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1;
           flt_t expgsrainv1 = exp(gsrainv1);
 
           for (int kk = 0; kk < ejnum; kk++) {
             int iktype, ijktype;
             if (!ONETYPE) {
               iktype = tjtype[kk];
               ijktype = ijkoff + iktype;
               iktype += itype_offset;
               cut = p2[iktype].cut;
               sigma_gamma = p2[iktype].sigma_gamma;
               costheta = p3[ijktype].costheta;
               lambda_epsilon = p3[ijktype].lambda_epsilon;
               lambda_epsilon2 = p3[ijktype].lambda_epsilon2;
             }
 
             flt_t delr2[3];
             delr2[0] = tdelx[kk];
             delr2[1] = tdely[kk];
             delr2[2] = tdelz[kk];
             const flt_t rsq2 = trsq[kk];
 
             const flt_t rinvsq2 = (flt_t)1.0 / rsq2;
             const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2);
             const flt_t rainv2 = (flt_t)1.0 / (r2 - cut);
             const flt_t gsrainv2 = sigma_gamma * rainv2;
             const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
             const flt_t expgsrainv2 = exp(gsrainv2);
 
             const flt_t rinv12 = (flt_t)1.0 / (r1 * r2);
             const flt_t cs = (delx * delr2[0] + dely * delr2[1] +
                               delz * delr2[2]) * rinv12;
             const flt_t delcs = cs - costheta;
             const flt_t delcssq = delcs*delcs;
 
             flt_t kfactor;
             if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0;
             else kfactor = (flt_t)1.0;
 
             const flt_t facexp = expgsrainv1*expgsrainv2*kfactor;
             const flt_t facrad = lambda_epsilon * facexp * delcssq;
             const flt_t frad1 = facrad*gsrainvsq1;
             const flt_t frad2 = facrad*gsrainvsq2;
             const flt_t facang = lambda_epsilon2 * facexp * delcs;
             const flt_t facang12 = rinv12*facang;
             const flt_t csfacang = cs*facang;
             const flt_t csfac1 = rinvsq1*csfacang;
 
             const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12;
             const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12;
             const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12;
 
             fxtmp -= fjx;
             fytmp -= fjy;
             fztmp -= fjz;
             fjxtmp += fjx;
             fjytmp += fjy;
             fjztmp += fjz;
 
             if (EFLAG) {
               const flt_t evdwl = facrad * (flt_t)0.5;
               sevdwl += evdwl;
               if (eatom) {
                 fwtmp += (acc_t)0.33333333 * evdwl;
                 fjtmp += (acc_t)0.33333333 * facrad;
               }
             }
           } // for kk
           const int j = tj[jj];
           f[j].x += fjxtmp;
           f[j].y += fjytmp;
           f[j].z += fjztmp;
           if (EFLAG)
             if (eatom) f[j].w += fjtmp;
         } // for jj
 
         f[i].x += fxtmp;
         f[i].y += fytmp;
         f[i].z += fztmp;
 
         if (EFLAG) {
           f[i].w += fwtmp;
           oevdwl += sevdwl;
         }
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
                               x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
     } // end omp
 
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
     }
     if (vflag) {
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 #else
 
 /* ----------------------------------------------------------------------
 
 Vector intrinsics are temporarily being used for the Stillinger-Weber
 potential to allow for advanced features in the AVX512 instruction set to
 be exploited on early hardware. We hope to see compiler improvements for
 AVX512 that will eliminate this requirement, so it is not recommended to
 develop code based on the intrinsics implementation. Please e-mail the
 authors for more details.
 
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Intrinsics (SIMD) implementation of the Stillinger-Weber force
    evaluation for the atoms in [astart, aend).
 
    Template parameters:
      SPQ      when 1, the two-body term uses rp = r^-4 and rq = 1 directly;
               otherwise rp and rq are computed with SIMD_pow from the
               packed powerp / powerq values
      ONETYPE  when non-zero, all coefficients are broadcast once from fixed
               table entries instead of being gathered per neighbor
      EFLAG    when non-zero, energies are accumulated
      flt_t    type used for positions / per-pair arithmetic
      acc_t    type used for force and energy accumulation
 
    Arguments:
      offload    non-zero when this call targets the coprocessor
      vflag      non-zero when the virial is accumulated (ov0..ov5)
      buffers    source of coordinates, neighbor data and cache arrays
      fc         packed force-field coefficient tables
      astart     first atom index handled by this call
      aend       one past the last atom index handled by this call
      pad_width  only forwarded to the offload region; it is not otherwise
                 referenced in this function body
 
    Results are handed to the fix through add_result_array().
 ------------------------------------------------------------------------- */
 template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
                        const int aend, const int pad_width)
 {
   typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
   typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
   const int swidth = SIMD_type<flt_t>::width();
 
   // Nothing to do for an empty atom range
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   // numneighhalf is used below as the per-atom number of neighbors that
   // receive the two-body term (see jnumhalf / ejnumhalf)
   const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   // Flattened views of the packed two-body (p2*) and three-body (p3) tables
   const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
   const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
   const FC_PACKED1p2_T * _noalias const p2f2 = fc.p2f2[0];
   const FC_PACKED2_T * _noalias const p2e = fc.p2e[0];
   const FC_PACKED3_T * _noalias const p3 = fc.p3[0][0];
 
   // Cache arrays; each thread uses its own slice (offset by tid below)
   flt_t * _noalias const ccachex = buffers->get_ccachex();
   flt_t * _noalias const ccachey = buffers->get_ccachey();
   flt_t * _noalias const ccachez = buffers->get_ccachez();
   flt_t * _noalias const ccachew = buffers->get_ccachew();
   int * _noalias const ccachei = buffers->get_ccachei();
   int * _noalias const ccachej = buffers->get_ccachej();
   acc_t * _noalias const ccachef = buffers->get_ccachef();
   const int ccache_stride = _ccache_stride;
   const int ccache_stride3 = _ccache_stride3;
 
   // Row length of the type-indexed coefficient tables (number of types + 1)
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   #ifdef _LMP_INTEL_OFFLOAD
   double *timer_compute = fix->off_watch_pair();
   int *overflow = fix->get_off_overflow_flag();
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(p2,p2f,p2f2,p2e,p3:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
     in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
     in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
     in(ccache_stride3)                                          \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     // On the coprocessor, record the start time; the elapsed time is
     // stored back into *timer_compute at the end of this region
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
                               f_stride, x, 0);
 
     // Global energy / virial accumulators (only initialized when used)
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       // Range of atoms [iifrom, iito) with step iip assigned to this thread
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads,
                                swidth);
 
       iifrom += astart;
       iito += astart;
 
       // Each thread accumulates into its own zeroed force array
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       // Per-thread slices of the neighbor cache:
       //   tdelx/tdely/tdelz  displacement to each in-cutoff neighbor
       //   trsq               squared distance
       //   tj / tjtype        neighbor index / neighbor type
       //   tf                 accumulated force on each cached neighbor
       const int toffs = tid * ccache_stride;
       flt_t * _noalias const tdelx = ccachex + toffs;
       flt_t * _noalias const tdely = ccachey + toffs;
       flt_t * _noalias const tdelz = ccachez + toffs;
       flt_t * _noalias const trsq = ccachew + toffs;
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
       acc_t * _noalias const tf = ccachef + tid * ccache_stride3;
 
       // loop over full neighbor list of my atoms
 
       SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6;
       SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
       // With a single type the coefficients are loop invariant, so they are
       // broadcast once. Index 3 is entry [1][1] of a 2x2 table and index 7
       // is entry [1][1][1] of a 2x2x2 table.
       // NOTE(review): assumes ONETYPE is only used when the tables have
       // dimension 2 (one atom type) - confirm against the caller.
       if (ONETYPE) {
         cutsq = SIMD_set(p2[3].cutsq);
         cut = SIMD_set(p2f[3].cut);
         sigma = SIMD_set(p2f[3].sigma);
         c1 = SIMD_set(p2f2[3].c1);
         c2 = SIMD_set(p2f2[3].c2);
         c3 = SIMD_set(p2f2[3].c3);
         c4 = SIMD_set(p2f2[3].c4);
         sigma_gamma = SIMD_set(p2[3].sigma_gamma);
         costheta = SIMD_set(p3[7].costheta);
         lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
         lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
         if (SPQ == 0) {
           powerp = SIMD_set(p2f[3].powerp);
           powerq = SIMD_set(p2f[3].powerq);
         }
         if (EFLAG) {
           c5 = SIMD_set(p2e[3].c5);
           c6 = SIMD_set(p2e[3].c6);
         }
       }
 
       // ilist holds the atom index handled by each lane; goffset holds the
       // per-lane offsets (multiples of 16) used when gathering/scattering
       // atom records relative to atom i.
       // NOTE(review): both are written for 16 lanes - presumably the only
       // SIMD width this path is built for.
       SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
       const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
                                         144,160,176,192,208,224,240);
       ilist = ilist + iifrom;
       acc_t * const dforce = &(f[0].x);
       for (int i = iifrom; i < iito; i += iip) {
         // Lanes past the end of this thread's range are masked off
         SIMD_mask imask = ilist < iito;
         SIMD_flt_t xtmp, ytmp, ztmp;
         SIMD_int itype, itype_offset;
 
         if (ONETYPE)
           SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp);
         else {
           SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype);
           itype_offset = itype * ntypes;
         }
 
         // ng tracks the position in the neighbor list; it starts one step
         // before the first entry because it is advanced at the top of the
         // jj loop below
         #ifdef OUTER_CHUNK
         const int* ng = firstneigh + cnumneigh[i] - swidth;
         #else
         SIMD_int ng = SIMD_load(cnumneigh + i);
         ng = ng - 1;
         #endif
         const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
         const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
         const int jnum_max = SIMD_max(jnum);
 
         // Force / energy accumulators for atom i. The "...2" variables are
         // only used when flt_t and acc_t differ.
         SIMD_acc_t fxtmp = SIMD_set((acc_t)0);
         SIMD_acc_t fytmp = SIMD_set((acc_t)0);
         SIMD_acc_t fztmp = SIMD_set((acc_t)0);
         SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2;
         if (is_same<flt_t,acc_t>::value == 0) {
           fxtmp2 = SIMD_set((acc_t)0);
           fytmp2 = SIMD_set((acc_t)0);
           fztmp2 = SIMD_set((acc_t)0);
           if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
         }
 
         SIMD_acc_t sevdwl;
         if (EFLAG) {
           fwtmp = SIMD_set((acc_t)0);
           sevdwl = SIMD_set((acc_t)0);
         }
 
         // ---- Pass 1: build the per-lane cache of neighbors inside cutsq ----
         // ejnum counts the cached neighbors per lane, ejnumhalf counts those
         // whose list position is below jnumhalf, and coffset is the per-lane
         // write position in the cache (advanced by swidth per stored entry).
         SIMD_int ejnum = SIMD_set(0);
         SIMD_int ejnumhalf = SIMD_set(0);
         SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                     11, 12, 13, 14, 15);
         for (int jj = 0; jj < jnum_max; jj++) {
           SIMD_mask jmask = jj < jnum;
 
           #ifdef OUTER_CHUNK
           ng += swidth;
           SIMD_int j = SIMD_load(ng);
           #else
           ng = ng + 1;
           SIMD_int j = SIMD_gather(jmask, firstneigh, ng);
           #endif
           j = j & SIMD_set(NEIGHMASK);
           // j * 16: offset of neighbor j for the atom gather (same unit as
           // goffset above)
           const SIMD_int joffset = j << 4;
 
           SIMD_flt_t delx, dely, delz;
           SIMD_int jtype, ijtype;
           if (ONETYPE)
             SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz);
           else {
             SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz,
                              jtype);
             ijtype = (jtype + itype_offset) << 2;
             cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype);
           }
 
           // Displacement from atom i to neighbor j and its squared length
           delx = delx - xtmp;
           dely = dely - ytmp;
           delz = delz - ztmp;
           SIMD_flt_t rsq1 = delx * delx;
           rsq1 = SIMD_fma(dely, dely, rsq1);
           rsq1 = SIMD_fma(delz, delz, rsq1);
 
           // Store only the lanes whose neighbor is inside the cutoff
           const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq);
           SIMD_scatter(rmask, tdelx, coffset, delx);
           SIMD_scatter(rmask, tdely, coffset, dely);
           SIMD_scatter(rmask, tdelz, coffset, delz);
           SIMD_scatter(rmask, trsq, coffset, rsq1);
           SIMD_scatter(rmask, tj, coffset, j);
           if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype);
           ejnum = SIMD_add(rmask, ejnum, 1);
           coffset = SIMD_add(rmask, coffset, swidth);
           const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf);
           ejnumhalf = SIMD_add(hmask, ejnumhalf, 1);
         }
 
         // ---- Pass 2: two-body and three-body terms over the cache ----
         const int ejnum_max = SIMD_max(ejnum);
         const int ejnumhalf_max = SIMD_max(ejnumhalf);
         // Clear the cached-neighbor force slots used by this group of atoms
         memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3);
         for (int jj = 0; jj < ejnum_max; jj++) {
           SIMD_int ijtype;
           const int coffset = jj * swidth;
           if (!ONETYPE) {
             ijtype = SIMD_load(tjtype + coffset);
             ijtype = (ijtype + itype_offset) << 2;
             cut = SIMD_gather(&(p2f[0].cut), ijtype);
           }
 
           // Force / energy accumulated on neighbor j during this iteration
           SIMD_acc_t fjxtmp = SIMD_set((acc_t)0);
           SIMD_acc_t fjytmp = SIMD_set((acc_t)0);
           SIMD_acc_t fjztmp = SIMD_set((acc_t)0);
           SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2;
           if (EFLAG) fjtmp = SIMD_set((acc_t)0.0);
 
           if (is_same<flt_t,acc_t>::value == 0) {
             fjxtmp2 = SIMD_set((acc_t)0);
             fjytmp2 = SIMD_set((acc_t)0);
             fjztmp2 = SIMD_set((acc_t)0);
             if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0);
           }
 
           const SIMD_flt_t delx = SIMD_load(tdelx + coffset);
           const SIMD_flt_t dely = SIMD_load(tdely + coffset);
           const SIMD_flt_t delz = SIMD_load(tdelz + coffset);
           const SIMD_flt_t rsq1 = SIMD_load(trsq + coffset);
 
           // r1 is obtained as the inverse square root of 1/rsq1
           const SIMD_flt_t rinvsq1 = SIMD_rcp(rsq1);
           const SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1);
           const SIMD_flt_t rainv1 = SIMD_rcp(r1 - cut);
 
           // two-body interactions, skip half of them
           if (jj < ejnumhalf_max) {
             SIMD_flt_t rp, rq;
             if (SPQ == 1) {
               // Special case: rp = r^-4, rq = 1 (no pow() calls needed)
               rp = r1 * r1;
               rp = rp * rp;
               rp = SIMD_rcp(rp);
               rq = SIMD_set((flt_t)1.0);
             } else {
               if (!ONETYPE) {
                 powerp = SIMD_gather(&(p2f[0].powerp), ijtype);
                 powerq = SIMD_gather(&(p2f[0].powerq), ijtype);
               }
               rp = SIMD_pow(r1, powerp);
               rq = SIMD_pow(r1, powerq);
             }
 
             if (!ONETYPE) {
               sigma = SIMD_gather(&(p2f[0].sigma), ijtype);
               c1 = SIMD_gather(&(p2f2[0].c1), ijtype);
               c2 = SIMD_gather(&(p2f2[0].c2), ijtype);
               c3 = SIMD_gather(&(p2f2[0].c3), ijtype);
               c4 = SIMD_gather(&(p2f2[0].c4), ijtype);
             }
 
             const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1;
             const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1);
             const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
                                       rainvsq) * expsrainv * rinvsq1;
 
             const SIMD_flt_t fjx = delx * fpair;
             const SIMD_flt_t fjy = dely * fpair;
             const SIMD_flt_t fjz = delz * fpair;
 
             // Only lanes that still have a two-body neighbor at this index
             // take part in the accumulation
             const SIMD_mask hmask = jj < ejnumhalf;
             SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp,
                              fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
                              fztmp2, fjxtmp2, fjytmp2, fjztmp2);
 
             if (EFLAG) {
               if (!ONETYPE) {
                 c5 = SIMD_gather(&(p2e[0].c5), ijtype);
                 c6 = SIMD_gather(&(p2e[0].c6), ijtype);
               }
               SIMD_flt_t evdwl;
               evdwl = (c5 * rp - c6 * rq) * expsrainv;
               SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
                                fwtmp2, fjtmp2);
             }
           }
 
           /*---------------------------------------------*/
           // Three-body terms: quantities that depend only on the i-j leg
           SIMD_int ijkoff;
           if (!ONETYPE) {
             sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype);
             ijkoff = ijtype * ntypes;
           }
           const SIMD_flt_t gsrainv1 = sigma_gamma * rainv1;
           const SIMD_flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1;
           const SIMD_flt_t expgsrainv1 = SIMD_exp(gsrainv1);
 
           const SIMD_mask jmask = jj < ejnum;
           // Each unordered (j,k) pair of cached neighbors is visited once
           for (int kk = jj+1; kk < ejnum_max; kk++) {
             SIMD_int iktype, ijktype;
             const int kcoffset = kk * swidth;
             if (!ONETYPE) {
               iktype = SIMD_load(tjtype + kcoffset);
               ijktype = ijkoff + (iktype << 2);
               iktype = (iktype + itype_offset) << 2;
               // cut and sigma_gamma now refer to the i-k leg; the i-j values
               // are reloaded at the top of the next jj iteration
               cut = SIMD_gather(&(p2[0].cut), iktype);
               sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype);
               costheta = SIMD_gather(&(p3[0].costheta), ijktype);
               lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype);
               lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype);
             }
             const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset);
             const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset);
             const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset);
             const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset);
 
             const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2);
             const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2);
             const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut);
             const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2;
             const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
             const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2);
             const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2);
             // cs: cosine of the angle between the i-j and i-k displacements
             const SIMD_flt_t cs = (delx * delr2x + dely * delr2y +
                               delz * delr2z) * rinv12;
             const SIMD_flt_t delcs = cs - costheta;
             const SIMD_flt_t delcssq = delcs*delcs;
 
             const SIMD_flt_t facexp = expgsrainv1*expgsrainv2;
             const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq;
             const SIMD_flt_t frad1 = facrad * gsrainvsq1;
             const SIMD_flt_t frad2 = facrad * gsrainvsq2;
             const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs;
             const SIMD_flt_t facang12 = rinv12 * facang;
             const SIMD_flt_t csfacang = cs * facang;
 
             // Three-body force on neighbor j
             const SIMD_flt_t csfac1 = rinvsq1 * csfacang;
             const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12;
             const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12;
             const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12;
 
             // Three-body force on neighbor k
             const SIMD_flt_t csfac2 = rinvsq2 * csfacang;
             SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2);
             SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2);
             SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2);
 
             // Lanes for which both j (jmask) and k (kk < ejnum) exist
             const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum);
 
             // Accumulates the i and j forces in registers; the k slot of the
             // tf cache is passed so the k force can be accumulated there
             // (exact update is defined by SIMD_acc_cache3)
             SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp,
                             fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
                             fztmp2, fjxtmp2, fjytmp2, fjztmp2,
                             tf + kcoffset * 3, swidth);
 
             if (EFLAG) {
               SIMD_int k;
               if (eatom) {
                 k = SIMD_load(tj + kcoffset);
                 k = k << 4;
               }
               SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
                              fwtmp2, fjtmp2, k, dforce);
             }
           } // for kk
           // Hand the force accumulated on neighbor j to its tf cache slot
           if (is_same<flt_t,acc_t>::value == 1)
             SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
           else
             SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp,
                         fjxtmp2, fjytmp2, fjztmp2);
 
           if (EFLAG) {
             if (eatom) {
               // Per-atom energy of neighbor j (component 3 of its record)
               SIMD_int j = SIMD_load(tj + coffset);
               j = j << 4;
               SIMD_jeng_update(jmask, dforce + 3, j, fjtmp);
               if (is_same<flt_t,acc_t>::value == 0)
                 SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2);
             }
           }
         } // for jj first loop
 
         // ---- Pass 3: add the cached neighbor forces to the force array ----
         for (int jj = 0; jj < ejnum_max; jj++) {
           const int coffset = jj * swidth;
           const SIMD_mask jmask = jj < ejnum;
           const SIMD_int j = SIMD_load(tj + coffset);
           const SIMD_int joffset = j << 4;
 
           // Read x, y and z back from tf. When acc_t differs from flt_t each
           // component is read as two vectors, foffset (= swidth / 2) apart.
           SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2;
           int foffset = swidth;
           if (is_same<flt_t,acc_t>::value == 0) foffset = foffset >> 1;
           acc_t *p = tf + coffset * 3;
           fjxtmp = SIMD_load(p);
           if (is_same<flt_t,acc_t>::value == 0) {
             p = p + foffset;
             fjxtmp2 = SIMD_load(p);
           }
           p = p + foffset;
           fjytmp = SIMD_load(p);
           if (is_same<flt_t,acc_t>::value == 0) {
             p = p + foffset;
             fjytmp2 = SIMD_load(p);
           }
           p = p + foffset;
           fjztmp = SIMD_load(p);
           if (is_same<flt_t,acc_t>::value == 0) {
             p = p + foffset;
             fjztmp2 = SIMD_load(p);
           }
 
           // NOTE(review): presumably the conflict reduction merges lanes that
           // refer to the same neighbor before the update - confirm in the
           // SIMD helper definitions.
           SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp);
           SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp,
                              fjztmp);
           if (is_same<flt_t,acc_t>::value == 0) {
             // Second half: shuffled offsets and the mask shifted down by 8
             SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
             SIMD_mask jmask2 = jmask >> 8;
             SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2,
                                      fjztmp2);
             SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2,
                                fjztmp2);
           }
         } // for jj second loop
 
         // Write back the force (and optional per-atom energy) on atoms i
         SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
                            EFLAG, eatom, fwtmp);
         if (is_same<flt_t,acc_t>::value == 0) {
           imask = imask >> 8;
           SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
                              fztmp2, EFLAG, eatom, fwtmp2);
         }
         if (EFLAG) oevdwl += SIMD_sum(sevdwl);
         ilist = ilist + iip;
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
                               x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
     } // end omp
 
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     // ev_global layout: [0] vdW energy, [1] set to zero, [2..7] virial
     if (EFLAG) {
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
     }
     if (vflag) {
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   // Pass the energy/virial array along only when something was tallied
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 void PairSWIntel::allocate()
 {
   // No additional storage is set up here; allocation is delegated
   // entirely to the base class implementation.
   PairSW::allocate();
 }
 
 /* ----------------------------------------------------------------------
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
 void PairSWIntel::init_style()
 {
   PairSW::init_style();
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
   map[0] = map[1];
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check(true);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     pack_force_const(force_const_single, fix->get_mixed_buffers());
     fix->get_mixed_buffers()->need_tag(1);
   } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     pack_force_const(force_const_double, fix->get_double_buffers());
     fix->get_double_buffers()->need_tag(1);
   } else {
     pack_force_const(force_const_single, fix->get_single_buffers());
     fix->get_single_buffers()->need_tag(1);
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (fix->offload_noghost())
     error->all(FLERR,"The 'ghost no' option cannot be used with sw/intel.");
   #endif
 
   #if defined(__INTEL_COMPILER)
   if (__INTEL_COMPILER_BUILD_DATE < 20141023)
     error->all(FLERR, "Intel compiler versions before "
                "15 Update 1 not supported for sw/intel");
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
                                    IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Copies the Stillinger-Weber parameters into the packed, type-indexed
   // tables of fc, sizes the per-thread neighbor cache in buffers, and (in
   // offload builds with a coprocessor) transfers the tables to the device.
 
   // Cache width: one SIMD vector of flt_t with LMP_USE_AVXCD, otherwise 1
   #ifdef LMP_USE_AVXCD
   const int swidth = SIMD_type<flt_t>::width();
   fix->nbor_pack_width(swidth);
   #else
   const int swidth = 1;
   #endif
   fix->three_body_neighbor(1);
 
   #ifdef _LMP_INTEL_OFFLOAD
   const int off_ccache = (_cop >= 0) ? 1 : 0;
   #else
   const int off_ccache = 0;
   #endif
 
   buffers->grow_ccache(off_ccache, comm->nthreads, swidth);
   _ccache_stride = buffers->ccache_stride();
   #ifdef LMP_USE_AVXCD
   _ccache_stride3 = buffers->ccache_stride3();
   #endif
 
   const int ntypes = atom->ntypes;
   const int tp1 = ntypes + 1;
   fc.set_ntypes(tp1,memory,_cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   for (int i = 1; i <= ntypes; i++) {
     for (int j = i; j <= ntypes; j++) {
       // Skip pairs that are neither set directly nor derivable from the
       // two diagonal entries
       if (setflag[i][j] == 0 && (setflag[i][i] == 0 || setflag[j][j] == 0))
         continue;
       const double cut = init_one(i,j);
       const double cutneigh = cut + neighbor->skin;
       cutsq[i][j] = cutsq[j][i] = cut*cut;
       cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
     }
   }
 
   _onetype = (ntypes == 1) ? 1 : 0;
 
   // _spq stays 1 only if every mapped pair uses powerp == 4 and powerq == 0
   _spq = 1;
   for (int ii = 0; ii < tp1; ii++) {
     const int i = map[ii];
     for (int jj = 0; jj < tp1; jj++) {
       const int j = map[jj];
 
       // Type 0 and unmapped types get all-zero coefficients
       const bool pair_ok = (ii != 0 && jj != 0 && i >= 0 && j >= 0);
       if (pair_ok) {
         const int pair_param = elem2param[i][j][j];
         fc.p2[ii][jj].cutsq = params[pair_param].cutsq;
         fc.p2[ii][jj].cut = params[pair_param].cut;
         fc.p2[ii][jj].sigma_gamma = params[pair_param].sigma_gamma;
         fc.p2f[ii][jj].cut = params[pair_param].cut;
         // The powers are stored negated
         fc.p2f[ii][jj].powerp = -params[pair_param].powerp;
         fc.p2f[ii][jj].powerq = -params[pair_param].powerq;
         fc.p2f[ii][jj].sigma = params[pair_param].sigma;
         fc.p2f2[ii][jj].c1 = params[pair_param].c1;
         fc.p2f2[ii][jj].c2 = params[pair_param].c2;
         fc.p2f2[ii][jj].c3 = params[pair_param].c3;
         fc.p2f2[ii][jj].c4 = params[pair_param].c4;
         fc.p2e[ii][jj].c5 = params[pair_param].c5;
         fc.p2e[ii][jj].c6 = params[pair_param].c6;
 
         // Shrink the packed squared cutoff when it is not already below
         // cut*cut
         const double cut_squared = params[pair_param].cut *
                                    params[pair_param].cut;
         if (params[pair_param].cutsq >= cut_squared)
           fc.p2[ii][jj].cutsq *= 0.98;
 
         if (params[pair_param].powerp != 4.0 ||
             params[pair_param].powerq != 0.0)
           _spq = 0;
       } else {
         fc.p2[ii][jj].cutsq = 0;
         fc.p2[ii][jj].cut = 0;
         fc.p2[ii][jj].sigma_gamma = 0;
         fc.p2f[ii][jj].cut = 0;
         fc.p2f[ii][jj].powerp = 0;
         fc.p2f[ii][jj].powerq = 0;
         fc.p2f[ii][jj].sigma = 0;
         fc.p2f2[ii][jj].c1 = 0;
         fc.p2f2[ii][jj].c2 = 0;
         fc.p2f2[ii][jj].c3 = 0;
         fc.p2f2[ii][jj].c4 = 0;
         fc.p2e[ii][jj].c5 = 0;
         fc.p2e[ii][jj].c6 = 0;
       }
 
       // Three-body coefficients for every third type
       for (int kk = 0; kk < tp1; kk++) {
         const int k = map[kk];
         const bool triple_ok = (pair_ok && kk != 0 && k >= 0);
         if (triple_ok) {
           const int triple_param = elem2param[i][j][k];
           fc.p3[ii][jj][kk].costheta = params[triple_param].costheta;
           fc.p3[ii][jj][kk].lambda_epsilon =
             params[triple_param].lambda_epsilon;
           fc.p3[ii][jj][kk].lambda_epsilon2 =
             params[triple_param].lambda_epsilon2;
         } else {
           fc.p3[ii][jj][kk].costheta = 0;
           fc.p3[ii][jj][kk].lambda_epsilon = 0;
           fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
         }
       }
     }
   }
 
   _offload_pad = 1;
   _host_pad = 1;
   if (INTEL_NBOR_PAD > 1)
     _host_pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
 
   // Copy the freshly packed tables to the coprocessor, if there is one
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   FC_PACKED0_T *op2 = fc.p2[0];
   FC_PACKED1_T *op2f = fc.p2f[0];
   FC_PACKED1p2_T *op2f2 = fc.p2f2[0];
   FC_PACKED2_T *op2e = fc.p2e[0];
   FC_PACKED3_T *op3 = fc.p3[0][0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   int tp1cu = tp1sq * tp1;
   if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
       op3 != NULL && ocutneighsq != NULL) {
     #pragma offload_transfer target(mic:_cop) \
       in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0))     \
       in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
       in(ocutneighsq: length(tp1sq))
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    (Re)size the packed coefficient tables for a new number of types.
 
    ntypes  new table dimension (tables are ntypes x ntypes, p3 is
            ntypes x ntypes x ntypes); 0 releases the tables
    memory  allocator used to create the new tables; it is remembered and
            used to free them on the next resize
    cop     coprocessor number; a device copy is only allocated when
            cop >= 0 in an offload build
 
    Nothing is reallocated when ntypes equals the current size.
 ------------------------------------------------------------------------- */
 template <class flt_t>
 void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                 Memory *memory,
                                                 const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       // Release the device copies first; they are keyed by the host data
       // pointers, so this must happen before the host memory is freed.
       fc_packed0 *op2 = p2[0];
       fc_packed1 *op2f = p2f[0];
       fc_packed1p2 *op2f2 = p2f2[0];
       fc_packed2 *op2e = p2e[0];
       fc_packed3 *op3 = p3[0][0];
 
       if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
           op3 != NULL && _cop >= 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(op2, op2f, op2f2, op2e, op3: alloc_if(0) free_if(1))
       }
       #endif
 
       // Destroy the multi-dimensional arrays themselves. Destroying only
       // the data pointers (p2[0], p3[0][0], ...) would free the data block
       // but leak the row-pointer arrays and leave the members dangling.
       _memory->destroy(p2);
       _memory->destroy(p2f);
       _memory->destroy(p2f2);
       _memory->destroy(p2e);
       _memory->destroy(p3);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(p2,ntypes,ntypes,"fc.p2");
       memory->create(p2f,ntypes,ntypes,"fc.p2f");
       memory->create(p2f2,ntypes,ntypes,"fc.p2f2");
       memory->create(p2e,ntypes,ntypes,"fc.p2e");
       memory->create(p3,ntypes,ntypes,ntypes,"fc.p3");
 
       #ifdef _LMP_INTEL_OFFLOAD
       // Allocate (without copying) matching storage on the coprocessor
       fc_packed0 *op2 = p2[0];
       fc_packed1 *op2f = p2f[0];
       fc_packed1p2 *op2f2 = p2f2[0];
       fc_packed2 *op2e = p2e[0];
       fc_packed3 *op3 = p3[0][0];
       int tp1sq = ntypes * ntypes;
       int tp1cu = tp1sq * ntypes;
       if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
           op3 != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(op3: length(tp1cu) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes = ntypes;
   _memory = memory;
 }