diff --git a/doc/_sources/package.txt b/doc/_sources/package.txt
index f3a13e422..b17dba9a9 100644
--- a/doc/_sources/package.txt
+++ b/doc/_sources/package.txt
@@ -1,689 +1,686 @@
 .. index:: package
 
 package command
 ===============
 
 Syntax
 """"""
 
 .. parsed-literal::
 
    package style args
 
 * style = *cuda* or *gpu* or *intel* or *kokkos* or *omp*
 * args = arguments specific to the style
 
 .. parsed-literal::
 
      *cuda* args = Ngpu keyword value ...
        Ngpu = # of GPUs per node
        zero or more keyword/value pairs may be appended
        keywords = *newton* or *gpuID* or *timing* or *test* or *thread*
          *newton* = *off* or *on*
            off = set Newton pairwise and bonded flags off (default)
            on = set Newton pairwise and bonded flags on
          *gpuID* values = gpu1 .. gpuN
            gpu1 .. gpuN = IDs of the Ngpu GPUs to use
          *timing* values = none
          *test* values = id
            id = atom-ID of a test particle
          *thread* = auto or tpa or bpa
            auto = test whether tpa or bpa is faster
            tpa = one thread per atom
            bpa = one block per atom
      *gpu* args = Ngpu keyword value ...
        Ngpu = # of GPUs per node
        zero or more keyword/value pairs may be appended 
        keywords = *neigh* or *newton* or *binsize* or *split* or *gpuID* or *tpa* or *device* or *blocksize*
          *neigh* value = *yes* or *no*
            yes = neighbor list build on GPU (default)
            no = neighbor list build on CPU
          *newton* = *off* or *on*
            off = set Newton pairwise flag off (default and required)
            on = set Newton pairwise flag on (currently not allowed)
          *binsize* value = size
            size = bin size for neighbor list construction (distance units)
          *split* = fraction
            fraction = fraction of atoms assigned to GPU (default = 1.0)
          *gpuID* values = first last
            first = ID of first GPU to be used on each node
            last = ID of last GPU to be used on each node
          *tpa* value = Nthreads
            Nthreads = # of GPU threads used per atom
          *device* value = device_type
            device_type = *kepler* or *fermi* or *cypress* or *generic*
          *blocksize* value = size
            size = thread block size for pair force computation
      *intel* args = NPhi keyword value ...
        Nphi = # of coprocessors per node
        zero or more keyword/value pairs may be appended 
        keywords = *omp* or *mode* or *balance* or *ghost* or *tpc* or *tptask* or *no_affinity*
          *omp* value = Nthreads
            Nthreads = number of OpenMP threads to use on CPU (default = 0)
          *mode* value = *single* or *mixed* or *double*
            single = perform force calculations in single precision
            mixed = perform force calculations in mixed precision
            double = perform force calculations in double precision
          *balance* value = split
            split = fraction of work to offload to coprocessor, -1 for dynamic
          *ghost* value = *yes* or *no*
            yes = include ghost atoms for offload
            no = do not include ghost atoms for offload
          *tpc* value = Ntpc
            Ntpc = max number of coprocessor threads per coprocessor core (default = 4)
          *tptask* value = Ntptask
            Ntptask = max number of coprocessor threads per MPI task (default = 240)
          *no_affinity* values = none
      *kokkos* args = keyword value ...
        zero or more keyword/value pairs may be appended
        keywords = *neigh* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward*
-         *neigh* value = *full* or *half/thread* or *half* or *n2* or *full/cluster*
+         *neigh* value = *full* or *half* or *n2* or *full/cluster*
            full = full neighbor list
-           half/thread = half neighbor list built in thread-safe manner
-           half = half neighbor list, not thread-safe, only use when 1 thread/MPI task
+           half = half neighbor list built in thread-safe manner
            n2 = non-binning neighbor list build, O(N^2) algorithm
            full/cluster = full neighbor list with clustered groups of atoms
          *newton* = *off* or *on*
            off = set Newton pairwise and bonded flags off (default)
            on = set Newton pairwise and bonded flags on
          *binsize* value = size
            size = bin size for neighbor list construction (distance units)
          *comm* value = *no* or *host* or *device*
            use value for both comm/exchange and comm/forward
          *comm/exchange* value = *no* or *host* or *device*
          *comm/forward* value = *no* or *host* or *device*
            no = perform communication pack/unpack in non-KOKKOS mode
            host = perform pack/unpack on host (e.g. with OpenMP threading)
            device = perform pack/unpack on device (e.g. on GPU)
      *omp* args = Nthreads keyword value ...
        Nthread = # of OpenMP threads to associate with each MPI process
        zero or more keyword/value pairs may be appended 
        keywords = *neigh*
          *neigh* value = *yes* or *no*
            yes = threaded neighbor list build (default)
            no = non-threaded neighbor list build
 
 
 
 Examples
 """"""""
 
 .. parsed-literal::
 
    package gpu 1
    package gpu 1 split 0.75
    package gpu 2 split -1.0
    package cuda 2 gpuID 0 2
    package cuda 1 test 3948
-   package kokkos neigh half/thread comm device
+   package kokkos neigh half comm device
    package omp 0 neigh no
    package omp 4
    package intel 1
    package intel 2 omp 4 mode mixed balance 0.5
 
 Description
 """""""""""
 
 This command invokes package-specific settings for the various
 accelerator packages available in LAMMPS.  Currently the following
 packages use settings from this command: USER-CUDA, GPU, USER-INTEL,
 KOKKOS, and USER-OMP.
 
 If this command is specified in an input script, it must be near the
 top of the script, before the simulation box has been defined.  This
 is because it specifies settings that the accelerator packages use in
 their initialization, before a simulation is defined.
 
 This command can also be specified from the command-line when
 launching LAMMPS, using the "-pk" :ref:`command-line switch <start_7>`.  The syntax is exactly the same as
 when used in an input script.
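 
 For example, the settings from the input script line "package gpu 1
 split 0.75" could equivalently be supplied at launch time (the
 executable name here is only illustrative):
 
 .. parsed-literal::
 
    lmp_machine -sf gpu -pk gpu 1 split 0.75 -in in.script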
 
 Note that all of the accelerator packages require the package command
 to be specified (except the OPT package), if the package is to be used
 in a simulation (LAMMPS can be built with an accelerator package
 without using it in a particular simulation).  However, in all cases,
 a default version of the command is typically invoked by other
 accelerator settings.
 
 The USER-CUDA and KOKKOS packages require a "-c on" or "-k on"
 :ref:`command-line switch <start_7>` respectively, which
 invokes a "package cuda" or "package kokkos" command with default
 settings.
 
 For the GPU, USER-INTEL, and USER-OMP packages, if a "-sf gpu" or "-sf
 intel" or "-sf omp" :ref:`command-line switch <start_7>`
 is used to auto-append accelerator suffixes to various styles in the
 input script, then those switches also invoke a "package gpu",
 "package intel", or "package omp" command with default settings.
 
 .. note::
 
    A package command for a particular style can be invoked multiple
    times when a simulation is set up, e.g. by the "-c on", "-k on", "-sf",
    and "-pk" :ref:`command-line switches <start_7>`, and by
    using this command in an input script.  Each time it is used all of
    the style options are set, either to default values or to specified
    settings.  I.e. settings from previous invocations do not persist
    across multiple invocations.
 
 See the :doc:`Section Accelerate <Section_accelerate>` section of the
 manual for more details about using the various accelerator packages
 for speeding up LAMMPS simulations.
 
 
 ----------
 
 
 The *cuda* style invokes settings associated with the use of the
 USER-CUDA package.
 
 The *Ngpus* argument sets the number of GPUs per node.  There must be
 exactly one MPI task per GPU, as set by the mpirun or mpiexec command.
 
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
 The *newton* keyword sets the Newton flags for pairwise and bonded
 interactions to *off* or *on*, the same as the :doc:`newton <newton>`
 command allows.  The default is *off* because this will almost always
 give better performance for the USER-CUDA package.  This means
 more computation is done, but less communication.
 
 The *gpuID* keyword allows selection of which GPUs on each node will
 be used for a simulation.  GPU IDs range from 0 to N-1 where N is the
 physical number of GPUs/node.  An ID is specified for each of the
 Ngpus being used.  For example, if you have three GPUs on a machine,
 one of which is used for the X-Server (the GPU with ID 1) while
 the others (with IDs 0 and 2) are used for computations, you would
 specify:
 
 .. parsed-literal::
 
    package cuda 2 gpuID 0 2
 
 The purpose of the *gpuID* keyword is to allow two (or more)
 simulations to be run on one workstation.  In that case one could set
 the first simulation to use GPU 0 and the second to use GPU 1. This is
 not necessary, however, if the GPUs are in what is called *compute
 exclusive* mode.  Using that setting, every process will get its own
 GPU automatically.  This *compute exclusive* mode can be set as root
 using the *nvidia-smi* tool which is part of the CUDA installation.
 
 Also note that if the *gpuID* keyword is not used, the USER-CUDA
 package sorts existing GPUs on each node according to their number of
 multiprocessors.  This way, compute GPUs will be prioritized over
 X-Server GPUs.
 
 If the *timing* keyword is specified, detailed timing information for
 various subroutines will be output.
 
 If the *test* keyword is specified, information for the atom with the
 specified atom-ID will be output at several points during each timestep.
 This is mainly useful for debugging purposes.  Note that the
 simulation will slow down dramatically if this option is used.
 
 The *thread* keyword can be used to specify how GPU threads are
 assigned work during pair style force evaluation.  If the value =
 *tpa*, one thread per atom is used.  If the value = *bpa*, one block
 per atom is used.  If the value = *auto*, a short test is performed at
 the beginning of each run to determine whether *tpa* or *bpa* mode is
 faster.  The result of this test is output.  Since *auto* is the
 default value, it is usually not necessary to use this keyword.
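 
 For example, a hypothetical run that skips the short timing test and
 always uses one block per atom would be set up as:
 
 .. parsed-literal::
 
    package cuda 1 thread bpa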
 
 
 ----------
 
 
 The *gpu* style invokes settings associated with the use of the GPU
 package.
 
 The *Ngpu* argument sets the number of GPUs per node.  There must be
 at least as many MPI tasks per node as GPUs, as set by the mpirun or
 mpiexec command.  If there are more MPI tasks (per node) 
 than GPUs, multiple MPI tasks will share each GPU.
 
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
 The *neigh* keyword specifies where neighbor lists for pair style
 computation will be built.  If *neigh* is *yes*, which is the default,
 neighbor list building is performed on the GPU.  If *neigh* is *no*,
 neighbor list building is performed on the CPU.  GPU neighbor list
 building currently cannot be used with a triclinic box.  GPU neighbor
 list calculation currently cannot be used with
 :doc:`hybrid <pair_hybrid>` pair styles.  GPU neighbor lists are not
 compatible with commands that are not GPU-enabled.  When a non-GPU
 enabled command requires a neighbor list, it will also be built on the
 CPU.  In these cases, it will typically be more efficient to only use
 CPU neighbor list builds.
 
 The *newton* keyword sets the Newton flags for pairwise (not bonded)
 interactions to *off* or *on*, the same as the :doc:`newton <newton>`
 command allows.  Currently, only an *off* value is allowed, since all
 the GPU package pair styles require this setting.  This means more
 computation is done, but less communication.  In the future a value of
 *on* may be allowed, so the *newton* keyword is included as an option
 for compatibility with the package command for other accelerator
 styles.  Note that the newton setting for bonded interactions is not
 affected by this keyword.
 
 The *binsize* keyword sets the size of bins used to bin atoms in
 neighbor list builds performed on the GPU, if *neigh* = *yes* is set.
 If *binsize* is set to 0.0 (the default), then bins = the size of the
 pairwise cutoff + neighbor skin distance.  This is 2x larger than the
 LAMMPS default used for neighbor list building on the CPU.  This will
 be close to optimal for the GPU, so you do not normally need to use
 this keyword.  Note that if you use a longer-than-usual pairwise
 cutoff, e.g. to allow for a smaller fraction of KSpace work with a
 :doc:`long-range Coulombic solver <kspace_style>` because the GPU is
 faster at performing pairwise interactions, then it may be optimal to
 make the *binsize* smaller than the default.  For example, with a
 cutoff of 20*sigma in LJ :doc:`units <units>` and a neighbor skin
 distance of sigma, a *binsize* = 5.25*sigma can be more efficient than
 the default.
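 
 Continuing that example (and assuming LJ units with sigma = 1), such a
 setting could be requested explicitly with:
 
 .. parsed-literal::
 
    package gpu 1 binsize 5.25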
 
 The *split* keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < *split* <
 1.0, a fixed fraction of particles is offloaded to the GPU while force
 calculation for the other particles occurs simultaneously on the CPU.
 If *split* < 0.0, the optimal fraction (based on CPU and GPU timings)
 is calculated every 25 timesteps, i.e. dynamic load-balancing across
 the CPU and GPU is performed.  If *split* = 1.0, all force
 calculations for GPU accelerated pair styles are performed on the GPU.
 In this case, other :doc:`hybrid <pair_hybrid>` pair interactions,
 :doc:`bond <bond_style>`, :doc:`angle <angle_style>`,
 :doc:`dihedral <dihedral_style>`, :doc:`improper <improper_style>`, and
 :doc:`long-range <kspace_style>` calculations can be performed on the
 CPU while the GPU is performing force calculations for the GPU-enabled
 pair style.  If all CPU force computations complete before the GPU
 completes, LAMMPS will block until the GPU has finished before
 continuing the timestep.
 
 As an example, if you have two GPUs per node and 8 CPU cores per node,
 and would like to run on 4 nodes (32 cores) with dynamic balancing of
 force calculation across CPU and GPU cores, you could specify
 
 .. parsed-literal::
 
    mpirun -np 32 -sf gpu -in in.script    # launch command
    package gpu 2 split -1                 # input script command
 
 In this case, all CPU cores and GPU devices on the nodes would be
 utilized.  Each GPU device would be shared by 4 CPU cores. The CPU
 cores would perform force calculations for some fraction of the
 particles at the same time the GPUs performed force calculation for
 the other particles.
 
 The *gpuID* keyword allows selection of which GPUs on each node will
 be used for a simulation.  The *first* and *last* values specify the
 GPU IDs to use (from 0 to Ngpu-1).  By default, first = 0 and last =
 Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
 of physical GPUs.  If you only wish to use a subset, set Ngpu to a
 smaller number and first/last to a sub-range of the available GPUs.
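 
 For example, on hypothetical nodes with four physical GPUs, a run that
 uses only the last two of them could be set up as:
 
 .. parsed-literal::
 
    package gpu 2 gpuID 2 3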
 
 The *tpa* keyword sets the number of GPU threads per atom used to
 perform force calculations.  With a default value of 1, the number of
 threads will be chosen based on the pair style; however, the value can
 be set explicitly with this keyword to fine-tune performance.  For
 large cutoffs or with a small number of particles per GPU, increasing
 the value can improve performance. The number of threads per atom must
 be a power of 2 and currently cannot be greater than 32.
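 
 As an illustration, a run with a long pairwise cutoff might try a
 larger value (a tuning guess, not a recommendation):
 
 .. parsed-literal::
 
    package gpu 1 tpa 8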
 
 The *device* keyword can be used to tune parameters optimized for a
 specific accelerator, when using OpenCL.  For CUDA, the *device*
 keyword is ignored.  Currently, the device type is limited to NVIDIA
 Kepler, NVIDIA Fermi, AMD Cypress, or a generic device.  More devices
 may be added later.  The default device type can be specified when
 building LAMMPS with the GPU library, via settings in the
 lib/gpu/Makefile that is used.
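 
 For example, an OpenCL build running on Fermi-class hardware could
 select the matching parameter set with:
 
 .. parsed-literal::
 
    package gpu 1 device fermi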
 
 The *blocksize* keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
 and its maximum depends on the specific GPU hardware. Typical choices
 are 64, 128, or 256. A larger blocksize increases occupancy of
 individual GPU cores, but reduces the total number of thread blocks,
 and thus may lead to load imbalance.
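 
 For example, a tuning experiment with a larger thread block size might
 use:
 
 .. parsed-literal::
 
    package gpu 1 blocksize 128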
 
 
 ----------
 
 
 The *intel* style invokes settings associated with the use of the
 USER-INTEL package.  All of its settings, except the *omp* and *mode*
 keywords, are ignored if LAMMPS was not built with Xeon Phi
 coprocessor support.  All of its settings, including the *omp* and
 *mode* keywords, are applicable if LAMMPS was built with coprocessor
 support.
 
 The *Nphi* argument sets the number of coprocessors per node.
 This can be set to any value, including 0, if LAMMPS was not
 built with coprocessor support.
 
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
 The *omp* keyword determines the number of OpenMP threads allocated
 for each MPI task when any portion of the interactions computed by a
 USER-INTEL pair style is run on the CPU.  This can be the case even
 if LAMMPS was built with coprocessor support; see the *balance*
 keyword discussion below.  If you are running with fewer MPI tasks per
 node than there are CPU cores, it can be advantageous to use OpenMP
 threading on the CPUs.
 
 .. note::
 
    The *omp* keyword has nothing to do with coprocessor threads on
    the Xeon Phi; see the *tpc* and *tptask* keywords below for a
    discussion of coprocessor threads.
 
 The *Nthread* value for the *omp* keyword sets the number of OpenMP
 threads allocated for each MPI task.  Setting *Nthread* = 0 (the
 default) instructs LAMMPS to use whatever value is the default for the
 given OpenMP environment. This is usually determined via the
 *OMP_NUM_THREADS* environment variable or the compiler runtime, and is
 typically 1.
 
 For more details, including examples of how to set the OMP_NUM_THREADS
 environment variable, see the discussion of the *Nthreads* setting on
 this doc page for the "package omp" command.  Nthreads is a required
 argument for the USER-OMP package.  Its meaning is exactly the same
 for the USER-INTEL package.
 
 .. note::
 
    If you build LAMMPS with both the USER-INTEL and USER-OMP
    packages, be aware that both packages allow setting of the *Nthreads*
    value via their package commands, but there is only a single global
    *Nthreads* value used by OpenMP.  Thus if both package commands are
    invoked, you should ensure the two values are consistent.  If they are
    not, the last one invoked will take precedence, for both packages.
    Also note that if the "-sf hybrid intel omp" :ref:`command-line switch <start_7>` is used, it invokes a "package
    intel" command, followed by a "package omp" command, both with a
    setting of *Nthreads* = 0.
 
 The *mode* keyword determines the precision mode to use for
 computing pair style forces, either on the CPU or on the coprocessor,
 when using a USER-INTEL supported :doc:`pair style <pair_style>`.  It
 can take a value of *single*, *mixed* (which is the default), or
 *double*.  *Single* means single precision is used for the entire
 force calculation.  *Mixed* means forces between a pair of atoms are
 computed in single precision, but accumulated and stored in double
 precision, including storage of forces, torques, energies, and virial
 quantities.  *Double* means double precision is used for the entire
 force calculation.
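 
 For example, a validation run that forces full double precision could
 use:
 
 .. parsed-literal::
 
    package intel 1 mode double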
 
 The *balance* keyword sets the fraction of :doc:`pair style <pair_style>` work offloaded to the coprocessor for split
 values between 0.0 and 1.0 inclusive.  While this fraction of work is
 running on the coprocessor, other calculations will run on the host,
 including neighbor and pair calculations that are not offloaded, as
 well as angle, bond, dihedral, kspace, and some MPI communications.
 If *split* is set to -1, the fraction of work is dynamically adjusted
 automatically throughout the run.  This typically gives performance
 within 5 to 10 percent of the optimal fixed fraction.
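 
 For example, to offload a fixed 75% of the pair style work, or to let
 the fraction be adjusted dynamically, one could use (values are
 illustrative):
 
 .. parsed-literal::
 
    package intel 1 balance 0.75
    package intel 1 balance -1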
 
 The *ghost* keyword determines whether or not ghost atoms, i.e. atoms
 at the boundaries of processor sub-domains, are offloaded for neighbor
 and force calculations.  When the value = "no", ghost atoms are not
 offloaded.  This option can reduce the amount of data transfer with
 the coprocessor and can also overlap MPI communication of forces with
 computation on the coprocessor when the :doc:`newton pair <newton>`
 setting is "on".  When the value = "yes", ghost atoms are offloaded.
 In some cases this can provide better performance, especially if the
 *balance* fraction is high.
 
 The *tpc* keyword sets the max # of coprocessor threads *Ntpc* that
 will run on each core of the coprocessor.  The default value = 4,
 which is the number of hardware threads per core supported by the
 current generation Xeon Phi chips.
 
 The *tptask* keyword sets the max # of coprocessor threads *Ntptask*
 assigned to each MPI task.  The default value = 240, which is the
 total # of threads an entire current generation Xeon Phi chip can run
 (240 = 60 cores * 4 threads/core).  This means each MPI task assigned
 to the Phi will have enough threads for the chip to run the max allowed,
 even if only 1 MPI task is assigned.  If 8 MPI tasks are assigned to
 the Phi, each will run with 30 threads.  If you wish to limit the
 number of threads per MPI task, set *tptask* to a smaller value.
 E.g. for *tptask* = 16, if 8 MPI tasks are assigned, each will run
 with 16 threads, for a total of 128.
 
 Note that the default settings for *tpc* and *tptask* are fine for
 most problems, regardless of how many MPI tasks you assign to a Phi.
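 
 For the 8-task example above, that limit would be set with:
 
 .. parsed-literal::
 
    package intel 1 tptask 16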
 
 The *no_affinity* keyword will turn off automatic setting of core
 affinity for MPI tasks and OpenMP threads on the host when using
 offload to a coprocessor. Affinity settings are used when possible 
 to prevent MPI tasks and OpenMP threads from being on separate NUMA 
 domains and to prevent offload threads from interfering with other 
 processes/threads used for LAMMPS.
 
 
 ----------
 
 
 The *kokkos* style invokes settings associated with the use of the
 KOKKOS package.
 
 All of the settings are optional keyword/value pairs.  Each has a
 default value as listed below.
 
 The *neigh* keyword determines how neighbor lists are built.  A value
-of *half* uses half-neighbor lists, the same as used by most pair
-styles in LAMMPS.  A value of *half/thread* uses a thread-safe variant
-of the half-neighbor list.  It should be used instead of *half* when
-running with more than 1 threads per MPI task on a CPU.  A value of
+of *half* uses a thread-safe variant of half-neighbor lists,
+the same as used by most pair styles in LAMMPS.  A value of
 *n2* uses an O(N^2) algorithm to build the neighbor list without
 binning, where N = # of atoms on a processor.  It is typically slower
 than the other methods, which use binning.
 
 A value of *full* uses a full neighbor list and is the default.  This
 performs twice as much computation as the *half* option, but that is
 often a win because it is thread-safe and does not require atomic
 operations in the calculation of pair forces.  However, when running in
 MPI-only mode with 1 thread per MPI task, *half* neighbor lists will
 typically be faster, just as they are for non-accelerated pair styles.
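 
 For example, an MPI-only run (1 thread per MPI task) could request half
 neighbor lists explicitly with:
 
 .. parsed-literal::
 
    package kokkos neigh half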
 
 A value of *full/cluster* is an experimental neighbor style, where
 particles interact with all particles within a small cluster, if at
 least one of the cluster's particles is within the neighbor cutoff
 range.  This potentially allows for better vectorization on
 architectures such as the Intel Phi.  It also reduces the size of the
 neighbor list by roughly a factor of the cluster size, thus reducing
 the total memory footprint considerably.
 
 The *newton* keyword sets the Newton flags for pairwise and bonded
 interactions to *off* or *on*, the same as the :doc:`newton <newton>`
 command allows.  The default is *off* because this will almost always
 give better performance for the KOKKOS package.  This means more
 computation is done, but less communication.  However, when running in
 MPI-only mode with 1 thread per MPI task, a value of *on* will
 typically be faster, just as it is for non-accelerated pair styles.
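 
 For such an MPI-only run, both settings can be combined, e.g.:
 
 .. parsed-literal::
 
    package kokkos neigh half newton on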
 
 The *binsize* keyword sets the size of bins used to bin atoms in
 neighbor list builds.  The same value can be set by the :doc:`neigh_modify binsize <neigh_modify>` command.  Making it an option in the
 package kokkos command allows it to be set from the command line.  The
 default value is 0.0, which means the LAMMPS default will be used,
 which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
 distance.  This is fine when neighbor lists are built on the CPU.  For
 GPU builds, a 2x larger binsize, equal to the pairwise cutoff +
 neighbor skin distance, is often faster and can be set with this keyword.
 Note that if you use a longer-than-usual pairwise cutoff, e.g. to
 allow for a smaller fraction of KSpace work with a :doc:`long-range Coulombic solver <kspace_style>` because the GPU is faster at
 performing pairwise interactions, then this rule of thumb may give too
 large a binsize.
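 
 For example, with a hypothetical 2.5 sigma cutoff and 0.3 sigma skin in
 LJ units, a GPU run could request the larger bin size explicitly:
 
 .. parsed-literal::
 
    package kokkos binsize 2.8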
 
 The *comm* and *comm/exchange* and *comm/forward* keywords determine
 whether the host or device performs the packing and unpacking of data
 when communicating per-atom data between processors.  "Exchange"
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
 "Forward" communication happens every timestep.  The data is for atom
 coordinates and any other atom properties that need to be updated for
 ghost atoms owned by each processor.
 
 The *comm* keyword is simply a short-cut to set the same value
 for both the *comm/exchange* and *comm/forward* keywords.
 
 The value options for all 3 keywords are *no* or *host* or *device*.
 A value of *no* means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication.  A value of *host* means
 to use the host, typically a multi-core CPU, and perform the
 packing/unpacking in parallel with threads.  A value of *device* means
 to use the device, typically a GPU, to perform the packing/unpacking
 operation.
 
 The optimal choice for these keywords depends on the input script and
 the hardware used.  The *no* value is useful for verifying that the
 Kokkos-based *host* and *device* values are working correctly.  It may
 also be the fastest choice when using Kokkos styles in MPI-only mode
 (i.e. with a thread count of 1).
 
 When running on CPUs or Xeon Phi, the *host* and *device* values work
 identically.  When using GPUs, the *device* value will typically be
 optimal if all of your styles used in your input script are supported
 by the KOKKOS package.  In this case data can stay on the GPU for many
 timesteps without being moved between the host and GPU, if you use the
 *device* value.  This requires that your MPI is able to access GPU
 memory directly.  Currently that is true for OpenMPI 1.8 (or later
 versions), Mvapich2 1.9 (or later), and CrayMPI.  If your script uses
 styles (e.g. fixes) which are not yet supported by the KOKKOS package,
 then data has to be moved between the host and device anyway, so it is
 typically faster to let the host handle communication, by using the
 *host* value.  Using *host* instead of *no* will enable use of
 multiple threads to pack/unpack communicated data.
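 
 For example, a GPU run whose input script uses some non-KOKKOS fixes
 might prefer (illustrative only):
 
 .. parsed-literal::
 
    package kokkos comm host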
 
 
 ----------
 
 
 The *omp* style invokes settings associated with the use of the
 USER-OMP package.
 
 The *Nthread* argument sets the number of OpenMP threads allocated for
 each MPI task.  For example, if your system has nodes with dual
 quad-core processors, it has a total of 8 cores per node.  You could
 use two MPI tasks per node (e.g. using the -ppn option of the mpirun
 command in MPICH or -npernode in OpenMPI), and set *Nthreads* = 4.
 This would use all 8 cores on each node.  Note that the product of MPI
 tasks * threads/task should not exceed the physical number of cores
 (on a node), otherwise performance will suffer.
 
 Setting *Nthread* = 0 instructs LAMMPS to use whatever value is the
 default for the given OpenMP environment. This is usually determined
 via the *OMP_NUM_THREADS* environment variable or the compiler
 runtime.  Note that in most cases the default for OpenMP capable
 compilers is to use one thread for each available CPU core when
 *OMP_NUM_THREADS* is not explicitly set, which can lead to poor
 performance.
 
 Here are examples of how to set the environment variable when
 launching LAMMPS:
 
 .. parsed-literal::
 
    env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
    env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
    mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script
 
 or you can set it permanently in your shell's start-up script.  
 All three of these examples use a total of 4 CPU cores.
 
 Note that different MPI implementations have different ways of passing
 the OMP_NUM_THREADS environment variable to all MPI processes.  The
 2nd example line above is for MPICH; the 3rd example line with -x is
 for OpenMPI.  Check your MPI documentation for additional details.
 
 What combination of threads and MPI tasks gives the best performance
 is difficult to predict and can depend on many components of your
 input.  Not all features of LAMMPS support OpenMP threading via the
 USER-OMP package, and the parallel efficiency can also vary
 considerably.
 
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
 The *neigh* keyword specifies whether neighbor list building will be
 multi-threaded in addition to force calculations.  If *neigh* is set
 to *no*, then neighbor list calculation is performed only by MPI tasks
 with no OpenMP threading.  If *neigh* is *yes* (the default), a
 multi-threaded neighbor list build is used.  Using *neigh* = *yes* is
 almost always faster and should produce identical neighbor lists at the
 expense of using more memory.  Specifically, neighbor list pages are
 allocated for all threads at the same time and each thread works
 within its own pages.
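 
 For example, to run with 4 threads per MPI task but fall back to a
 serial neighbor list build (e.g. when comparing against a non-threaded
 run), one could use:
 
 .. parsed-literal::
 
    package omp 4 neigh no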
 
 
 ----------
 
 
 Restrictions
 """"""""""""
 
 
 This command cannot be used after the simulation box is defined by a
 :doc:`read_data <read_data>` or :doc:`create_box <create_box>` command.
 
 The cuda style of this command can only be invoked if LAMMPS was built
 with the USER-CUDA package.  See the :ref:`Making LAMMPS <start_3>` section for more info.
 
 The gpu style of this command can only be invoked if LAMMPS was built
 with the GPU package.  See the :ref:`Making LAMMPS <start_3>` section for more info.
 
 The intel style of this command can only be invoked if LAMMPS was
 built with the USER-INTEL package.  See the :ref:`Making LAMMPS <start_3>` section for more info.
 
 The kokkos style of this command can only be invoked if LAMMPS was built
 with the KOKKOS package.  See the :ref:`Making LAMMPS <start_3>` section for more info.
 
 The omp style of this command can only be invoked if LAMMPS was built
 with the USER-OMP package.  See the :ref:`Making LAMMPS <start_3>` section for more info.
 
 Related commands
 """"""""""""""""
 
 :doc:`suffix <suffix>`, "-pk" :ref:`command-line setting <start_7>`
 
 Default
 """""""
 
 For the USER-CUDA package, the default is Ngpu = 1 and the option
 defaults are newton = off, gpuID = 0 to Ngpu-1, timing = not enabled,
 test = not enabled, and thread = auto.  These settings are made
 automatically by the required "-c on" :ref:`command-line switch <start_7>`.  You can change them by using the
 package cuda command in your input script or via the "-pk cuda"
 :ref:`command-line switch <start_7>`.
 
 For the GPU package, the default is Ngpu = 1 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
 to Ngpu-1, tpa = 1, and device = not used.  These settings are made
 automatically if the "-sf gpu" :ref:`command-line switch <start_7>` is used.  If it is not used, you
 must invoke the package gpu command in your input script or via the
 "-pk gpu" :ref:`command-line switch <start_7>`.
 
 For the USER-INTEL package, the default is Nphi = 1 and the option
 defaults are omp = 0, mode = mixed, balance = -1, tpc = 4, tptask =
 240.  The default ghost option is determined by the pair style being
 used.  This value is output to the screen in the offload report at the
 end of each run.  Note that all of these settings, except "omp" and
 "mode", are ignored if LAMMPS was not built with Xeon Phi coprocessor
 support.  These settings are made automatically if the "-sf intel"
 :ref:`command-line switch <start_7>` is used.  If it is
 not used, you must invoke the package intel command in your input
 script or via the "-pk intel" :ref:`command-line switch <start_7>`.
 
 For the KOKKOS package, the option defaults are neigh = full, newton =
 off, binsize = 0.0, and comm = device.  These settings are made
 automatically by the required "-k on" :ref:`command-line switch <start_7>`.  You can change them by using the
 package kokkos command in your input script or via the "-pk kokkos"
 :ref:`command-line switch <start_7>`.
 
 For the USER-OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
 the "-sf omp" :ref:`command-line switch <start_7>` is
 used.  If it is not used, you must invoke the package omp command in
 your input script or via the "-pk omp" :ref:`command-line switch <start_7>`.
 
 
 .. _lws: http://lammps.sandia.gov
 .. _ld: Manual.html
 .. _lc: Section_commands.html#comm
diff --git a/doc/package.html b/doc/package.html
index f79f993d3..9265e93f1 100644
--- a/doc/package.html
+++ b/doc/package.html
@@ -1,772 +1,769 @@
 
 
 <!DOCTYPE html>
 <!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
 <!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
 <head>
   <meta charset="utf-8">
   
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
   <title>package command &mdash; LAMMPS documentation</title>
   
 
   
   
 
   
 
   
   
     
 
   
 
   
   
     <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   
 
   
     <link rel="stylesheet" href="_static/sphinxcontrib-images/LightBox2/lightbox2/css/lightbox.css" type="text/css" />
   
 
   
     <link rel="top" title="LAMMPS documentation" href="index.html"/> 
 
   
   <script src="_static/js/modernizr.min.js"></script>
 
 </head>
 
 <body class="wy-body-for-nav" role="document">
 
   <div class="wy-grid-for-nav">
 
     
     <nav data-toggle="wy-nav-shift" class="wy-nav-side">
       <div class="wy-side-nav-search">
         
 
         
           <a href="Manual.html" class="icon icon-home"> LAMMPS
         
 
         
         </a>
 
         
 <div role="search">
   <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
     <input type="text" name="q" placeholder="Search docs" />
     <input type="hidden" name="check_keywords" value="yes" />
     <input type="hidden" name="area" value="default" />
   </form>
 </div>
 
         
       </div>
 
       <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
         
           
           
               <ul>
 <li class="toctree-l1"><a class="reference internal" href="Section_intro.html">1. Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_start.html">2. Getting Started</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_commands.html">3. Commands</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_packages.html">4. Packages</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_accelerate.html">5. Accelerating LAMMPS performance</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_howto.html">6. How-to discussions</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_example.html">7. Example problems</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_perf.html">8. Performance &amp; scalability</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_tools.html">9. Additional tools</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_modify.html">10. Modifying &amp; extending LAMMPS</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_python.html">11. Python interface to LAMMPS</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_errors.html">12. Errors</a></li>
 <li class="toctree-l1"><a class="reference internal" href="Section_history.html">13. Future and history</a></li>
 </ul>
 
           
         
       </div>
       &nbsp;
     </nav>
 
     <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
 
       
       <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
         <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
         <a href="Manual.html">LAMMPS</a>
       </nav>
 
 
       
       <div class="wy-nav-content">
         <div class="rst-content">
           <div role="navigation" aria-label="breadcrumbs navigation">
   <ul class="wy-breadcrumbs">
     <li><a href="Manual.html">Docs</a> &raquo;</li>
       
     <li>package command</li>
       <li class="wy-breadcrumbs-aside">
         
           
             <a href="http://lammps.sandia.gov">Website</a>
             <a href="Section_commands.html#comm">Commands</a>
         
       </li>
   </ul>
   <hr/>
   
 </div>
           <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
            <div itemprop="articleBody">
             
   <div class="section" id="package-command">
 <span id="index-0"></span><h1>package command<a class="headerlink" href="#package-command" title="Permalink to this headline">¶</a></h1>
 <div class="section" id="syntax">
 <h2>Syntax<a class="headerlink" href="#syntax" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-python"><div class="highlight"><pre>package style args
 </pre></div>
 </div>
 <ul class="simple">
 <li>style = <em>cuda</em> or <em>gpu</em> or <em>intel</em> or <em>kokkos</em> or <em>omp</em></li>
 <li>args = arguments specific to the style</li>
 </ul>
 <pre class="literal-block">
 <em>cuda</em> args = Ngpu keyword value ...
   Ngpu = # of GPUs per node
   zero or more keyword/value pairs may be appended
   keywords = <em>newton</em> or <em>gpuID</em> or <em>timing</em> or <em>test</em> or <em>thread</em>
     <em>newton</em> = <em>off</em> or <em>on</em>
       off = set Newton pairwise and bonded flags off (default)
       on = set Newton pairwise and bonded flags on
     <em>gpuID</em> values = gpu1 .. gpuN
       gpu1 .. gpuN = IDs of the Ngpu GPUs to use
     <em>timing</em> values = none
     <em>test</em> values = id
       id = atom-ID of a test particle
     <em>thread</em> = auto or tpa or bpa
       auto = test whether tpa or bpa is faster
       tpa = one thread per atom
       bpa = one block per atom
 <em>gpu</em> args = Ngpu keyword value ...
   Ngpu = # of GPUs per node
   zero or more keyword/value pairs may be appended
   keywords = <em>neigh</em> or <em>newton</em> or <em>binsize</em> or <em>split</em> or <em>gpuID</em> or <em>tpa</em> or <em>device</em> or <em>blocksize</em>
     <em>neigh</em> value = <em>yes</em> or <em>no</em>
       yes = neighbor list build on GPU (default)
       no = neighbor list build on CPU
     <em>newton</em> = <em>off</em> or <em>on</em>
       off = set Newton pairwise flag off (default and required)
       on = set Newton pairwise flag on (currently not allowed)
     <em>binsize</em> value = size
       size = bin size for neighbor list construction (distance units)
     <em>split</em> = fraction
       fraction = fraction of atoms assigned to GPU (default = 1.0)
     <em>gpuID</em> values = first last
       first = ID of first GPU to be used on each node
       last = ID of last GPU to be used on each node
     <em>tpa</em> value = Nthreads
       Nthreads = # of GPU threads used per atom
     <em>device</em> value = device_type
       device_type = <em>kepler</em> or <em>fermi</em> or <em>cypress</em> or <em>generic</em>
     <em>blocksize</em> value = size
       size = thread block size for pair force computation
 <em>intel</em> args = NPhi keyword value ...
   Nphi = # of coprocessors per node
   zero or more keyword/value pairs may be appended
   keywords = <em>omp</em> or <em>mode</em> or <em>balance</em> or <em>ghost</em> or <em>tpc</em> or <em>tptask</em> or <em>no_affinity</em>
     <em>omp</em> value = Nthreads
       Nthreads = number of OpenMP threads to use on CPU (default = 0)
     <em>mode</em> value = <em>single</em> or <em>mixed</em> or <em>double</em>
       single = perform force calculations in single precision
       mixed = perform force calculations in mixed precision
       double = perform force calculations in double precision
     <em>balance</em> value = split
       split = fraction of work to offload to coprocessor, -1 for dynamic
     <em>ghost</em> value = <em>yes</em> or <em>no</em>
       yes = include ghost atoms for offload
       no = do not include ghost atoms for offload
     <em>tpc</em> value = Ntpc
       Ntpc = max number of coprocessor threads per coprocessor core (default = 4)
     <em>tptask</em> value = Ntptask
       Ntptask = max number of coprocessor threads per MPI task (default = 240)
     <em>no_affinity</em> values = none
 <em>kokkos</em> args = keyword value ...
   zero or more keyword/value pairs may be appended
   keywords = <em>neigh</em> or <em>newton</em> or <em>binsize</em> or <em>comm</em> or <em>comm/exchange</em> or <em>comm/forward</em>
-    <em>neigh</em> value = <em>full</em> or <em>half/thread</em> or <em>half</em> or <em>n2</em> or <em>full/cluster</em>
+    <em>neigh</em> value = <em>full</em> or <em>half</em> or <em>n2</em> or <em>full/cluster</em>
       full = full neighbor list
-      half/thread = half neighbor list built in thread-safe manner
-      half = half neighbor list, not thread-safe, only use when 1 thread/MPI task
+      half = half neighbor list built in thread-safe manner
       n2 = non-binning neighbor list build, O(N^2) algorithm
       full/cluster = full neighbor list with clustered groups of atoms
     <em>newton</em> = <em>off</em> or <em>on</em>
       off = set Newton pairwise and bonded flags off (default)
       on = set Newton pairwise and bonded flags on
     <em>binsize</em> value = size
       size = bin size for neighbor list construction (distance units)
     <em>comm</em> value = <em>no</em> or <em>host</em> or <em>device</em>
       use value for both comm/exchange and comm/forward
     <em>comm/exchange</em> value = <em>no</em> or <em>host</em> or <em>device</em>
     <em>comm/forward</em> value = <em>no</em> or <em>host</em> or <em>device</em>
       no = perform communication pack/unpack in non-KOKKOS mode
       host = perform pack/unpack on host (e.g. with OpenMP threading)
       device = perform pack/unpack on device (e.g. on GPU)
 <em>omp</em> args = Nthreads keyword value ...
   Nthread = # of OpenMP threads to associate with each MPI process
   zero or more keyword/value pairs may be appended
   keywords = <em>neigh</em>
     <em>neigh</em> value = <em>yes</em> or <em>no</em>
       yes = threaded neighbor list build (default)
       no = non-threaded neighbor list build
 </pre>
 </div>
 <div class="section" id="examples">
 <h2>Examples<a class="headerlink" href="#examples" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-python"><div class="highlight"><pre>package gpu 1
 package gpu 1 split 0.75
 package gpu 2 split -1.0
 package cuda 2 gpuID 0 2
 package cuda 1 test 3948
-package kokkos neigh half/thread comm device
+package kokkos neigh half comm device
 package omp 0 neigh no
 package omp 4
 package intel 1
 package intel 2 omp 4 mode mixed balance 0.5
 </pre></div>
 </div>
 </div>
 <div class="section" id="description">
 <h2>Description<a class="headerlink" href="#description" title="Permalink to this headline">¶</a></h2>
 <p>This command invokes package-specific settings for the various
 accelerator packages available in LAMMPS.  Currently the following
 packages use settings from this command: USER-CUDA, GPU, USER-INTEL,
 KOKKOS, and USER-OMP.</p>
 <p>If this command is specified in an input script, it must be near the
 top of the script, before the simulation box has been defined.  This
 is because it specifies settings that the accelerator packages use in
 their initialization, before a simulation is defined.</p>
 <p>This command can also be specified from the command-line when
 launching LAMMPS, using the &#8220;-pk&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.  The syntax is exactly the same as
 when used in an input script.</p>
 <p>Note that all of the accelerator packages require the package command
 to be specified (except the OPT package), if the package is to be used
 in a simulation (LAMMPS can be built with an accelerator package
 without using it in a particular simulation).  However, in all cases,
 a default version of the command is typically invoked by other
 accelerator settings.</p>
 <p>The USER-CUDA and KOKKOS packages require a &#8220;-c on&#8221; or &#8220;-k on&#8221;
 <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> respectively, which
 invokes a &#8220;package cuda&#8221; or &#8220;package kokkos&#8221; command with default
 settings.</p>
 <p>For the GPU, USER-INTEL, and USER-OMP packages, if a &#8220;-sf gpu&#8221; or &#8220;-sf
 intel&#8221; or &#8220;-sf omp&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>
 is used to auto-append accelerator suffixes to various styles in the
 input script, then those switches also invoke a &#8220;package gpu&#8221;,
 &#8220;package intel&#8221;, or &#8220;package omp&#8221; command with default settings.</p>
 <div class="admonition note">
 <p class="first admonition-title">Note</p>
 <p class="last">A package command for a particular style can be invoked multiple
 times when a simulation is setup, e.g. by the &#8220;-c on&#8221;, &#8220;-k on&#8221;, &#8220;-sf&#8221;,
 and &#8220;-pk&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switches</span></a>, and by
 using this command in an input script.  Each time it is used all of
 the style options are set, either to default values or to specified
 settings.  I.e. settings from previous invocations do not persist
 across multiple invocations.</p>
 </div>
 <p>See the <a class="reference internal" href="Section_accelerate.html"><em>Section Accelerate</em></a> section of the
 manual for more details about using the various accelerator packages
 for speeding up LAMMPS simulations.</p>
 <hr class="docutils" />
 <p>The <em>cuda</em> style invokes settings associated with the use of the
 USER-CUDA package.</p>
 <p>The <em>Ngpus</em> argument sets the number of GPUs per node.  There must be
 exactly one MPI task per GPU, as set by the mpirun or mpiexec command.</p>
 <p>Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.</p>
 <p>The <em>newton</em> keyword sets the Newton flags for pairwise and bonded
 interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><em>newton</em></a>
 command allows.  The default is <em>off</em> because this will almost always
 give better performance for the USER-CUDA package.  This means
 more computation is done, but less communication.</p>
 <p>The <em>gpuID</em> keyword allows selection of which GPUs on each node will
 be used for a simulation.  GPU IDs range from 0 to N-1 where N is the
 physical number of GPUs/node.  An ID is specified for each of the
 Ngpus being used.  For example if you have three GPUs on a machine,
 one of which is used for the X-Server (the GPU with the ID 1) while
 the others (with IDs 0 and 2) are used for computations you would
 specify:</p>
 <div class="highlight-python"><div class="highlight"><pre>package cuda 2 gpuID 0 2
 </pre></div>
 </div>
 <p>The purpose of the <em>gpuID</em> keyword is to allow two (or more)
 simulations to be run on one workstation.  In that case one could set
 the first simulation to use GPU 0 and the second to use GPU 1. This is
 not necessary however, if the GPUs are in what is called <em>compute
 exclusive</em> mode.  Using that setting, every process will get its own
 GPU automatically.  This <em>compute exclusive</em> mode can be set as root
 using the <em>nvidia-smi</em> tool which is part of the CUDA installation.</p>
 <p>Also note that if the <em>gpuID</em> keyword is not used, the USER-CUDA
 package sorts existing GPUs on each node according to their number of
 multiprocessors.  This way, compute GPUs will be prioritized over
 X-Server GPUs.</p>
 <p>If the <em>timing</em> keyword is specified, detailed timing information for
 various subroutines will be output.</p>
 <p>If the <em>test</em> keyword is specified, information for the specified atom
 with atom-ID will be output at several points during each timestep.
 This is mainly useful for debugging purposes.  Note that the
 simulation will slow down dramatically if this option is used.</p>
 <p>The <em>thread</em> keyword can be used to specify how GPU threads are
 assigned work during pair style force evaluation.  If the value =
 <em>tpa</em>, one thread per atom is used.  If the value = <em>bpa</em>, one block
 per atom is used.  If the value = <em>auto</em>, a short test is performed at
 the beginning of each run to determine whether <em>tpa</em> or <em>bpa</em> mode is
 faster.  The result of this test is output.  Since <em>auto</em> is the
 default value, it is usually not necessary to use this keyword.</p>
 <hr class="docutils" />
 <p>The <em>gpu</em> style invokes settings associated with the use of the GPU
 package.</p>
 <p>The <em>Ngpu</em> argument sets the number of GPUs per node.  There must be
 at least as many MPI tasks per node as GPUs, as set by the mpirun or
 mpiexec command.  If there are more MPI tasks (per node)
 than GPUs, multiple MPI tasks will share each GPU.</p>
 <p>Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.</p>
 <p>The <em>neigh</em> keyword specifies where neighbor lists for pair style
 computation will be built.  If <em>neigh</em> is <em>yes</em>, which is the default,
 neighbor list building is performed on the GPU.  If <em>neigh</em> is <em>no</em>,
 neighbor list building is performed on the CPU.  GPU neighbor list
 building currently cannot be used with a triclinic box.  GPU neighbor
 list calculation currently cannot be used with
 <a class="reference internal" href="pair_hybrid.html"><em>hybrid</em></a> pair styles.  GPU neighbor lists are not
 compatible with commands that are not GPU-enabled.  When a non-GPU
 enabled command requires a neighbor list, it will also be built on the
 CPU.  In these cases, it will typically be more efficient to only use
 CPU neighbor list builds.</p>
 <p>The <em>newton</em> keyword sets the Newton flags for pairwise (not bonded)
 interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><em>newton</em></a>
 command allows.  Currently, only an <em>off</em> value is allowed, since all
 the GPU package pair styles require this setting.  This means more
 computation is done, but less communication.  In the future a value of
 <em>on</em> may be allowed, so the <em>newton</em> keyword is included as an option
 for compatibility with the package command for other accelerator
 styles.  Note that the newton setting for bonded interactions is not
 affected by this keyword.</p>
 <p>The <em>binsize</em> keyword sets the size of bins used to bin atoms in
 neighbor list builds performed on the GPU, if <em>neigh</em> = <em>yes</em> is set.
 If <em>binsize</em> is set to 0.0 (the default), then bins = the size of the
 pairwise cutoff + neighbor skin distance.  This is 2x larger than the
 LAMMPS default used for neighbor list building on the CPU.  This will
 be close to optimal for the GPU, so you do not normally need to use
 this keyword.  Note that if you use a longer-than-usual pairwise
 cutoff, e.g. to allow for a smaller fraction of KSpace work with a
 <a class="reference internal" href="kspace_style.html"><em>long-range Coulombic solver</em></a> because the GPU is
 faster at performing pairwise interactions, then it may be optimal to
 make the <em>binsize</em> smaller than the default.  For example, with a
 cutoff of 20*sigma in LJ <a class="reference internal" href="units.html"><em>units</em></a> and a neighbor skin
 distance of sigma, a <em>binsize</em> = 5.25*sigma can be more efficient than
 the default.</p>
 <p>The <em>split</em> keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 &lt; <em>split</em> &lt;
 1.0, a fixed fraction of particles is offloaded to the GPU while force
 calculation for the other particles occurs simultaneously on the CPU.
 If <em>split</em> &lt; 0.0, the optimal fraction (based on CPU and GPU timings)
 is calculated every 25 timesteps, i.e. dynamic load-balancing across
 the CPU and GPU is performed.  If <em>split</em> = 1.0, all force
 calculations for GPU accelerated pair styles are performed on the GPU.
 In this case, other <a class="reference internal" href="pair_hybrid.html"><em>hybrid</em></a> pair interactions,
 <a class="reference internal" href="bond_style.html"><em>bond</em></a>, <a class="reference internal" href="angle_style.html"><em>angle</em></a>,
 <a class="reference internal" href="dihedral_style.html"><em>dihedral</em></a>, <a class="reference internal" href="improper_style.html"><em>improper</em></a>, and
 <a class="reference internal" href="kspace_style.html"><em>long-range</em></a> calculations can be performed on the
 CPU while the GPU is performing force calculations for the GPU-enabled
 pair style.  If all CPU force computations complete before the GPU
 completes, LAMMPS will block until the GPU has finished before
 continuing the timestep.</p>
 <p>As an example, if you have two GPUs per node and 8 CPU cores per node,
 and would like to run on 4 nodes (32 cores) with dynamic balancing of
 force calculation across CPU and GPU cores, you could specify</p>
 <div class="highlight-python"><div class="highlight"><pre>mpirun -np 32 -sf gpu -in in.script    # launch command
 package gpu 2 split -1                 # input script command
 </pre></div>
 </div>
 <p>In this case, all CPU cores and GPU devices on the nodes would be
 utilized.  Each GPU device would be shared by 4 CPU cores. The CPU
 cores would perform force calculations for some fraction of the
 particles at the same time the GPUs performed force calculation for
 the other particles.</p>
 <p>The <em>gpuID</em> keyword allows selection of which GPUs on each node will
 be used for a simulation.  The <em>first</em> and <em>last</em> values specify the
 GPU IDs to use (from 0 to Ngpu-1).  By default, first = 0 and last =
 Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
 of physical GPUs.  If you only wish to use a subset, set Ngpu to a
 smaller number and first/last to a sub-range of the available GPUs.</p>
 <p>The <em>tpa</em> keyword sets the number of GPU threads per atom used to
 perform force calculations.  With a default value of 1, the number of
 threads will be chosen based on the pair style, however, the value can
 be set explicitly with this keyword to fine-tune performance.  For
 large cutoffs or with a small number of particles per GPU, increasing
 the value can improve performance. The number of threads per atom must
 be a power of 2 and currently cannot be greater than 32.</p>
 <p>The <em>device</em> keyword can be used to tune parameters optimized for a
 specific accelerator, when using OpenCL.  For CUDA, the <em>device</em>
 keyword is ignored.  Currently, the device type is limited to NVIDIA
 Kepler, NVIDIA Fermi, AMD Cypress, or a generic device.  More devices
 may be added later.  The default device type can be specified when
 building LAMMPS with the GPU library, via settings in the
 lib/gpu/Makefile that is used.</p>
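<p>For example, when running via OpenCL on Fermi-class hardware you
could request the corresponding tuning parameters with:</p>
<div class="highlight-python"><div class="highlight"><pre>package gpu 1 device fermi    # use parameters tuned for NVIDIA Fermi GPUs
</pre></div>
</div>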
 <p>The <em>blocksize</em> keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
 and its maximum depends on the specific GPU hardware. Typical choices
 are 64, 128, or 256. A larger blocksize increases occupancy of
 individual GPU cores, but reduces the total number of thread blocks,
 thus may lead to load imbalance.</p>
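<p>For example, to try one of the typical block sizes listed above
(whether it helps depends on the GPU hardware and the pair style):</p>
<div class="highlight-python"><div class="highlight"><pre>package gpu 1 blocksize 128    # 128 threads per block for pair force kernels
</pre></div>
</div>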
 <hr class="docutils" />
 <p>The <em>intel</em> style invokes settings associated with the use of the
 USER-INTEL package.  All of its settings, except the <em>omp</em> and <em>mode</em>
 keywords, are ignored if LAMMPS was not built with Xeon Phi
coprocessor support.  All of its settings, including the <em>omp</em> and
<em>mode</em> keywords, are applicable if LAMMPS was built with coprocessor
support.</p>
 <p>The <em>Nphi</em> argument sets the number of coprocessors per node.
 This can be set to any value, including 0, if LAMMPS was not
 built with coprocessor support.</p>
 <p>Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.</p>
 <p>The <em>omp</em> keyword determines the number of OpenMP threads allocated
 for each MPI task when any portion of the interactions computed by a
 USER-INTEL pair style are run on the CPU.  This can be the case even
 if LAMMPS was built with coprocessor support; see the <em>balance</em>
keyword discussion below.  If you are running with fewer MPI tasks per
node than there are CPU cores, it can be advantageous to use OpenMP
threading on the CPUs.</p>
 <div class="admonition note">
 <p class="first admonition-title">Note</p>
 <p class="last">The <em>omp</em> keyword has nothing to do with coprocessor threads on
 the Xeon Phi; see the <em>tpc</em> and <em>tptask</em> keywords below for a
 discussion of coprocessor threads.</p>
 </div>
 <p>The <em>Nthread</em> value for the <em>omp</em> keyword sets the number of OpenMP
 threads allocated for each MPI task.  Setting <em>Nthread</em> = 0 (the
 default) instructs LAMMPS to use whatever value is the default for the
given OpenMP environment. This is typically determined via the
<em>OMP_NUM_THREADS</em> environment variable or the compiler runtime, and
usually defaults to a value of 1.</p>
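<p>For example, to request 4 OpenMP threads per MPI task for the CPU
portion of a USER-INTEL calculation (the thread count is illustrative):</p>
<div class="highlight-python"><div class="highlight"><pre>package intel 1 omp 4    # 4 OpenMP threads per MPI task on the host CPU
</pre></div>
</div>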
 <p>For more details, including examples of how to set the OMP_NUM_THREADS
 environment variable, see the discussion of the <em>Nthreads</em> setting on
 this doc page for the &#8220;package omp&#8221; command.  Nthreads is a required
 argument for the USER-OMP package.  Its meaning is exactly the same
for the USER-INTEL package.</p>
 <div class="admonition note">
 <p class="first admonition-title">Note</p>
 <p class="last">If you build LAMMPS with both the USER-INTEL and USER-OMP
 packages, be aware that both packages allow setting of the <em>Nthreads</em>
 value via their package commands, but there is only a single global
 <em>Nthreads</em> value used by OpenMP.  Thus if both package commands are
invoked, you should ensure the two values are consistent.  If they are
 not, the last one invoked will take precedence, for both packages.
 Also note that if the &#8220;-sf hybrid intel omp&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is used, it invokes a &#8220;package
 intel&#8221; command, followed by a &#8220;package omp&#8221; command, both with a
 setting of <em>Nthreads</em> = 0.</p>
 </div>
 <p>The <em>mode</em> keyword determines the precision mode to use for
 computing pair style forces, either on the CPU or on the coprocessor,
 when using a USER-INTEL supported <a class="reference internal" href="pair_style.html"><em>pair style</em></a>.  It
can take a value of <em>single</em>, <em>mixed</em> (the default), or
 <em>double</em>.  <em>Single</em> means single precision is used for the entire
 force calculation.  <em>Mixed</em> means forces between a pair of atoms are
 computed in single precision, but accumulated and stored in double
 precision, including storage of forces, torques, energies, and virial
 quantities.  <em>Double</em> means double precision is used for the entire
 force calculation.</p>
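<p>For example, to force the entire pair force calculation into double
precision:</p>
<div class="highlight-python"><div class="highlight"><pre>package intel 1 mode double    # full double precision force computation
</pre></div>
</div>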
 <p>The <em>balance</em> keyword sets the fraction of <a class="reference internal" href="pair_style.html"><em>pair style</em></a> work offloaded to the coprocessor for split
 values between 0.0 and 1.0 inclusive.  While this fraction of work is
 running on the coprocessor, other calculations will run on the host,
 including neighbor and pair calculations that are not offloaded, as
 well as angle, bond, dihedral, kspace, and some MPI communications.
 If <em>split</em> is set to -1, the fraction of work is dynamically adjusted
automatically throughout the run.  This typically gives performance
 within 5 to 10 percent of the optimal fixed fraction.</p>
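<p>For example, either a fixed or a dynamically adjusted offload
fraction can be requested (the 0.6 value is illustrative):</p>
<div class="highlight-python"><div class="highlight"><pre>package intel 1 balance 0.6    # offload 60% of the pair work to the coprocessor
package intel 1 balance -1     # adjust the offload fraction dynamically
</pre></div>
</div>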
 <p>The <em>ghost</em> keyword determines whether or not ghost atoms, i.e. atoms
at the boundaries of processor sub-domains, are offloaded for neighbor
 and force calculations.  When the value = &#8220;no&#8221;, ghost atoms are not
 offloaded.  This option can reduce the amount of data transfer with
 the coprocessor and can also overlap MPI communication of forces with
 computation on the coprocessor when the <a class="reference internal" href="newton.html"><em>newton pair</em></a>
 setting is &#8220;on&#8221;.  When the value = &#8220;yes&#8221;, ghost atoms are offloaded.
 In some cases this can provide better performance, especially if the
 <em>balance</em> fraction is high.</p>
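<p>For example, to exclude ghost atoms from offload and allow MPI force
communication to overlap with coprocessor work:</p>
<div class="highlight-python"><div class="highlight"><pre>package intel 1 ghost no    # do not offload ghost atoms to the coprocessor
</pre></div>
</div>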
 <p>The <em>tpc</em> keyword sets the max # of coprocessor threads <em>Ntpc</em> that
 will run on each core of the coprocessor.  The default value = 4,
 which is the number of hardware threads per core supported by the
 current generation Xeon Phi chips.</p>
<p>The <em>tptask</em> keyword sets the max # of coprocessor threads <em>Ntptask</em>
 assigned to each MPI task.  The default value = 240, which is the
 total # of threads an entire current generation Xeon Phi chip can run
 (240 = 60 cores * 4 threads/core).  This means each MPI task assigned
to the Phi will use enough threads for the chip to run the max allowed,
 even if only 1 MPI task is assigned.  If 8 MPI tasks are assigned to
 the Phi, each will run with 30 threads.  If you wish to limit the
 number of threads per MPI task, set <em>tptask</em> to a smaller value.
 E.g. for <em>tptask</em> = 16, if 8 MPI tasks are assigned, each will run
 with 16 threads, for a total of 128.</p>
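<p>The 16-thread limit from that example would be set with:</p>
<div class="highlight-python"><div class="highlight"><pre>package intel 1 tptask 16    # at most 16 coprocessor threads per MPI task
</pre></div>
</div>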
 <p>Note that the default settings for <em>tpc</em> and <em>tptask</em> are fine for
 most problems, regardless of how many MPI tasks you assign to a Phi.</p>
 <p>The <em>no_affinity</em> keyword will turn off automatic setting of core
 affinity for MPI tasks and OpenMP threads on the host when using
 offload to a coprocessor. Affinity settings are used when possible
 to prevent MPI tasks and OpenMP threads from being on separate NUMA
 domains and to prevent offload threads from interfering with other
 processes/threads used for LAMMPS.</p>
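<p>If these automatic settings conflict with affinity control done
elsewhere (e.g. by your MPI launcher), they can be disabled with:</p>
<div class="highlight-python"><div class="highlight"><pre>package intel 1 no_affinity    # do not set core affinity automatically
</pre></div>
</div>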
 <hr class="docutils" />
 <p>The <em>kokkos</em> style invokes settings associated with the use of the
 KOKKOS package.</p>
 <p>All of the settings are optional keyword/value pairs.  Each has a
 default value as listed below.</p>
 <p>The <em>neigh</em> keyword determines how neighbor lists are built.  A value
-of <em>half</em> uses half-neighbor lists, the same as used by most pair
-styles in LAMMPS.  A value of <em>half/thread</em> uses a thread-safe variant
-of the half-neighbor list.  It should be used instead of <em>half</em> when
-running with more than 1 threads per MPI task on a CPU.  A value of
+of <em>half</em> uses a thread-safe variant of half-neighbor lists,
+the same as used by most pair styles in LAMMPS.  A value of
 <em>n2</em> uses an O(N^2) algorithm to build the neighbor list without
 binning, where N = # of atoms on a processor.  It is typically slower
 than the other methods, which use binning.</p>
<p>A value of <em>full</em> uses full neighbor lists and is the default.  This
performs twice as much computation as the <em>half</em> option; however, that
is often a win because it is thread-safe and doesn&#8217;t require atomic
 operations in the calculation of pair forces.  For that reason, <em>full</em>
 is the default setting.  However, when running in MPI-only mode with 1
 thread per MPI task, <em>half</em> neighbor lists will typically be faster,
 just as it is for non-accelerated pair styles.</p>
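<p>For example, an MPI-only run with 1 thread per MPI task could switch
to half lists as discussed above:</p>
<div class="highlight-python"><div class="highlight"><pre>package kokkos neigh half    # half neighbor lists, usually faster in MPI-only mode
</pre></div>
</div>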
 <p>A value of <em>full/cluster</em> is an experimental neighbor style, where
 particles interact with all particles within a small cluster, if at
least one of the cluster&#8217;s particles is within the neighbor cutoff
range.  This potentially allows for better vectorization on
architectures such as the Intel Phi.  It also reduces the size of the
 neighbor list by roughly a factor of the cluster size, thus reducing
 the total memory footprint considerably.</p>
 <p>The <em>newton</em> keyword sets the Newton flags for pairwise and bonded
 interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><em>newton</em></a>
 command allows.  The default is <em>off</em> because this will almost always
 give better performance for the KOKKOS package.  This means more
 computation is done, but less communication.  However, when running in
 MPI-only mode with 1 thread per MPI task, a value of <em>on</em> will
 typically be faster, just as it is for non-accelerated pair styles.</p>
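<p>For example, in MPI-only mode the two settings just discussed can be
combined:</p>
<div class="highlight-python"><div class="highlight"><pre>package kokkos newton on neigh half    # often fastest with 1 thread per MPI task
</pre></div>
</div>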
 <p>The <em>binsize</em> keyword sets the size of bins used to bin atoms in
 neighbor list builds.  The same value can be set by the <a class="reference internal" href="neigh_modify.html"><em>neigh_modify binsize</em></a> command.  Making it an option in the
 package kokkos command allows it to be set from the command line.  The
 default value is 0.0, which means the LAMMPS default will be used,
 which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
 distance.  This is fine when neighbor lists are built on the CPU.  For
 GPU builds, a 2x larger binsize equal to the pairwise cutoff +
 neighbor skin, is often faster, which can be set by this keyword.
 Note that if you use a longer-than-usual pairwise cutoff, e.g. to
 allow for a smaller fraction of KSpace work with a <a class="reference internal" href="kspace_style.html"><em>long-range Coulombic solver</em></a> because the GPU is faster at
 performing pairwise interactions, then this rule of thumb may give too
 large a binsize.</p>
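<p>For example, with a 2.5 sigma cutoff and a 0.3 sigma skin in LJ
units (illustrative values), a GPU build might use the larger bin
size:</p>
<div class="highlight-python"><div class="highlight"><pre>package kokkos binsize 2.8    # bin size = cutoff + skin for GPU neighbor builds
</pre></div>
</div>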
 <p>The <em>comm</em> and <em>comm/exchange</em> and <em>comm/forward</em> keywords determine
 whether the host or device performs the packing and unpacking of data
 when communicating per-atom data between processors.  &#8220;Exchange&#8221;
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
 &#8220;Forward&#8221; communication happens every timestep.  The data is for atom
coordinates and any other atom properties that need to be updated for
 ghost atoms owned by each processor.</p>
 <p>The <em>comm</em> keyword is simply a short-cut to set the same value
 for both the <em>comm/exchange</em> and <em>comm/forward</em> keywords.</p>
 <p>The value options for all 3 keywords are <em>no</em> or <em>host</em> or <em>device</em>.
 A value of <em>no</em> means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication.  A value of <em>host</em> means
 to use the host, typically a multi-core CPU, and perform the
 packing/unpacking in parallel with threads.  A value of <em>device</em> means
 to use the device, typically a GPU, to perform the packing/unpacking
 operation.</p>
 <p>The optimal choice for these keywords depends on the input script and
 the hardware used.  The <em>no</em> value is useful for verifying that the
 Kokkos-based <em>host</em> and <em>device</em> values are working correctly.  It may
 also be the fastest choice when using Kokkos styles in MPI-only mode
 (i.e. with a thread count of 1).</p>
 <p>When running on CPUs or Xeon Phi, the <em>host</em> and <em>device</em> values work
 identically.  When using GPUs, the <em>device</em> value will typically be
 optimal if all of your styles used in your input script are supported
 by the KOKKOS package.  In this case data can stay on the GPU for many
 timesteps without being moved between the host and GPU, if you use the
 <em>device</em> value.  This requires that your MPI is able to access GPU
 memory directly.  Currently that is true for OpenMPI 1.8 (or later
 versions), Mvapich2 1.9 (or later), and CrayMPI.  If your script uses
 styles (e.g. fixes) which are not yet supported by the KOKKOS package,
then data has to be moved between the host and device anyway, so it is
 typically faster to let the host handle communication, by using the
 <em>host</em> value.  Using <em>host</em> instead of <em>no</em> will enable use of
 multiple threads to pack/unpack communicated data.</p>
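<p>For example, either a single setting for all communication or a
per-type mix can be specified (the mixed choice below is only
illustrative):</p>
<div class="highlight-python"><div class="highlight"><pre>package kokkos comm device                               # pack/unpack all communication on the device
package kokkos comm/exchange host comm/forward device    # or mix host and device per communication type
</pre></div>
</div>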
 <hr class="docutils" />
 <p>The <em>omp</em> style invokes settings associated with the use of the
 USER-OMP package.</p>
 <p>The <em>Nthread</em> argument sets the number of OpenMP threads allocated for
 each MPI task.  For example, if your system has nodes with dual
 quad-core processors, it has a total of 8 cores per node.  You could
 use two MPI tasks per node (e.g. using the -ppn option of the mpirun
 command in MPICH or -npernode in OpenMPI), and set <em>Nthreads</em> = 4.
 This would use all 8 cores on each node.  Note that the product of MPI
 tasks * threads/task should not exceed the physical number of cores
 (on a node), otherwise performance will suffer.</p>
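<p>For example, the dual quad-core scenario above (2 MPI tasks per
node, 4 threads per task) could be requested in the input script
with:</p>
<div class="highlight-python"><div class="highlight"><pre>package omp 4    # 4 OpenMP threads per MPI task
</pre></div>
</div>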
 <p>Setting <em>Nthread</em> = 0 instructs LAMMPS to use whatever value is the
 default for the given OpenMP environment. This is usually determined
 via the <em>OMP_NUM_THREADS</em> environment variable or the compiler
 runtime.  Note that in most cases the default for OpenMP capable
 compilers is to use one thread for each available CPU core when
 <em>OMP_NUM_THREADS</em> is not explicitly set, which can lead to poor
 performance.</p>
 <p>Here are examples of how to set the environment variable when
 launching LAMMPS:</p>
 <div class="highlight-python"><div class="highlight"><pre>env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
 env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
 mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script
 </pre></div>
 </div>
 <p>or you can set it permanently in your shell&#8217;s start-up script.
 All three of these examples use a total of 4 CPU cores.</p>
 <p>Note that different MPI implementations have different ways of passing
 the OMP_NUM_THREADS environment variable to all MPI processes.  The
 2nd example line above is for MPICH; the 3rd example line with -x is
 for OpenMPI.  Check your MPI documentation for additional details.</p>
 <p>What combination of threads and MPI tasks gives the best performance
 is difficult to predict and can depend on many components of your
 input.  Not all features of LAMMPS support OpenMP threading via the
USER-OMP package, and the parallel efficiency can be very different,
 too.</p>
 <p>Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.</p>
 <p>The <em>neigh</em> keyword specifies whether neighbor list building will be
 multi-threaded in addition to force calculations.  If <em>neigh</em> is set
 to <em>no</em> then neighbor list calculation is performed only by MPI tasks
with no OpenMP threading.  If <em>neigh</em> is set to <em>yes</em> (the default), a
multi-threaded neighbor list build is used.  Using <em>neigh</em> = <em>yes</em> is
almost always faster and should produce identical neighbor lists at the
 expense of using more memory.  Specifically, neighbor list pages are
 allocated for all threads at the same time and each thread works
 within its own pages.</p>
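<p>For example, to thread only the force computation and build
neighbor lists serially within each MPI task (e.g. to reduce memory
use):</p>
<div class="highlight-python"><div class="highlight"><pre>package omp 4 neigh no    # threaded forces, non-threaded neighbor list build
</pre></div>
</div>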
 </div>
 <hr class="docutils" />
 <div class="section" id="restrictions">
 <h2>Restrictions<a class="headerlink" href="#restrictions" title="Permalink to this headline">¶</a></h2>
 <p>This command cannot be used after the simulation box is defined by a
 <a class="reference internal" href="read_data.html"><em>read_data</em></a> or <a class="reference internal" href="create_box.html"><em>create_box</em></a> command.</p>
 <p>The cuda style of this command can only be invoked if LAMMPS was built
 with the USER-CUDA package.  See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
 <p>The gpu style of this command can only be invoked if LAMMPS was built
 with the GPU package.  See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
 <p>The intel style of this command can only be invoked if LAMMPS was
 built with the USER-INTEL package.  See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
<p>The kokkos style of this command can only be invoked if LAMMPS was built
 with the KOKKOS package.  See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
 <p>The omp style of this command can only be invoked if LAMMPS was built
 with the USER-OMP package.  See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands<a class="headerlink" href="#related-commands" title="Permalink to this headline">¶</a></h2>
 <p><a class="reference internal" href="suffix.html"><em>suffix</em></a>, &#8220;-pk&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line setting</span></a></p>
 </div>
 <div class="section" id="default">
 <h2>Default<a class="headerlink" href="#default" title="Permalink to this headline">¶</a></h2>
 <p>For the USER-CUDA package, the default is Ngpu = 1 and the option
 defaults are newton = off, gpuID = 0 to Ngpu-1, timing = not enabled,
 test = not enabled, and thread = auto.  These settings are made
automatically by the required &#8220;-c on&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.  You can change them by using the
 package cuda command in your input script or via the &#8220;-pk cuda&#8221;
 <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
 <p>For the GPU package, the default is Ngpu = 1 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
 to Ngpu-1, tpa = 1, and device = not used.  These settings are made
 automatically if the &#8220;-sf gpu&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is used.  If it is not used, you
 must invoke the package gpu command in your input script or via the
 &#8220;-pk gpu&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
 <p>For the USER-INTEL package, the default is Nphi = 1 and the option
 defaults are omp = 0, mode = mixed, balance = -1, tpc = 4, tptask =
 240.  The default ghost option is determined by the pair style being
 used.  This value is output to the screen in the offload report at the
 end of each run.  Note that all of these settings, except &#8220;omp&#8221; and
 &#8220;mode&#8221;, are ignored if LAMMPS was not built with Xeon Phi coprocessor
 support.  These settings are made automatically if the &#8220;-sf intel&#8221;
 <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is used.  If it is
 not used, you must invoke the package intel command in your input
script or via the &#8220;-pk intel&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
<p>For the KOKKOS package, the option defaults are neigh = full, newton =
off, binsize = 0.0, and comm = device.  These settings are made
automatically by the required &#8220;-k on&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.  You can change them by using the
 package kokkos command in your input script or via the &#8220;-pk kokkos&#8221;
 <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
 <p>For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
 the &#8220;-sf omp&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is
 used.  If it is not used, you must invoke the package omp command in
 your input script or via the &#8220;-pk omp&#8221; <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
 </div>
 </div>
 
 