diff --git a/bench/GPU/README b/bench/GPU/README index 91dba179b..a85d6302b 100644 --- a/bench/GPU/README +++ b/bench/GPU/README @@ -1,60 +1,61 @@ These are input scripts used to run GPU versions of several of the benchmarks in the top-level bench directory. The results of running these scripts on different machines are shown on the GPU section of the Benchmark page of the LAMMPS WWW site (lammps.sandia.gov/bench). Examples are shown below of how to run these scripts. This assumes you have built 3 executables with both the GPU and USER-CUDA packages installed, e.g. lmp_linux_single lmp_linux_mixed lmp_linux_double The precision (single, mixed, double) refers to the GPU and USER-CUDA package precision. See the README files in the lib/gpu and lib/cuda directories for instructions on how to build the packages with different precisions. The doc/Section_accelerate.html file also has a summary description. ------------------------------------------------------------------------ If the script has "cpu" in its name, it is meant to be run in CPU-only mode. For example: mpirun -np 1 ../lmp_linux_double -c off -v x 8 -v y 8 -v z 8 -v t 100 < in.lj.cpu mpirun -np 12 ../lmp_linux_double -c off -v x 16 -v y 16 -v z 16 -v t 100 < in.lj.cpu The "xyz" settings determine the problem size. The "t" setting determines the number of timesteps. ------------------------------------------------------------------------ If the script has "gpu" in its name, it is meant to be run using the GPU package. For example: mpirun -np 12 ../lmp_linux_single -sf gpu -c off -v g 1 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj.gpu mpirun -np 8 ../lmp_linux_mixed -sf gpu -c off -v g 2 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj.gpu The "xyz" settings determine the problem size. The "t" setting determines the number of timesteps. The "np" setting determines how -many CPUs the problem will be run on, and the "g" settings determines -how many GPUs the problem will run on, i.e. 1 or 2 in this case. You -can use more CPUs than GPUs with the GPU package. +many MPI tasks per compute node the problem will run on, and the "g" +setting determines how many GPUs per compute node the problem will run +on, i.e. 1 or 2 in this case. Note that you can use more MPI tasks +than GPUs (both per compute node) with the GPU package. ------------------------------------------------------------------------ If the script has "cuda" in its name, it is meant to be run using the USER-CUDA package. For example: mpirun -np 1 ../lmp_linux_single -sf cuda -v g 1 -v x 16 -v y 16 -v z 16 -v t 100 < in.lj.cuda mpirun -np 2 ../lmp_linux_double -sf cuda -v g 2 -v x 32 -v y 64 -v z 64 -v t 100 < in.eam.cuda The "xyz" settings determine the problem size. The "t" setting determines the number of timesteps. The "np" setting determines how -many CPUs the problem will be run on, and the "g" setting determines -how many GPUs the problem will run on, i.e. 1 or 2 in this case. You -should make the number of CPUs and number of GPUs equal for the -USER-CUDA package. +many MPI tasks per compute node the problem will run on, and the "g" +setting determines how many GPUs per compute node the problem will run +on, i.e. 1 or 2 in this case. For the USER-CUDA package, the number +of MPI tasks and GPUs (both per compute node) must be equal.
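------------------------------------------------------------------------

To compare the three precision builds on the same GPU benchmark, a
small shell loop can be used.  This is only a sketch: it assumes the
three executables listed above sit in the parent directory, that 12
MPI tasks and 1 GPU per compute node are appropriate for your machine,
and that each run writes the default log.lammps file.

for prec in single mixed double
do
  mpirun -np 12 ../lmp_linux_$prec -sf gpu -c off -v g 1 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj.gpu
  grep "Loop time" log.lammps
done

The "Loop time" line printed at the end of each run is the total
wall-clock time for the run and is the simplest number to compare
across the three precisions.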
diff --git a/doc/Manual.html b/doc/Manual.html index 6dc80d437..576b6b011 100644 --- a/doc/Manual.html +++ b/doc/Manual.html @@ -1,437 +1,437 @@ <HTML> <HEAD> <TITLE>LAMMPS-ICMS Users Manual</TITLE> -<META NAME="docnumber" CONTENT="27 May 2014 version"> +<META NAME="docnumber" CONTENT="29 May 2014 version"> <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories"> <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License."> </HEAD> <BODY> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H1></H1> <CENTER><H3>LAMMPS-ICMS Documentation </H3></CENTER> -<CENTER><H4>27 May 2014 version +<CENTER><H4>29 May 2014 version </H4></CENTER> <H4>Version info: </H4> <P>The LAMMPS "version" is the date when it was released, such as 1 May 2010. LAMMPS is updated continuously. Whenever we fix a bug or add a feature, we release it immediately, and post a notice on <A HREF = "http://lammps.sandia.gov/bug.html">this page of the WWW site</A>. Each dated copy of LAMMPS contains all the features and bug-fixes up to and including that version date. The version date is printed to the screen and logfile every time you run LAMMPS. It is also in the file src/version.h and in the LAMMPS directory name created when you unpack a tarball, and at the top of the first page of the manual (this page). </P> <P>LAMMPS-ICMS is an experimental variant of LAMMPS with additional features made available for testing before they will be submitted for inclusion into the official LAMMPS tree. The source code is based on the official LAMMPS svn repository mirror at the Institute for Computational Molecular Science at Temple University and generally kept up-to-date as much as possible. Sometimes, e.g. when additional development work is needed to adapt the upstream changes into LAMMPS-ICMS it can take longer until synchronization; and occasionally, e.g. in case of the rewrite of the multi-threading support, the development will be halted except for important bugfixes until all features of LAMMPS-ICMS fully compatible with the upstream version or replaced by alternate implementations. </P> <UL><LI>If you browse the HTML doc pages on the LAMMPS WWW site, they always describe the most current version of upstream LAMMPS, but may be missing some new features in LAMMPS-ICMS. <LI>If you browse the HTML doc pages included in your tarball, they describe the version you have, however, not all new features in LAMMPS-ICMS are documented immediately. <LI>The <A HREF = "Manual.pdf">PDF file</A> on the WWW site or in the tarball is updated about once per month. This is because it is large, and we don't want it to be part of every patch. <LI>There is also a <A HREF = "Developer.pdf">Developer.pdf</A> file in the doc directory, which describes the internal structure and algorithms of LAMMPS. </UL> <P>LAMMPS stands for Large-scale Atomic/Molecular Massively Parallel Simulator. </P> <P>LAMMPS is a classical molecular dynamics simulation code designed to run efficiently on parallel computers. It was developed at Sandia National Laboratories, a US Department of Energy facility, with funding from the DOE. It is an open-source code, distributed freely under the terms of the GNU Public License (GPL). 
</P> <P>The primary developers of LAMMPS are <A HREF = "http://www.sandia.gov/~sjplimp">Steve Plimpton</A>, Aidan Thompson, and Paul Crozier who can be contacted at sjplimp,athomps,pscrozi at sandia.gov. The <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> at http://lammps.sandia.gov has more information about the code and its uses. </P> <HR> <P>The LAMMPS documentation is organized into the following sections. If you find errors or omissions in this manual or have suggestions for useful information to add, please send an email to the developers so we can improve the LAMMPS documentation. </P> <P>Once you are familiar with LAMMPS, you may want to bookmark <A HREF = "Section_commands.html#comm">this page</A> at Section_commands.html#comm since it gives quick access to documentation for all LAMMPS commands. </P> <P><A HREF = "Manual.pdf">PDF file</A> of the entire manual, generated by <A HREF = "http://www.easysw.com/htmldoc">htmldoc</A> </P> <OL><LI><A HREF = "Section_intro.html">Introduction</A> <UL> 1.1 <A HREF = "Section_intro.html#intro_1">What is LAMMPS</A> <BR> 1.2 <A HREF = "Section_intro.html#intro_2">LAMMPS features</A> <BR> 1.3 <A HREF = "Section_intro.html#intro_3">LAMMPS non-features</A> <BR> 1.4 <A HREF = "Section_intro.html#intro_4">Open source distribution</A> <BR> 1.5 <A HREF = "Section_intro.html#intro_5">Acknowledgments and citations</A> <BR></UL> <LI><A HREF = "Section_start.html">Getting started</A> <UL> 2.1 <A HREF = "Section_start.html#start_1">What's in the LAMMPS distribution</A> <BR> 2.2 <A HREF = "Section_start.html#start_2">Making LAMMPS</A> <BR> 2.3 <A HREF = "Section_start.html#start_3">Making LAMMPS with optional packages</A> <BR> 2.4 <A HREF = "Section_start.html#start_4">Building LAMMPS via the Make.py script</A> <BR> 2.5 <A HREF = "Section_start.html#start_5">Building LAMMPS as a library</A> <BR> 2.6 <A HREF = "Section_start.html#start_6">Running LAMMPS</A> <BR> 2.7 <A HREF = "Section_start.html#start_7">Command-line options</A> <BR> 2.8 <A HREF = "Section_start.html#start_8">Screen output</A> <BR> 2.9 <A HREF = "Section_start.html#start_9">Tips for users of previous versions</A> <BR></UL> <LI><A HREF = "Section_commands.html">Commands</A> <UL> 3.1 <A HREF = "Section_commands.html#cmd_1">LAMMPS input script</A> <BR> 3.2 <A HREF = "Section_commands.html#cmd_2">Parsing rules</A> <BR> 3.3 <A HREF = "Section_commands.html#cmd_3">Input script structure</A> <BR> 3.4 <A HREF = "Section_commands.html#cmd_4">Commands listed by category</A> <BR> 3.5 <A HREF = "Section_commands.html#cmd_5">Commands listed alphabetically</A> <BR></UL> <LI><A HREF = "Section_packages.html">Packages</A> <UL> 4.1 <A HREF = "Section_packages.html#pkg_1">Standard packages</A> <BR> 4.2 <A HREF = "Section_packages.html#pkg_2">User packages</A> <BR></UL> <LI><A HREF = "Section_accelerate.html">Accelerating LAMMPS performance</A> <UL> 5.1 <A HREF = "Section_accelerate.html#acc_1">Measuring performance</A> <BR> 5.2 <A HREF = "Section_accelerate.html#acc_2">General strategies</A> <BR> 5.3 <A HREF = "Section_accelerate.html#acc_3">Packages with optimized styles</A> <BR> 5.4 <A HREF = "Section_accelerate.html#acc_4">OPT package</A> <BR> 5.5 <A HREF = "Section_accelerate.html#acc_5">USER-OMP package</A> <BR> 5.6 <A HREF = "Section_accelerate.html#acc_6">GPU package</A> <BR> 5.7 <A HREF = "Section_accelerate.html#acc_7">USER-CUDA package</A> <BR> 5.8 <A HREF = "Section_accelerate.html#acc_8">Comparison of GPU and USER-CUDA packages</A> <BR></UL> <LI><A HREF = "Section_howto.html">How-to 
discussions</A> <UL> 6.1 <A HREF = "Section_howto.html#howto_1">Restarting a simulation</A> <BR> 6.2 <A HREF = "Section_howto.html#howto_2">2d simulations</A> <BR> 6.3 <A HREF = "Section_howto.html#howto_3">CHARMM and AMBER force fields</A> <BR> 6.4 <A HREF = "Section_howto.html#howto_4">Running multiple simulations from one input script</A> <BR> 6.5 <A HREF = "Section_howto.html#howto_5">Multi-replica simulations</A> <BR> 6.6 <A HREF = "Section_howto.html#howto_6">Granular models</A> <BR> 6.7 <A HREF = "Section_howto.html#howto_7">TIP3P water model</A> <BR> 6.8 <A HREF = "Section_howto.html#howto_8">TIP4P water model</A> <BR> 6.9 <A HREF = "Section_howto.html#howto_9">SPC water model</A> <BR> 6.10 <A HREF = "Section_howto.html#howto_10">Coupling LAMMPS to other codes</A> <BR> 6.11 <A HREF = "Section_howto.html#howto_11">Visualizing LAMMPS snapshots</A> <BR> 6.12 <A HREF = "Section_howto.html#howto_12">Triclinic (non-orthogonal) simulation boxes</A> <BR> 6.13 <A HREF = "Section_howto.html#howto_13">NEMD simulations</A> <BR> 6.14 <A HREF = "Section_howto.html#howto_14">Finite-size spherical and aspherical particles</A> <BR> 6.15 <A HREF = "Section_howto.html#howto_15">Output from LAMMPS (thermo, dumps, computes, fixes, variables)</A> <BR> 6.16 <A HREF = "Section_howto.html#howto_16">Thermostatting, barostatting, and compute temperature</A> <BR> 6.17 <A HREF = "Section_howto.html#howto_17">Walls</A> <BR> 6.18 <A HREF = "Section_howto.html#howto_18">Elastic constants</A> <BR> 6.19 <A HREF = "Section_howto.html#howto_19">Library interface to LAMMPS</A> <BR> 6.20 <A HREF = "Section_howto.html#howto_20">Calculating thermal conductivity</A> <BR> 6.21 <A HREF = "Section_howto.html#howto_21">Calculating viscosity</A> <BR> 6.22 <A HREF = "howto_22">Calculating a diffusion coefficient</A> <BR></UL> <LI><A HREF = "Section_example.html">Example problems</A> <LI><A HREF = "Section_perf.html">Performance & scalability</A> <LI><A HREF = "Section_tools.html">Additional tools</A> <LI><A HREF = "Section_modify.html">Modifying & extending LAMMPS</A> <UL> 10.1 <A HREF = "Section_modify.html#mod_1">Atom styles</A> <BR> 10.2 <A HREF = "Section_modify.html#mod_2">Bond, angle, dihedral, improper potentials</A> <BR> 10.3 <A HREF = "Section_modify.html#mod_3">Compute styles</A> <BR> 10.4 <A HREF = "Section_modify.html#mod_4">Dump styles</A> <BR> 10.5 <A HREF = "Section_modify.html#mod_5">Dump custom output options</A> <BR> 10.6 <A HREF = "Section_modify.html#mod_6">Fix styles</A> <BR> 10.7 <A HREF = "Section_modify.html#mod_7">Input script commands</A> <BR> 10.8 <A HREF = "Section_modify.html#mod_8">Kspace computations</A> <BR> 10.9 <A HREF = "Section_modify.html#mod_9">Minimization styles</A> <BR> 10.10 <A HREF = "Section_modify.html#mod_10">Pairwise potentials</A> <BR> 10.11 <A HREF = "Section_modify.html#mod_11">Region styles</A> <BR> 10.12 <A HREF = "Section_modify.html#mod_12">Body styles</A> <BR> 10.13 <A HREF = "Section_modify.html#mod_13">Thermodynamic output options</A> <BR> 10.14 <A HREF = "Section_modify.html#mod_14">Variable options</A> <BR> 10.15 <A HREF = "Section_modify.html#mod_15">Submitting new features for inclusion in LAMMPS</A> <BR></UL> <LI><A HREF = "Section_python.html">Python interface</A> <UL> 11.1 <A HREF = "Section_python.html#py_1">Building LAMMPS as a shared library</A> <BR> 11.2 <A HREF = "Section_python.html#py_2">Installing the Python wrapper into Python</A> <BR> 11.3 <A HREF = "Section_python.html#py_3">Extending Python with MPI to run in parallel</A> <BR> 11.4 <A HREF = 
"Section_python.html#py_4">Testing the Python-LAMMPS interface</A> <BR> 11.5 <A HREF = "Section_python.html#py_5">Using LAMMPS from Python</A> <BR> 11.6 <A HREF = "Section_python.html#py_6">Example Python scripts that use LAMMPS</A> <BR></UL> <LI><A HREF = "Section_errors.html">Errors</A> <UL> 12.1 <A HREF = "Section_errors.html#err_1">Common problems</A> <BR> 12.2 <A HREF = "Section_errors.html#err_2">Reporting bugs</A> <BR> 12.3 <A HREF = "Section_errors.html#err_3">Error & warning messages</A> <BR></UL> <LI><A HREF = "Section_history.html">Future and history</A> <UL> 13.1 <A HREF = "Section_history.html#hist_1">Coming attractions</A> <BR> 13.2 <A HREF = "Section_history.html#hist_2">Past versions</A> <BR></UL> </OL> </BODY> </HTML> diff --git a/doc/Manual.txt b/doc/Manual.txt index dd9a4a3b0..52fbbc10a 100644 --- a/doc/Manual.txt +++ b/doc/Manual.txt @@ -1,274 +1,274 @@ <HEAD> <TITLE>LAMMPS-ICMS Users Manual</TITLE> -<META NAME="docnumber" CONTENT="27 May 2014 version"> +<META NAME="docnumber" CONTENT="29 May 2014 version"> <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories"> <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License."> </HEAD> <BODY> "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line <H1></H1> LAMMPS-ICMS Documentation :c,h3 -27 May 2014 version :c,h4 +29 May 2014 version :c,h4 Version info: :h4 The LAMMPS "version" is the date when it was released, such as 1 May 2010. LAMMPS is updated continuously. Whenever we fix a bug or add a feature, we release it immediately, and post a notice on "this page of the WWW site"_bug. Each dated copy of LAMMPS contains all the features and bug-fixes up to and including that version date. The version date is printed to the screen and logfile every time you run LAMMPS. It is also in the file src/version.h and in the LAMMPS directory name created when you unpack a tarball, and at the top of the first page of the manual (this page). LAMMPS-ICMS is an experimental variant of LAMMPS with additional features made available for testing before they will be submitted for inclusion into the official LAMMPS tree. The source code is based on the official LAMMPS svn repository mirror at the Institute for Computational Molecular Science at Temple University and generally kept up-to-date as much as possible. Sometimes, e.g. when additional development work is needed to adapt the upstream changes into LAMMPS-ICMS it can take longer until synchronization; and occasionally, e.g. in case of the rewrite of the multi-threading support, the development will be halted except for important bugfixes until all features of LAMMPS-ICMS fully compatible with the upstream version or replaced by alternate implementations. If you browse the HTML doc pages on the LAMMPS WWW site, they always describe the most current version of upstream LAMMPS, but may be missing some new features in LAMMPS-ICMS. :ulb,l If you browse the HTML doc pages included in your tarball, they describe the version you have, however, not all new features in LAMMPS-ICMS are documented immediately. :l The "PDF file"_Manual.pdf on the WWW site or in the tarball is updated about once per month. This is because it is large, and we don't want it to be part of every patch. 
:l There is also a "Developer.pdf"_Developer.pdf file in the doc directory, which describes the internal structure and algorithms of LAMMPS. :ule,l LAMMPS stands for Large-scale Atomic/Molecular Massively Parallel Simulator. LAMMPS is a classical molecular dynamics simulation code designed to run efficiently on parallel computers. It was developed at Sandia National Laboratories, a US Department of Energy facility, with funding from the DOE. It is an open-source code, distributed freely under the terms of the GNU Public License (GPL). The primary developers of LAMMPS are "Steve Plimpton"_sjp, Aidan Thompson, and Paul Crozier who can be contacted at sjplimp,athomps,pscrozi at sandia.gov. The "LAMMPS WWW Site"_lws at http://lammps.sandia.gov has more information about the code and its uses. :link(bug,http://lammps.sandia.gov/bug.html) :link(sjp,http://www.sandia.gov/~sjplimp) :line The LAMMPS documentation is organized into the following sections. If you find errors or omissions in this manual or have suggestions for useful information to add, please send an email to the developers so we can improve the LAMMPS documentation. Once you are familiar with LAMMPS, you may want to bookmark "this page"_Section_commands.html#comm at Section_commands.html#comm since it gives quick access to documentation for all LAMMPS commands. "PDF file"_Manual.pdf of the entire manual, generated by "htmldoc"_http://www.easysw.com/htmldoc "Introduction"_Section_intro.html :olb,l 1.1 "What is LAMMPS"_intro_1 :ulb,b 1.2 "LAMMPS features"_intro_2 :b 1.3 "LAMMPS non-features"_intro_3 :b 1.4 "Open source distribution"_intro_4 :b 1.5 "Acknowledgments and citations"_intro_5 :ule,b "Getting started"_Section_start.html :l 2.1 "What's in the LAMMPS distribution"_start_1 :ulb,b 2.2 "Making LAMMPS"_start_2 :b 2.3 "Making LAMMPS with optional packages"_start_3 :b 2.4 "Building LAMMPS via the Make.py script"_start_4 :b 2.5 "Building LAMMPS as a library"_start_5 :b 2.6 "Running LAMMPS"_start_6 :b 2.7 "Command-line options"_start_7 :b 2.8 "Screen output"_start_8 :b 2.9 "Tips for users of previous versions"_start_9 :ule,b "Commands"_Section_commands.html :l 3.1 "LAMMPS input script"_cmd_1 :ulb,b 3.2 "Parsing rules"_cmd_2 :b 3.3 "Input script structure"_cmd_3 :b 3.4 "Commands listed by category"_cmd_4 :b 3.5 "Commands listed alphabetically"_cmd_5 :ule,b "Packages"_Section_packages.html :l 4.1 "Standard packages"_pkg_1 :ulb,b 4.2 "User packages"_pkg_2 :ule,b "Accelerating LAMMPS performance"_Section_accelerate.html :l 5.1 "Measuring performance"_acc_1 :ulb,b 5.2 "General strategies"_acc_2 :b 5.3 "Packages with optimized styles"_acc_3 :b 5.4 "OPT package"_acc_4 :b 5.5 "USER-OMP package"_acc_5 :b 5.6 "GPU package"_acc_6 :b 5.7 "USER-CUDA package"_acc_7 :b 5.8 "Comparison of GPU and USER-CUDA packages"_acc_8 :ule,b "How-to discussions"_Section_howto.html :l 6.1 "Restarting a simulation"_howto_1 :ulb,b 6.2 "2d simulations"_howto_2 :b 6.3 "CHARMM and AMBER force fields"_howto_3 :b 6.4 "Running multiple simulations from one input script"_howto_4 :b 6.5 "Multi-replica simulations"_howto_5 :b 6.6 "Granular models"_howto_6 :b 6.7 "TIP3P water model"_howto_7 :b 6.8 "TIP4P water model"_howto_8 :b 6.9 "SPC water model"_howto_9 :b 6.10 "Coupling LAMMPS to other codes"_howto_10 :b 6.11 "Visualizing LAMMPS snapshots"_howto_11 :b 6.12 "Triclinic (non-orthogonal) simulation boxes"_howto_12 :b 6.13 "NEMD simulations"_howto_13 :b 6.14 "Finite-size spherical and aspherical particles"_howto_14 :b 6.15 "Output from LAMMPS (thermo, dumps, computes, fixes, 
variables)"_howto_15 :b 6.16 "Thermostatting, barostatting, and compute temperature"_howto_16 :b 6.17 "Walls"_howto_17 :b 6.18 "Elastic constants"_howto_18 :b 6.19 "Library interface to LAMMPS"_howto_19 :b 6.20 "Calculating thermal conductivity"_howto_20 :b 6.21 "Calculating viscosity"_howto_21 :b 6.22 "Calculating a diffusion coefficient"_howto_22 :ule,b "Example problems"_Section_example.html :l "Performance & scalability"_Section_perf.html :l "Additional tools"_Section_tools.html :l "Modifying & extending LAMMPS"_Section_modify.html :l 10.1 "Atom styles"_mod_1 :ulb,b 10.2 "Bond, angle, dihedral, improper potentials"_mod_2 :b 10.3 "Compute styles"_mod_3 :b 10.4 "Dump styles"_mod_4 :b 10.5 "Dump custom output options"_mod_5 :b 10.6 "Fix styles"_mod_6 :b 10.7 "Input script commands"_mod_7 :b 10.8 "Kspace computations"_mod_8 :b 10.9 "Minimization styles"_mod_9 :b 10.10 "Pairwise potentials"_mod_10 :b 10.11 "Region styles"_mod_11 :b 10.12 "Body styles"_mod_12 :b 10.13 "Thermodynamic output options"_mod_13 :b 10.14 "Variable options"_mod_14 :b 10.15 "Submitting new features for inclusion in LAMMPS"_mod_15 :ule,b "Python interface"_Section_python.html :l 11.1 "Building LAMMPS as a shared library"_py_1 :ulb,b 11.2 "Installing the Python wrapper into Python"_py_2 :b 11.3 "Extending Python with MPI to run in parallel"_py_3 :b 11.4 "Testing the Python-LAMMPS interface"_py_4 :b 11.5 "Using LAMMPS from Python"_py_5 :b 11.6 "Example Python scripts that use LAMMPS"_py_6 :ule,b "Errors"_Section_errors.html :l 12.1 "Common problems"_err_1 :ulb,b 12.2 "Reporting bugs"_err_2 :b 12.3 "Error & warning messages"_err_3 :ule,b "Future and history"_Section_history.html :l 13.1 "Coming attractions"_hist_1 :ulb,b 13.2 "Past versions"_hist_2 :ule,b :ole :link(intro_1,Section_intro.html#intro_1) :link(intro_2,Section_intro.html#intro_2) :link(intro_3,Section_intro.html#intro_3) :link(intro_4,Section_intro.html#intro_4) :link(intro_5,Section_intro.html#intro_5) :link(start_1,Section_start.html#start_1) :link(start_2,Section_start.html#start_2) :link(start_3,Section_start.html#start_3) :link(start_4,Section_start.html#start_4) :link(start_5,Section_start.html#start_5) :link(start_6,Section_start.html#start_6) :link(start_7,Section_start.html#start_7) :link(start_8,Section_start.html#start_8) :link(start_9,Section_start.html#start_9) :link(cmd_1,Section_commands.html#cmd_1) :link(cmd_2,Section_commands.html#cmd_2) :link(cmd_3,Section_commands.html#cmd_3) :link(cmd_4,Section_commands.html#cmd_4) :link(cmd_5,Section_commands.html#cmd_5) :link(pkg_1,Section_packages.html#pkg_1) :link(pkg_2,Section_packages.html#pkg_2) :link(acc_1,Section_accelerate.html#acc_1) :link(acc_2,Section_accelerate.html#acc_2) :link(acc_3,Section_accelerate.html#acc_3) :link(acc_4,Section_accelerate.html#acc_4) :link(acc_5,Section_accelerate.html#acc_5) :link(acc_6,Section_accelerate.html#acc_6) :link(acc_7,Section_accelerate.html#acc_7) :link(acc_8,Section_accelerate.html#acc_8) :link(howto_1,Section_howto.html#howto_1) :link(howto_2,Section_howto.html#howto_2) :link(howto_3,Section_howto.html#howto_3) :link(howto_4,Section_howto.html#howto_4) :link(howto_5,Section_howto.html#howto_5) :link(howto_6,Section_howto.html#howto_6) :link(howto_7,Section_howto.html#howto_7) :link(howto_8,Section_howto.html#howto_8) :link(howto_9,Section_howto.html#howto_9) :link(howto_10,Section_howto.html#howto_10) :link(howto_11,Section_howto.html#howto_11) :link(howto_12,Section_howto.html#howto_12) :link(howto_13,Section_howto.html#howto_13) 
:link(howto_14,Section_howto.html#howto_14) :link(howto_15,Section_howto.html#howto_15) :link(howto_16,Section_howto.html#howto_16) :link(howto_17,Section_howto.html#howto_17) :link(howto_18,Section_howto.html#howto_18) :link(howto_19,Section_howto.html#howto_19) :link(howto_20,Section_howto.html#howto_20) :link(howto_21,Section_howto.html#howto_21) :link(mod_1,Section_modify.html#mod_1) :link(mod_2,Section_modify.html#mod_2) :link(mod_3,Section_modify.html#mod_3) :link(mod_4,Section_modify.html#mod_4) :link(mod_5,Section_modify.html#mod_5) :link(mod_6,Section_modify.html#mod_6) :link(mod_7,Section_modify.html#mod_7) :link(mod_8,Section_modify.html#mod_8) :link(mod_9,Section_modify.html#mod_9) :link(mod_10,Section_modify.html#mod_10) :link(mod_11,Section_modify.html#mod_11) :link(mod_12,Section_modify.html#mod_12) :link(mod_13,Section_modify.html#mod_13) :link(mod_14,Section_modify.html#mod_14) :link(mod_15,Section_modify.html#mod_15) :link(py_1,Section_python.html#py_1) :link(py_2,Section_python.html#py_2) :link(py_3,Section_python.html#py_3) :link(py_4,Section_python.html#py_4) :link(py_5,Section_python.html#py_5) :link(py_6,Section_python.html#py_6) :link(err_1,Section_errors.html#err_1) :link(err_2,Section_errors.html#err_2) :link(err_3,Section_errors.html#err_3) :link(hist_1,Section_history.html#hist_1) :link(hist_2,Section_history.html#hist_2) </BODY> diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html index 1b48220bc..4de4fa10f 100644 --- a/doc/Section_accelerate.html +++ b/doc/Section_accelerate.html @@ -1,723 +1,1061 @@ <HTML> <CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> - <A HREF = "Section_howto.html">Next Section</A> </CENTER> <HR> <H3>5. Accelerating LAMMPS performance </H3> <P>This section describes various methods for improving LAMMPS performance for different classes of problems running on different kinds of machines. </P> 5.1 <A HREF = "#acc_1">Measuring performance</A><BR> 5.2 <A HREF = "#acc_2">General strategies</A><BR> 5.3 <A HREF = "#acc_3">Packages with optimized styles</A><BR> 5.4 <A HREF = "#acc_4">OPT package</A><BR> 5.5 <A HREF = "#acc_5">USER-OMP package</A><BR> 5.6 <A HREF = "#acc_6">GPU package</A><BR> 5.7 <A HREF = "#acc_7">USER-CUDA package</A><BR> -5.8 <A HREF = "#acc_8">Comparison of GPU and USER-CUDA packages</A> <BR> +5.8 <A HREF = "#acc_8">KOKKOS package</A><BR> +5.9 <A HREF = "#acc_9">Comparison of GPU and USER-CUDA packages</A> <BR> <HR> <HR> <H4><A NAME = "acc_1"></A>5.1 Measuring performance </H4> <P>Before trying to make your simulation run faster, you should understand how it currently performs and where the bottlenecks are. </P> <P>The best way to do this is to run your system (actual number of atoms) for a modest number of timesteps (say 100, or a few 100 at most) on several different processor counts, including a single processor if possible. Do this for an equilibrium version of your system, so that the 100-step timings are representative of a much longer run. There is typically no need to run for 1000s of timesteps to get accurate timings; you can simply extrapolate from short runs. </P> <P>For the set of runs, look at the timing data printed to the screen and log file at the end of each LAMMPS run. <A HREF = "Section_start.html#start_8">This section</A> of the manual has an overview.
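One way to script such a set of runs is sketched below; it assumes an executable named lmp_machine, an input script in.script that runs about 100 timesteps, and the default log file name log.lammps.
</P>
<PRE>for n in 1 2 4 8 16
do
  mpirun -np $n lmp_machine -in in.script
  grep "Loop time" log.lammps
done
</PRE>
<P>Dividing the 1-processor loop time by the N-processor loop time, and then by N, gives the parallel efficiency discussed next.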
</P> <P>Running on one (or a few processors) should give a good estimate of the serial performance and what portions of the timestep are taking the most time. Running the same problem on a few different processor counts should give an estimate of parallel scalability. I.e. if the simulation runs 16x faster on 16 processors, its 100% parallel efficient; if it runs 8x faster on 16 processors, it's 50% efficient. </P> <P>The most important data to look at in the timing info is the timing breakdown and relative percentages. For example, trying different options for speeding up the long-range solvers will have little impact if they only consume 10% of the run time. If the pairwise time is dominating, you may want to look at GPU or OMP versions of the pair style, as discussed below. Comparing how the percentages change as you increase the processor count gives you a sense of how different operations within the timestep are scaling. Note that if you are running with a Kspace solver, there is additional output on the breakdown of the Kspace time. For PPPM, this includes the fraction spent on FFTs, which can be communication intensive. </P> <P>Another important detail in the timing info are the histograms of atoms counts and neighbor counts. If these vary widely across processors, you have a load-imbalance issue. This often results in inaccurate relative timing data, because processors have to wait when communication occurs for other processors to catch up. Thus the reported times for "Communication" or "Other" may be higher than they really are, due to load-imbalance. If this is an issue, you can uncomment the MPI_Barrier() lines in src/timer.cpp, and recompile LAMMPS, to obtain synchronized timings. </P> <HR> <H4><A NAME = "acc_2"></A>5.2 General strategies </H4> <P>NOTE: this sub-section is still a work in progress </P> <P>Here is a list of general ideas for improving simulation performance. Most of them are only applicable to certain models and certain bottlenecks in the current performance, so let the timing data you generate be your guide. It is hard, if not impossible, to predict how much difference these options will make, since it is a function of problem size, number of processors used, and your machine. There is no substitute for identifying performance bottlenecks, and trying out various options. </P> <UL><LI>rRESPA <LI>2-FFT PPPM <LI>Staggered PPPM <LI>single vs double PPPM <LI>partial charge PPPM <LI>verlet/split <LI>processor mapping via processors numa command <LI>load-balancing: balance and fix balance <LI>processor command for layout <LI>OMP when lots of cores </UL> <P>2-FFT PPPM, also called <I>analytic differentiation</I> or <I>ad</I> PPPM, uses 2 FFTs instead of the 4 FFTs used by the default <I>ik differentiation</I> PPPM. However, 2-FFT PPPM also requires a slightly larger mesh size to achieve the same accuracy as 4-FFT PPPM. For problems where the FFT cost is the performance bottleneck (typically large problems running on many processors), 2-FFT PPPM may be faster than 4-FFT PPPM. </P> <P>Staggered PPPM performs calculations using two different meshes, one shifted slightly with respect to the other. This can reduce force aliasing errors and increase the accuracy of the method, but also doubles the amount of work required. For high relative accuracy, using staggered PPPM allows one to half the mesh size in each dimension as compared to regular PPPM, which can give around a 4x speedup in the kspace time. 
However, for low relative accuracy, using staggered PPPM gives little benefit and can be up to 2x slower in the kspace time. For example, the rhodopsin benchmark was run on a single processor, and results for kspace time vs. relative accuracy for the different methods are shown in the figure below. For this system, staggered PPPM (using ik differentiation) becomes useful when using a relative accuracy of slightly greater than 1e-5 and above. </P> <CENTER><IMG SRC = "JPG/rhodo_staggered.jpg"> </CENTER> <P>IMPORTANT NOTE: Using staggered PPPM may not give the same increase in accuracy of energy and pressure as it does in forces, so some caution must be used if energy and/or pressure are quantities of interest, such as when using a barostat. </P> <HR> <H4><A NAME = "acc_3"></A>5.3 Packages with optimized styles </H4> <P>Accelerated versions of various <A HREF = "pair_style.html">pair_style</A>, <A HREF = "fix.html">fixes</A>, <A HREF = "compute.html">computes</A>, and other commands have been added to LAMMPS, which will typically run faster than the standard non-accelerated versions, if you have the appropriate hardware on your system. </P> <P>The accelerated styles have the same name as the standard styles, except that a suffix is appended. Otherwise, the syntax for the command is identical, their functionality is the same, and the numerical results it produces should also be identical, except for precision and round-off issues. </P> -<P>For example, all of these variants of the basic Lennard-Jones pair -style exist in LAMMPS: +<P>For example, all of these styles are variants of the basic +Lennard-Jones pair style <A HREF = "pair_lj.html">pair_style lj/cut</A>: </P> -<UL><LI><A HREF = "pair_lj.html">pair_style lj/cut</A> -<LI><A HREF = "pair_lj.html">pair_style lj/cut/opt</A> -<LI><A HREF = "pair_lj.html">pair_style lj/cut/omp</A> +<UL><LI><A HREF = "pair_lj.html">pair_style lj/cut/cuda</A> <LI><A HREF = "pair_lj.html">pair_style lj/cut/gpu</A> -<LI><A HREF = "pair_lj.html">pair_style lj/cut/cuda</A> +<LI><A HREF = "pair_lj.html">pair_style lj/cut/kk</A> +<LI><A HREF = "pair_lj.html">pair_style lj/cut/omp</A> +<LI><A HREF = "pair_lj.html">pair_style lj/cut/opt</A> </UL> <P>Assuming you have built LAMMPS with the appropriate package, these styles can be invoked by specifying them explicitly in your input script. Or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A> to invoke the accelerated versions automatically, without changing your input script. The <A HREF = "suffix.html">suffix</A> command allows you to set a suffix explicitly and -to turn off/on the comand-line switch setting, both from within your -input script. +to turn off and back on the command-line switch setting, both from +within your input script. </P> -<P>Styles with an "opt" suffix are part of the OPT package and typically -speed-up the pairwise calculations of your simulation by 5-25%. +<P>Styles with a "cuda" or "gpu" suffix are part of the USER-CUDA or GPU +packages, and can be run on NVIDIA GPUs associated with your CPUs. +The speed-up due to GPU usage depends on a variety of factors, as +discussed below. +</P> +<P>Styles with a "kk" suffix are part of the KOKKOS package, and can be +run using OpenMP, pthreads, or on an NVIDIA GPU. The speed-up depends +on a variety of factors, as discussed below. </P> <P>Styles with an "omp" suffix are part of the USER-OMP package and allow a pair-style to be run in multi-threaded mode using OpenMP.
This can be useful on nodes with high-core counts when using less MPI processes than cores is advantageous, e.g. when running with PPPM so that FFTs are run on fewer MPI processors or when the many MPI tasks would overload the available bandwidth for communication. </P> -<P>Styles with a "gpu" or "cuda" suffix are part of the GPU or USER-CUDA -packages, and can be run on NVIDIA GPUs associated with your CPUs. -The speed-up due to GPU usage depends on a variety of factors, as -discussed below. +<P>Styles with an "opt" suffix are part of the OPT package and typically +speed-up the pairwise calculations of your simulation by 5-25%. </P> <P>To see what styles are currently available in each of the accelerated packages, see <A HREF = "Section_commands.html#cmd_5">Section_commands 5</A> of the manual. A list of accelerated styles is included in the pair, fix, -compute, and kspace sections. +compute, and kspace sections. The doc page for each individual style +(e.g. <A HREF = "pair_lj.html">pair lj/cut</A> or <A HREF = "fix_nve.html">fix nve</A>) will also +list any accelerated variants available for that style. </P> <P>The following sections explain: </P> <UL><LI>what hardware and software the accelerated styles require -<LI>how to build LAMMPS with the accelerated packages in place +<LI>how to build LAMMPS with the accelerated package in place <LI>what changes (if any) are needed in your input scripts <LI>guidelines for best performance <LI>speed-ups you can expect </UL> <P>The final section compares and contrasts the GPU and USER-CUDA -packages, since they are both designed to use NVIDIA GPU hardware. +packages, since they are both designed to use NVIDIA hardware. </P> <HR> <H4><A NAME = "acc_4"></A>5.4 OPT package </H4> <P>The OPT package was developed by James Fischer (High Performance Technologies), David Richie, and Vincent Natoli (Stone Ridge Technologies). It contains a handful of pair styles whose compute() methods were rewritten in C++ templated form to reduce the overhead due to if tests and other conditional code. </P> <P>The procedure for building LAMMPS with the OPT package is simple. It is the same as for any other package which has no additional library dependencies: </P> <PRE>make yes-opt make machine </PRE> -<P>If your input script uses one of the OPT pair styles, -you can run it as follows: +<P>If your input script uses one of the OPT pair styles, you can run it +as follows: </P> <PRE>lmp_machine -sf opt < in.script mpirun -np 4 lmp_machine -sf opt < in.script </PRE> <P>You should see a reduction in the "Pair time" printed out at the end of the run. On most machines and problems, this will typically be a 5 to 20% savings. </P> <HR> <H4><A NAME = "acc_5"></A>5.5 USER-OMP package </H4> -<P>The USER-OMP package was developed by Axel Kohlmeyer at Temple University. -It provides multi-threaded versions of most pair styles, all dihedral -styles and a few fixes in LAMMPS. The package currently uses the OpenMP -interface which requires using a specific compiler flag in the makefile -to enable multiple threads; without this flag the corresponding pair -styles will still be compiled and work, but do not support multi-threading. +<P>The USER-OMP package was developed by Axel Kohlmeyer at Temple +University. It provides multi-threaded versions of most pair styles, +all dihedral styles, and a few fixes in LAMMPS.
The package currently +uses the OpenMP interface which requires using a specific compiler +flag in the makefile to enable multiple threads; without this flag the +corresponding pair styles will still be compiled and work, but do not +support multi-threading. </P> <P><B>Building LAMMPS with the USER-OMP package:</B> </P> <P>The procedure for building LAMMPS with the USER-OMP package is simple. You have to edit your machine specific makefile to add the flag to enable OpenMP support to the CCFLAGS and LINKFLAGS variables. For the GNU compilers for example this flag is called <I>-fopenmp</I>. Check your compiler documentation to find out which flag you need to add. The rest of the compilation is the same as for any other package which has no additional library dependencies: </P> <PRE>make yes-user-omp make machine </PRE> <P>Please note that this will only install accelerated versions of styles that are already installed, so you want to install this package as the last package, or else you may be missing some accelerated styles. If you plan to uninstall some package, you should first uninstall the USER-OMP package then the other package and then re-install USER-OMP, to make sure that there are no orphaned <I>omp</I> style files present, which would lead to compilation errors. </P> <P>If your input script uses one of the regular styles that also exist as an OpenMP version in the USER-OMP package, you can run it as follows: </P> <PRE>env OMP_NUM_THREADS=4 lmp_serial -sf omp -in in.script env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script </PRE> <P>The value of the environment variable OMP_NUM_THREADS determines how -many threads per MPI task are launched. All three examples above use -a total of 4 CPU cores. For different MPI implementations the method -to pass the OMP_NUM_THREADS environment variable to all processes is -different. Two different variants, one for MPICH and OpenMPI, respectively -are shown above. Please check the documentation of your MPI installation -for additional details. Alternatively, the value provided by OMP_NUM_THREADS -can be overridded with the <A HREF = "package.html">package omp</A> command. -Depending on which styles are accelerated in your input, you should -see a reduction in the "Pair time" and/or "Bond time" and "Loop time" -printed out at the end of the run. The optimal ratio of MPI to OpenMP -can vary a lot and should always be confirmed through some benchmark -runs for the current system and on the current machine. +many threads per MPI task are launched. All three examples above use a +total of 4 CPU cores. For different MPI implementations the method to +pass the OMP_NUM_THREADS environment variable to all processes is +different. Two different variants, one for MPICH and OpenMPI, +respectively are shown above. Please check the documentation of your +MPI installation for additional details. Alternatively, the value +provided by OMP_NUM_THREADS can be overridden with the <A HREF = "package.html">package +omp</A> command. Depending on which styles are accelerated +in your input, you should see a reduction in the "Pair time" and/or +"Bond time" and "Loop time" printed out at the end of the run. The +optimal ratio of MPI to OpenMP can vary a lot and should always be +confirmed through some benchmark runs for the current system and on +the current machine.
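A simple way to do this is to scan the possible MPI x OpenMP splits of a node and compare the reported "Loop time" values; the loop below is only a sketch, assuming a 16-core compute node, an executable named lmp_machine, and an OpenMPI-style mpirun that accepts the -x option shown above.
</P>
<PRE>for np in 16 8 4 2 1
do
  nt=$((16/np))
  mpirun -x OMP_NUM_THREADS=$nt -np $np lmp_machine -sf omp -in in.script
  grep "Loop time" log.lammps
done
</PRE>
<P>Whichever split gives the smallest loop time is the split to use for production runs of that system on that machine.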
</P> <P><B>Restrictions:</B> </P> <P>None of the pair styles in the USER-OMP package support the "inner", "middle", "outer" options for r-RESPA integration, only the "pair" option is supported. </P> <P><B>Parallel efficiency and performance tips:</B> </P> <P>In most simple cases the MPI parallelization in LAMMPS is more efficient than multi-threading implemented in the USER-OMP package. Also the parallel efficiency varies between individual styles. On the other hand, in many cases you still want to use the <I>omp</I> version - even when compiling or running without OpenMP support - since they all contain optimizations similar to those in the OPT package, which can result in serial speedup. </P> -<P>Using multi-threading is most effective under the following circumstances: -</P> -<UL><LI>Individual compute nodes have a significant number of CPU cores -but the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx -(Clovertown) and 54xx (Harpertown) quad core processors. Running -one MPI task per CPU core will result in significant performance -degradation, so that running with 4 or even only 2 MPI tasks per -nodes is faster. Running in hybrid MPI+OpenMP mode will reduce the -inter-node communication bandwidth contention in the same way, -but offers and additional speedup from utilizing the otherwise -idle CPU cores. +<P>Using multi-threading is most effective under the following +circumstances: +</P> +<UL><LI>Individual compute nodes have a significant number of CPU cores but +the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx +(Clovertown) and 54xx (Harpertown) quad core processors. Running one +MPI task per CPU core will result in significant performance +degradation, so that running with 4 or even only 2 MPI tasks per node +is faster. Running in hybrid MPI+OpenMP mode will reduce the +inter-node communication bandwidth contention in the same way, but +offers an additional speedup from utilizing the otherwise idle CPU +cores. <LI>The interconnect used for MPI communication is not able to provide -sufficient bandwidth for a large number of MPI tasks per node. -This applies for example to running over gigabit ethernet or -on Cray XT4 or XT5 series supercomputers. Same as in the aforementioned -case this effect worsens with using an increasing number of nodes. - -<LI>The input is a system that has an inhomogeneous particle density -which cannot be mapped well to the domain decomposition scheme -that LAMMPS employs. While this can be to some degree alleviated -through using the <A HREF = "processors.html">processors</A> keyword, multi-threading -provides a parallelism that parallelizes over the number of particles -not their distribution in space. +sufficient bandwidth for a large number of MPI tasks per node. This +applies for example to running over gigabit ethernet or on Cray XT4 or +XT5 series supercomputers. Same as in the aforementioned case this +effect worsens with using an increasing number of nodes. + +<LI>The input is a system that has an inhomogeneous particle density which +cannot be mapped well to the domain decomposition scheme that LAMMPS +employs. While this can be to some degree alleviated through using the +<A HREF = "processors.html">processors</A> keyword, multi-threading provides a +parallelism that parallelizes over the number of particles not their +distribution in space. <LI>Finally, multi-threaded styles can improve performance when running LAMMPS in "capability mode", i.e. near the point where the MPI -parallelism scales out.
This can happen in particular when using -as kspace style for long-range electrostatics. Here the scaling -of the kspace style is the performance limiting factor and using -multi-threaded styles allows to operate the kspace style at the -limit of scaling and then increase performance parallelizing -the real space calculations with hybrid MPI+OpenMP. Sometimes -additional speedup can be achived by increasing the real-space -coulomb cutoff and thus reducing the work in the kspace part. +parallelism scales out. This can happen in particular when using a +kspace style for long-range electrostatics. Here the scaling of the +kspace style is the performance limiting factor and using +multi-threaded styles allows to operate the kspace style at the limit +of scaling and then increase performance parallelizing the real space +calculations with hybrid MPI+OpenMP. Sometimes additional speedup can +be achieved by increasing the real-space coulomb cutoff and thus +reducing the work in the kspace part. </UL> -<P>The best parallel efficiency from <I>omp</I> styles is typically -achieved when there is at least one MPI task per physical -processor, i.e. socket or die. +<P>The best parallel efficiency from <I>omp</I> styles is typically achieved +when there is at least one MPI task per physical processor, +i.e. socket or die. </P> <P>Using threads on hyper-threading enabled cores is usually counterproductive, as the cost in additional memory bandwidth -requirements is not offset by the gain in CPU utilization -through hyper-threading. +requirements is not offset by the gain in CPU utilization through +hyper-threading. </P> <P>A description of the multi-threading strategy and some performance -examples are <A HREF = "http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1">presented here</A> +examples are <A HREF = "http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1">presented +here</A> </P> <HR> <H4><A NAME = "acc_6"></A>5.6 GPU package </H4> <P>The GPU package was developed by Mike Brown at ORNL and his collaborators. It provides GPU versions of several pair styles, including the 3-body Stillinger-Weber pair style, and for long-range Coulombics via the PPPM command. It has the following features: </P> <UL><LI>The package is designed to exploit common GPU hardware configurations where one or more GPUs are coupled with many cores of a multi-core CPUs, e.g. within a node of a parallel machine. <LI>Atom-based data (e.g. coordinates, forces) moves back-and-forth between the CPU(s) and GPU every timestep. <LI>Neighbor lists can be constructed on the CPU or on the GPU <LI>The charge assignment and force interpolation portions of PPPM can be run on the GPU. The FFT portion, which requires MPI communication between processors, runs on the CPU. <LI>Asynchronous force computations can be performed simultaneously on the CPU(s) and GPU. +<LI>It allows for GPU computations to be performed in single or double +precision, or in mixed-mode precision, where pairwise forces are +computed in single precision, but accumulated into double-precision +force vectors. + <LI>LAMMPS-specific code is in the GPU package. It makes calls to a generic GPU library in the lib/gpu directory. This library provides NVIDIA support as well as more general OpenCL support, so that the same functionality can eventually be supported on a variety of GPU hardware.
</UL> -<P>NOTE: - discuss 3 precisions - if change, also have to re-link with LAMMPS - always use newton off - expt with differing numbers of CPUs vs GPU - can't tell what is fastest - give command line switches in examples -</P> -<P>I am not very clear to the meaning of "Max Mem / Proc" -in the "GPU Time Info (average)". -Is it the maximal of GPU memory used by one CPU core? -</P> -<P>It is the maximum memory used at one time on the GPU for data storage by -a single MPI process. - Mike -</P> <P><B>Hardware and software requirements:</B> </P> -<P>To use this package, you currently need to have specific NVIDIA -hardware and install specific NVIDIA CUDA software on your system: +<P>To use this package, you currently need to have an NVIDIA GPU and +install the NVIDIA Cuda software on your system: </P> -<UL><LI>Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 +<UL><LI>Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/cards/0 <LI>Go to http://www.nvidia.com/object/cuda_get.html <LI>Install a driver and toolkit appropriate for your system (SDK is not necessary) <LI>Follow the instructions in lammps/lib/gpu/README to build the library (see below) <LI>Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties </UL> <P><B>Building LAMMPS with the GPU package:</B> </P> <P>As with other packages that include a separately compiled library, you need to first build the GPU library, before building LAMMPS itself. General instructions for doing this are in <A HREF = "Section_start.html#start_3">this section</A> of the manual. For this package, -do the following, using a Makefile in lib/gpu appropriate for your -system: +use a Makefile in lib/gpu appropriate for your system. +</P> +<P>Before building the library, you can set the precision it will use by +editing the CUDA_PREC setting in the Makefile you are using, as +follows: +</P> +<PRE>CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations +CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations +CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc, in double +</PRE> +<P>The last setting is the mixed mode referred to above. Note that your +GPU must support double precision to use either the 2nd or 3rd of +these settings. +</P> +<P>To build the library, then type: </P> <PRE>cd lammps/lib/gpu make -f Makefile.linux (see further instructions in lammps/lib/gpu/README) </PRE> <P>If you are successful, you will produce the file lib/libgpu.a. </P> <P>Now you are ready to build LAMMPS with the GPU package installed: </P> <PRE>cd lammps/src make yes-gpu make machine </PRE> <P>Note that the lo-level Makefile (e.g. src/MAKE/Makefile.linux) has these settings: gpu_SYSINC, gpu_SYSLIB, gpu_SYSPATH. These need to be set appropriately to include the paths and settings for the CUDA system software on your machine. See src/MAKE/Makefile.g++ for an example. </P> -<P><B>GPU configuration</B> +<P>Also note that if you change the GPU library precision, you need to +re-build the entire library. You should do a "clean" first, +e.g. "make -f Makefile.linux clean". Then you must also re-build +LAMMPS if the library precision has changed, so that it re-links with +the new library. </P> -<P>When using GPUs, you are restricted to one physical GPU per LAMMPS -process, which is an MPI process running on a single core or -processor. Multiple MPI processes (CPU cores) can share a single GPU, -and in many cases it will be more efficient to run this way. 
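<P>Putting the build steps above together, switching the GPU library to a different precision might look like the following sketch. It assumes the lib/gpu Makefile.linux machine file and a LAMMPS makefile src/MAKE/Makefile.linux; adjust both to your system, and remember to edit the CUDA_PREC setting in lib/gpu/Makefile.linux first.
</P>
<PRE>cd lammps/lib/gpu
make -f Makefile.linux clean      # remove objects built with the old precision
make -f Makefile.linux            # rebuild lib/libgpu.a with the new CUDA_PREC setting
cd ../../src
make yes-gpu                      # install (or re-install) the GPU package files
make linux                        # re-link LAMMPS against the new library
</PRE>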
+<P><B>Running an input script:</B> </P> -<P><B>Input script requirements:</B> +<P>The examples/gpu and bench/GPU directories have scripts that can be +run with the GPU package, as well as detailed instructions on how to +run them. </P> -<P>Additional input script requirements to run pair or PPPM styles with a +<P>The total number of MPI tasks used by LAMMPS (one or multiple per +compute node) is set in the usual manner via the mpirun or mpiexec +commands, and is independent of the GPU package. +</P> +<P>When using the GPU package, you cannot assign more than one physical +GPU to an MPI task. However, multiple MPI tasks can share the same +GPU, and in many cases it will be more efficient to run this way. +</P> +<P>Input script requirements to run using pair or PPPM styles with a <I>gpu</I> suffix are as follows: </P> -<UL><LI>To invoke specific styles from the GPU package, you can either append -"gpu" to the style name (e.g. pair_style lj/cut/gpu), or use the -<A HREF = "Section_start.html#start_7">-suffix command-line switch</A>, or use the -<A HREF = "suffix.html">suffix</A> command. +<UL><LI>To invoke specific styles from the GPU package, either append "gpu" to +the style name (e.g. pair_style lj/cut/gpu), or use the <A HREF = "Section_start.html#start_7">-suffix +command-line switch</A>, or use the +<A HREF = "suffix.html">suffix</A> command in the input script. -<LI>The <A HREF = "newton.html">newton pair</A> setting must be <I>off</I>. +<LI>The <A HREF = "newton.html">newton pair</A> setting in the input script must be +<I>off</I>. -<LI>The <A HREF = "package.html">package gpu</A> command must be used near the beginning -of your script to control the GPU selection and initialization -settings. It also has an option to enable asynchronous splitting of -force computations between the CPUs and GPUs. +<LI>Unless the <A HREF = "Section_start.html#start_7">-suffix gpu command-line +switch</A> is used, the <A HREF = "package.html">package +gpu</A> command must be used near the beginning of the +script to control the GPU selection and initialization settings. It +also has an option to enable asynchronous splitting of force +computations between the CPUs and GPUs. </UL> -<P>As an example, if you have two GPUs per node and 8 CPU cores per node, -and would like to run on 4 nodes (32 cores) with dynamic balancing of -force calculation across CPU and GPU cores, you could specify -</P> -<PRE>package gpu force/neigh 0 1 -1 -</PRE> -<P>In this case, all CPU cores and GPU devices on the nodes would be -utilized. Each GPU device would be shared by 4 CPU cores. The CPU -cores would perform force calculations for some fraction of the -particles at the same time the GPUs performed force calculation for -the other particles. +<P>The default for the <A HREF = "package.html">package gpu</A> command is to have all +the MPI tasks on the compute node use a single GPU. If you have +multiple GPUs per node, then be sure to create one or more MPI tasks +per GPU, and use the first/last settings in the <A HREF = "package.html">package +gpu</A> command to include all the GPU IDs on the node. +E.g. first = 0, last = 1, for 2 GPUs. For example, on an 8-core 2-GPU +compute node, if you assign 8 MPI tasks to the node, the following +command in the input script +</P> +<P>package gpu force/neigh 0 1 -1 +</P> +<P>would specify that each GPU is shared by 4 MPI tasks. The final -1 will +dynamically balance force calculations across the CPU cores and GPUs. +I.e.
each CPU core will perform force calculations for some small +fraction of the particles, at the same time the GPUs perform force +calculations for the majority of the particles. </P> <P><B>Timing output:</B> </P> <P>As described by the <A HREF = "package.html">package gpu</A> command, GPU accelerated pair styles can perform computations asynchronously with CPU computations. The "Pair" time reported by LAMMPS will be the maximum of the time required to complete the CPU pair style computations and the time required to complete the GPU pair style computations. Any time spent for GPU-enabled pair styles for computations that run simultaneously with <A HREF = "bond_style.html">bond</A>, <A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>, <A HREF = "improper_style.html">improper</A>, and <A HREF = "kspace_style.html">long-range</A> calculations will not be included in the "Pair" time. </P> <P>When the <I>mode</I> setting for the package gpu command is force/neigh, the time for neighbor list calculations on the GPU will be added into the "Pair" time, not the "Neigh" time. An additional breakdown of the times required for various tasks on the GPU (data copy, neighbor calculations, force computations, etc) are output only with the LAMMPS screen output (not in the log file) at the end of each run. These timings represent total time spent on the GPU for each routine, regardless of asynchronous CPU calculations. </P> +<P>The output section "GPU Time Info (average)" reports "Max Mem / Proc". +This is the maximum memory used at one time on the GPU for data +storage by a single MPI process. +</P> <P><B>Performance tips:</B> </P> -<P>Generally speaking, for best performance, you should use multiple CPUs -per GPU, as provided my most multi-core CPU/GPU configurations. +<P>You should experiment with how many MPI tasks per GPU to use to see +what gives the best performance for your problem. This is a function +of your problem size and what pair style you are using. Likewise, you +should also experiment with the precision setting for the GPU library +to see if single or mixed precision will give accurate results, since +they will typically be faster. +</P> +<P>Using multiple MPI tasks per GPU will often give the best performance, +as allowed by most multi-core CPU/GPU configurations. </P> -<P>Because of the large number of cores within each GPU device, it may be -more efficient to run on fewer processes per GPU when the number of -particles per MPI process is small (100's of particles); this can be -necessary to keep the GPU cores busy. +<P>If the number of particles per MPI task is small (e.g. 100s of +particles), it can be more efficient to run with fewer MPI tasks per +GPU, even if you do not use all the cores on the compute node. </P> -<P>See the lammps/lib/gpu/README file for instructions on how to build -the GPU library for single, mixed, or double precision. The latter -requires that your GPU card support double precision. +<P>The <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the LAMMPS +web site gives GPU performance on a desktop machine and the Titan HPC +platform at ORNL for several of the LAMMPS benchmarks, as a function +of problem size and number of compute nodes. </P> <HR> <H4><A NAME = "acc_7"></A>5.7 USER-CUDA package </H4> <P>The USER-CUDA package was developed by Christian Trott at U Technology Ilmenau in Germany.
It provides NVIDIA GPU versions of many pair styles, many fixes, a few computes, and for long-range Coulombics via the PPPM command. It has the following features: </P> <UL><LI>The package is designed to allow an entire LAMMPS calculation, for many timesteps, to run entirely on the GPU (except for inter-processor MPI communication), so that atom-based data (e.g. coordinates, forces) do not have to move back-and-forth between the CPU and GPU. <LI>The speed-up advantage of this approach is typically better when the number of atoms per GPU is large <LI>Data will stay on the GPU until a timestep where a non-GPU-ized fix or compute is invoked. Whenever a non-GPU operation occurs (fix, compute, output), data automatically moves back to the CPU as needed. This may incur a performance penalty, but should otherwise work transparently. <LI>Neighbor lists for GPU-ized pair styles are constructed on the GPU. <LI>The package only supports use of a single CPU (core) with each GPU. </UL> <P><B>Hardware and software requirements:</B> </P> <P>To use this package, you need to have specific NVIDIA hardware and install specific NVIDIA CUDA software on your system. </P> <P>Your NVIDIA GPU needs to support Compute Capability 1.3. This list may help you to find out the Compute Capability of your card: </P> <P>http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units </P> <P>Install the Nvidia Cuda Toolkit in version 3.2 or higher and the corresponding GPU drivers. The Nvidia Cuda SDK is not required for LAMMPSCUDA but we recommend it be installed. You can then make sure that its sample projects can be compiled without problems. </P> <P><B>Building LAMMPS with the USER-CUDA package:</B> </P> <P>As with other packages that include a separately compiled library, you need to first build the USER-CUDA library, before building LAMMPS itself. General instructions for doing this are in <A HREF = "Section_start.html#start_3">this section</A> of the manual. For this package, do the following, using settings in the lib/cuda Makefiles appropriate for your system: </P> <UL><LI>Go to the lammps/lib/cuda directory <LI>If your <I>CUDA</I> toolkit is not installed in the default system directoy <I>/usr/local/cuda</I> edit the file <I>lib/cuda/Makefile.common</I> accordingly. <LI>Type "make OPTIONS", where <I>OPTIONS</I> are one or more of the following options. The settings will be written to the <I>lib/cuda/Makefile.defaults</I> and used in the next step. <PRE><I>precision=N</I> to set the precision level N = 1 for single precision (default) N = 2 for double precision N = 3 for positions in double precision N = 4 for positions and velocities in double precision <I>arch=M</I> to set GPU compute capability M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default) M = 21 for CC2.1 (GF104/114, e.g. GTX560, GTX460, GTX450) M = 13 for CC1.3 (GF200, e.g. C1060, GTX285) <I>prec_timer=0/1</I> to use hi-precision timers 0 = do not use them (default) 1 = use these timers this is usually only useful for Mac machines <I>dbg=0/1</I> to activate debug mode 0 = no debug mode (default) 1 = yes debug mode this is only useful for developers <I>cufft=1</I> to determine usage of CUDA FFT library 0 = no CUFFT support (default) in the future other CUDA-enabled FFT libraries might be supported </PRE> <LI>Type "make" to build the library. If you are successful, you will produce the file lib/libcuda.a. 
</UL> <P>Now you are ready to build LAMMPS with the USER-CUDA package installed: </P> <PRE>cd lammps/src make yes-user-cuda make machine </PRE> <P>Note that the LAMMPS build references the lib/cuda/Makefile.common file to extract setting specific CUDA settings. So it is important that you have first built the cuda library (in lib/cuda) using settings appropriate to your system. </P> <P><B>Input script requirements:</B> </P> <P>Additional input script requirements to run styles with a <I>cuda</I> suffix are as follows: </P> <UL><LI>To invoke specific styles from the USER-CUDA package, you can either append "cuda" to the style name (e.g. pair_style lj/cut/cuda), or use the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A>, or use the <A HREF = "suffix.html">suffix</A> command. One exception is that the <A HREF = "kspace_style.html">kspace_style pppm/cuda</A> command has to be requested explicitly. <LI>To use the USER-CUDA package with its default settings, no additional command is needed in your input script. This is because when LAMMPS starts up, it detects if it has been built with the USER-CUDA package. See the <A HREF = "Section_start.html#start_7">-cuda command-line switch</A> for more details. <LI>To change settings for the USER-CUDA package at run-time, the <A HREF = "package.html">package cuda</A> command can be used near the beginning of your input script. See the <A HREF = "package.html">package</A> command doc page for details. </UL> <P><B>Performance tips:</B> </P> <P>The USER-CUDA package offers more speed-up relative to CPU performance when the number of atoms per GPU is large, e.g. on the order of tens or hundreds of 1000s. </P> <P>As noted above, this package will continue to run a simulation entirely on the GPU(s) (except for inter-processor MPI communication), for multiple timesteps, until a CPU calculation is required, either by a fix or compute that is non-GPU-ized, or until output is performed (thermo or dump snapshot or restart file). The less often this occurs, the faster your simulation will run. </P> <HR> +<H4><A NAME = "acc_8"></A>5.8 KOKKOS package +</H4> +<P>The KOKKOS package contains versions of pair, fix, and atom styles +that use data structures and methods and macros provided by the Kokkos +library, which is included with LAMMPS in lib/kokkos. +</P> +<P><A HREF = "http://trilinos.sandia.gov/packages/kokkos">Kokkos</A> is a C++ library +that provides two key abstractions for an application like LAMMPS. +First, it allows a single implementation of an application kernel +(e.g. a pair style) to run efficiently on different kinds of hardware +(GPU, Intel Phi, many-core chip). +</P> +<P>Second, it provides data abstractions to adjust (at compile time) the +memory layout of basic data structures like 2d and 3d arrays and allow +the transparent utilization of special hardware load and store units. +Such data structures are used in LAMMPS to store atom coordinates or +forces or neighbor lists. The layout is chosen to optimize +performance on different platforms. Again this operation is hidden +from the developer, and does not affect how the single implementation +of the kernel is coded. +</P> +<P>These abstractions are set at build time, when LAMMPS is compiled with +the KOKKOS package installed. This is done by selecting a "host" and +"device" to build for, compatible with the compute nodes in your +machine. Note that if you are running on a desktop machine, you +typically have one compute node. 
On a cluster or supercomputer there +may be dozens or 1000s of compute nodes. The procedure for building +and running with the Kokkos library is the same, no matter how many +nodes you run on. +</P> +<P>All Kokkos operations occur within the context of an individual MPI +task running on a single node of the machine. The total number of MPI +tasks used by LAMMPS (one or multiple per compute node) is set in the +usual manner via the mpirun or mpiexec commands, and is independent of +Kokkos. +</P> +<P>Kokkos provides support for one or two modes of execution per MPI +task. This means that some computational tasks (pairwise +interactions, neighbor list builds, time integration, etc) are +parallelized in one or the other of the two modes. The first mode is +called the "host" and is one or more threads running on one or more +physical CPUs (within the node). Currently, both multi-core CPUs and +an Intel Phi processor (running in native mode) are supported. The +second mode is called the "device" and is an accelerator chip of some +kind. Currently only an NVIDIA GPU is supported. If your compute +node does not have a GPU, then there is only one mode of execution, +i.e. the host and device are the same. +</P> +<P>IMPORTANT NOTE: Currently, if using GPUs, you should set the number +of MPI tasks per compute node to be equal to the number of GPUs per +compute node. In the future Kokkos will support assigning one GPU to +multiple MPI tasks or using multiple GPUs per MPI task. Currently +Kokkos does not support AMD GPUs due to limits in the available +backend programming models (in particular relatively extensive C++ +support is required for the Kernel language). This is expected to +change in the future. +</P> +<P>Here are several examples of how to build LAMMPS and run a simulation +using the KOKKOS package for typical compute node configurations. +Note that the -np setting for the mpirun command in these examples is +for a run on a single node. To scale these examples up to run on a +system with N compute nodes, simply multiply the -np setting by N. +</P> +<P>All the build steps are performed from within the src directory. All +the run steps are performed in the bench directory using the in.lj +input script. It is assumed the LAMMPS executable has been copied to +that directory or whatever directory the runs are being performed in. +Details of the various options are discussed below.
+</P> +<P><B>Compute node(s) = dual hex-core CPUs and no GPU:</B> +</P> +<PRE>make yes-kokkos # install the KOKKOS package +make g++ OMP=yes # build with OpenMP, no CUDA +</PRE> +<PRE>mpirun -np 12 lmp_g++ -k off < in.lj # MPI-only mode with no Kokkos +mpirun -np 12 lmp_g++ -sf kk < in.lj # MPI-only mode with Kokkos +mpirun -np 1 lmp_g++ -k on t 12 -sf kk < in.lj # one MPI task, 12 threads +mpirun -np 2 lmp_g++ -k on t 6 -sf kk < in.lj # two MPI tasks, 6 threads/task +</PRE> +<P><B>Compute node(s) = Intel Phi with 61 cores:</B> +</P> +<PRE>make yes-kokkos +make g++ OMP=yes MIC=yes # build with OpenMP for Phi +</PRE> +<PRE>mpirun -np 12 lmp_g++ -k on t 20 -sf kk < in.lj # 12*20 = 240 total cores +mpirun -np 15 lmp_g++ -k on t 16 -sf kk < in.lj +mpirun -np 30 lmp_g++ -k on t 8 -sf kk < in.lj +mpirun -np 1 lmp_g++ -k on t 240 -sf kk < in.lj +</PRE> +<P><B>Compute node(s) = dual hex-core CPUs and a single GPU:</B> +</P> +<PRE>make yes-kokkos +make cuda CUDA=yes # build for GPU, use src/MAKE/Makefile.cuda +</PRE> +<PRE>mpirun -np 1 lmp_cuda -k on t 6 -sf kk < in.lj +</PRE> +<P><B>Compute node(s) = dual 8-core CPUs and 2 GPUs:</B> +</P> +<PRE>make yes-kokkos +make cuda CUDA=yes +</PRE> +<PRE>mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk < in.lj # use both GPUs, one per MPI task +</PRE> +<P><B>Building LAMMPS with the KOKKOS package:</B> +</P> +<P>A summary of the build process is given here. More details and all +the available make variable options are given in <A HREF = "Section_start.html#start_3_4">this +section</A> of the manual. +</P> +<P>From the src directory, type +</P> +<PRE>make yes-kokkos +</PRE> +<P>to include the KOKKOS package. Then perform a normal LAMMPS build, +with additional make variable specifications to choose the host and +device you will run the resulting executable on, e.g. +</P> +<PRE>make g++ OMP=yes +make cuda CUDA=yes +</PRE> +<P>As illustrated above, the most important variables to set are OMP, +CUDA, and MIC. The default settings are OMP=yes, CUDA=no, MIC=no. +Setting OMP to <I>yes</I> will use OpenMP for threading on the host, as +well as on the device (if no GPU is present). Setting CUDA to <I>yes</I> +will use one or more GPUs as the device. Setting MIC=yes is necessary +when building for an Intel Phi processor. +</P> +<P>Note that to use a GPU, you must use a lo-level Makefile, +e.g. src/MAKE/Makefile.cuda as included in the LAMMPS distro, which +uses the NVIDIA "nvcc" compiler. You must check that the CCFLAGS -arch +setting is appropriate for your NVIDIA hardware and installed +software. Typical values for -arch are given in <A HREF = "Section_start.html#start_3_4">this +section</A> of the manual, as well as other +settings that must be included in the lo-level Makefile, if you create +your own. +</P> +<P><B>Input scripts and use of command-line switches -kokkos and -suffix:</B> +</P> +<P>To use any Kokkos-enabled style provided in the KOKKOS package, you +must use a Kokkos-enabled atom style. LAMMPS will give an error if +you do not do this. +</P> +<P>There are two command-line switches relevant to using Kokkos, -k or +-kokkos, and -sf or -suffix. They are described in detail in <A HREF = "Section_start.html#start_7">this +section</A> of the manual. +</P> +<P>Here are common options to use: +</P> +<UL><LI>-k off : runs an executable built with the KOKKOS package, as + if Kokkos were not installed. + +<LI>-sf kk : enables automatic use of Kokkos versions of atom, pair, +fix, compute styles if they exist.
This can also be done with more +precise control by using the <A HREF = "suffix.html">suffix</A> command or appending +"kk" to styles within the input script, e.g. "pair_style lj/cut/kk". + +<LI>-k on t Nt : specifies how many threads per MPI task to use within a + compute node. For good performance, the product of MPI tasks * + threads/task should not exceed the number of physical CPU or Intel + Phi cores. + +<LI>-k on g Ng : specifies how many GPUs per compute node are available. +The default is 1, so this should be specified if you have 2 or more +GPUs per compute node. +</UL> +<P><B>Use of package command options:</B> +</P> +<P>Using the <A HREF = "package.html">package kokkos</A> command in an input script +allows choice of options for neighbor lists and communication. See +the <A HREF = "package.html">package</A> command doc page for details and default +settings. +</P> +<P>Experimenting with different styles of neighbor lists or inter-node +communication can provide a speed-up for specific calculations. +</P> +<P><B>Running on a multi-core CPU:</B> +</P> +<P>Build with OMP=yes (the default) and CUDA=no (the default). +</P> +<P>If N is the number of physical cores/node, then the number of MPI +tasks/node * number of threads/task should not exceed N, and should +typically equal N. Note that the default threads/task is 1, as set by +the "t" keyword of the -k <A HREF = "Section_start.html#start_7">command-line +switch</A>. If you do not change this, no +additional parallelism (beyond MPI) will be invoked on the host +CPU(s). +</P> +<P>You can compare the performance running in different modes: +</P> +<UL><LI>run with 1 MPI task/node and N threads/task +<LI>run with N MPI tasks/node and 1 thread/task +<LI>run with settings in between these extremes +</UL> +<P>Examples of mpirun commands in these modes, for nodes with dual +hex-core CPUs and no GPU, are shown above. +</P> +<P><B>Running on GPUs:</B> +</P> +<P>Build with CUDA=yes, using src/MAKE/Makefile.cuda. Insure the setting +for CUDA_PATH in lib/kokkos/Makefile.lammps is correct for your Cuda +software installation. Insure the -arch setting in +src/MAKE/Makefile.cuda is correct for your GPU hardware/software (see +<A HREF = "Section_start.html#start_3_4">this section</A> of the manual for details). +</P> +<P>The -np setting of the mpirun command should set the number of MPI +tasks/node to be equal to the # of physical GPUs on the node. +</P> +<P>Use the <A HREF = "Section_start.html#start_7">-kokkos command-line switch</A> to +specify the number of GPUs per node, and the number of threads per MPI +task. As above for multi-core CPUs (and no GPU), if N is the number +of physical cores/node, then the number of MPI tasks/node * number of +threads/task should not exceed N. With one GPU (and one MPI task) it +may be faster to use less than all the available cores, by setting +threads/task to a smaller value. This is because using all the cores +on a dual-socket node will incur extra cost to copy memory from the +2nd socket to the GPU. +</P> +<P>Examples of mpirun commands that follow these rules, for nodes with +dual hex-core CPUs and one or two GPUs, are shown above. +</P> +<P><B>Running on an Intel Phi:</B> +</P> +<P>Kokkos only uses Intel Phi processors in their "native" mode, i.e. +not hosted by a CPU. +</P> +<P>Build with OMP=yes (the default) and MIC=yes. The latter +insures code is correctly compiled for the Intel Phi.
The +OMP setting means OpenMP will be used for parallelization +on the Phi, which is currently the best option within +Kokkos. In the future, other options may be added. +</P> +<P>Current-generation Intel Phi chips have either 61 or 57 cores. One +core should be excluded to run the OS, leaving 60 or 56 cores. Each +core is hyperthreaded, so there are effectively N = 240 (4*60) or N = +224 (4*56) cores to run on. +</P> +<P>The -np setting of the mpirun command sets the number of MPI +tasks/node. The "-k on t Nt" command-line switch sets the number of +threads/task as Nt. The product of these 2 values should be N, i.e. +240 or 224. Also, the number of threads/task should be a multiple of +4 so that logical threads from more than one MPI task do not run on +the same physical core. +</P> +<P>Examples of mpirun commands that follow these rules, for Intel Phi +nodes with 61 cores, are shown above. +</P> +<P><B>Examples and benchmarks:</B> +</P> +<P>The examples/kokkos and bench/KOKKOS directories have scripts that can +be run with the KOKKOS package, as well as detailed instructions on +how to run them. +</P> +<P>IMPORTANT NOTE: the bench/KOKKOS directory does not yet exist. It +will be added later. +</P> +<P><B>Additional performance issues:</B> +</P> +<P>When using threads (OpenMP or pthreads), it is important for +performance to bind the threads to physical cores, so they do not +migrate during a simulation. The same is true for MPI tasks, but the +default binding rules implemented for various MPI versions, do not +account for thread binding. +</P> +<P>Thus if you use more than one thread per MPI task, you should insure +MPI tasks are bound to CPU sockets. Furthermore, use thread affinity +environment variables from the OpenMP runtime when using OpenMP and +compile with hwloc support when using pthreads. With OpenMP 3.1 (gcc +4.7 or later, intel 12 or later) setting the environment variable +OMP_PROC_BIND=true should be sufficient. A typical mpirun command +should set these flags: +</P> +<PRE>OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ... +Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... +</PRE> +<P>When using a GPU, you will achieve the best performance if your input +script does not use any fix or compute styles which are not yet +Kokkos-enabled. This allows data to stay on the GPU for multiple +timesteps, without being copied back to the host CPU. Invoking a +non-Kokkos fix or compute, or performing I/O for +<A HREF = "thermo_style.html">thermo</A> or <A HREF = "dump.html">dump</A> output will cause data +to be copied back to the CPU. +</P> +<P>You cannot yet assign multiple MPI tasks to the same GPU with the +KOKKOS package. We plan to support this in the future, similar to the +GPU package in LAMMPS. +</P> +<P>You cannot yet use both the host (multi-threaded) and device (GPU) +together to compute pairwise interactions with the KOKKOS package. We +hope to support this in the future, similar to the GPU package in +LAMMPS. +</P> +<HR> + <HR> -<H4><A NAME = "acc_8"></A>5.8 Comparison of GPU and USER-CUDA packages +<H4><A NAME = "acc_9"></A>5.9 Comparison of GPU and USER-CUDA packages </H4> <P>Both the GPU and USER-CUDA packages accelerate a LAMMPS calculation using NVIDIA hardware, but they do it in different ways. </P> <P>As a consequence, for a particular simulation on specific hardware, one package may be faster than the other. 
We give guidelines below, but the best way to determine which package is faster for your input script is to try both of them on your machine. See the benchmarking section below for examples where this has been done. </P> <P><B>Guidelines for using each package optimally:</B> </P> <UL><LI>The GPU package allows you to assign multiple CPUs (cores) to a single GPU (a common configuration for "hybrid" nodes that contain multicore CPU(s) and GPU(s)) and works effectively in this mode. The USER-CUDA package does not allow this; you can only use one CPU per GPU. <LI>The GPU package moves per-atom data (coordinates, forces) back-and-forth between the CPU and GPU every timestep. The USER-CUDA package only does this on timesteps when a CPU calculation is required (e.g. to invoke a fix or compute that is non-GPU-ized). Hence, if you can formulate your input script to only use GPU-ized fixes and computes, and avoid doing I/O too often (thermo output, dump file snapshots, restart files), then the data transfer cost of the USER-CUDA package can be very low, causing it to run faster than the GPU package. <LI>The GPU package is often faster than the USER-CUDA package, if the number of atoms per GPU is "small". The crossover point, in terms of atoms/GPU at which the USER-CUDA package becomes faster depends strongly on the pair style. For example, for a simple Lennard Jones system the crossover (in single precision) is often about 50K-100K atoms per GPU. When performing double precision calculations the crossover point can be significantly smaller. <LI>Both packages compute bonded interactions (bonds, angles, etc) on the CPU. This means a model with bonds will force the USER-CUDA package to transfer per-atom data back-and-forth between the CPU and GPU every timestep. If the GPU package is running with several MPI processes assigned to one GPU, the cost of computing the bonded interactions is spread across more CPUs and hence the GPU package can run faster. <LI>When using the GPU package with multiple CPUs assigned to one GPU, its performance depends to some extent on high bandwidth between the CPUs and the GPU. Hence its performance is affected if full 16 PCIe lanes are not available for each GPU. In HPC environments this can be the case if S2050/70 servers are used, where two devices generally share one PCIe 2.0 16x slot. Also many multi-GPU mainboards do not provide full 16 lanes to each of the PCIe 2.0 16x slots. </UL> <P><B>Differences between the two packages:</B> </P> <UL><LI>The GPU package accelerates only pair force, neighbor list, and PPPM calculations. The USER-CUDA package currently supports a wider range of pair styles and can also accelerate many fix styles and some compute styles, as well as neighbor list and PPPM calculations. <LI>The USER-CUDA package does not support acceleration for minimization. <LI>The USER-CUDA package does not support hybrid pair styles. <LI>The USER-CUDA package can order atoms in the neighbor list differently from run to run resulting in a different order for force accumulation. <LI>The USER-CUDA package has a limit on the number of atom types that can be used in a simulation. <LI>The GPU package requires neighbor lists to be built on the CPU when using exclusion lists or a triclinic simulation box. <LI>The GPU package uses more GPU memory than the USER-CUDA package. This is generally not a problem since typical runs are computation-limited rather than memory-limited. 
</UL> <P><B>Examples:</B> </P> <P>The LAMMPS distribution has two directories with sample input scripts for the GPU and USER-CUDA packages. </P> <UL><LI>lammps/examples/gpu = GPU package files <LI>lammps/examples/USER/cuda = USER-CUDA package files </UL> <P>These contain input scripts for identical systems, so they can be used to benchmark the performance of both packages on your system. </P> </HTML> diff --git a/doc/Section_accelerate.txt b/doc/Section_accelerate.txt index 1e995d5c7..a4252cf17 100644 --- a/doc/Section_accelerate.txt +++ b/doc/Section_accelerate.txt @@ -1,723 +1,1057 @@ "Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc - "Next Section"_Section_howto.html :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line 5. Accelerating LAMMPS performance :h3 This section describes various methods for improving LAMMPS performance for different classes of problems running on different kinds of machines. 5.1 "Measuring performance"_#acc_1 5.2 "General strategies"_#acc_2 5.3 "Packages with optimized styles"_#acc_3 5.4 "OPT package"_#acc_4 5.5 "USER-OMP package"_#acc_5 5.6 "GPU package"_#acc_6 5.7 "USER-CUDA package"_#acc_7 -5.8 "Comparison of GPU and USER-CUDA packages"_#acc_8 :all(b) +5.8 "KOKKOS package"_#acc_8 +5.9 "Comparison of GPU and USER-CUDA packages"_#acc_9 :all(b) :line :line 5.1 Measuring performance :h4,link(acc_1) Before trying to make your simulation run faster, you should understand how it currently performs and where the bottlenecks are. The best way to do this is run the your system (actual number of atoms) for a modest number of timesteps (say 100, or a few 100 at most) on several different processor counts, including a single processor if possible. Do this for an equilibrium version of your system, so that the 100-step timings are representative of a much longer run. There is typically no need to run for 1000s or timesteps to get accurate timings; you can simply extrapolate from short runs. For the set of runs, look at the timing data printed to the screen and log file at the end of each LAMMPS run. "This section"_Section_start.html#start_8 of the manual has an overview. Running on one (or a few processors) should give a good estimate of the serial performance and what portions of the timestep are taking the most time. Running the same problem on a few different processor counts should give an estimate of parallel scalability. I.e. if the simulation runs 16x faster on 16 processors, its 100% parallel efficient; if it runs 8x faster on 16 processors, it's 50% efficient. The most important data to look at in the timing info is the timing breakdown and relative percentages. For example, trying different options for speeding up the long-range solvers will have little impact if they only consume 10% of the run time. If the pairwise time is dominating, you may want to look at GPU or OMP versions of the pair style, as discussed below. Comparing how the percentages change as you increase the processor count gives you a sense of how different operations within the timestep are scaling. Note that if you are running with a Kspace solver, there is additional output on the breakdown of the Kspace time. For PPPM, this includes the fraction spent on FFTs, which can be communication intensive. Another important detail in the timing info are the histograms of atoms counts and neighbor counts. If these vary widely across processors, you have a load-imbalance issue. 
This often results in inaccurate relative timing data, because processors have to wait when communication occurs for other processors to catch up. Thus the reported times for "Communication" or "Other" may be higher than they really are, due to load-imbalance. If this is an issue, you can uncomment the MPI_Barrier() lines in src/timer.cpp, and recompile LAMMPS, to obtain synchronized timings. :line 5.2 General strategies :h4,link(acc_2) NOTE: this sub-section is still a work in progress Here is a list of general ideas for improving simulation performance. Most of them are only applicable to certain models and certain bottlenecks in the current performance, so let the timing data you generate be your guide. It is hard, if not impossible, to predict how much difference these options will make, since it is a function of problem size, number of processors used, and your machine. There is no substitute for identifying performance bottlenecks, and trying out various options. rRESPA 2-FFT PPPM Staggered PPPM single vs double PPPM partial charge PPPM verlet/split processor mapping via processors numa command load-balancing: balance and fix balance processor command for layout OMP when lots of cores :ul 2-FFT PPPM, also called {analytic differentiation} or {ad} PPPM, uses 2 FFTs instead of the 4 FFTs used by the default {ik differentiation} PPPM. However, 2-FFT PPPM also requires a slightly larger mesh size to achieve the same accuracy as 4-FFT PPPM. For problems where the FFT cost is the performance bottleneck (typically large problems running on many processors), 2-FFT PPPM may be faster than 4-FFT PPPM. Staggered PPPM performs calculations using two different meshes, one shifted slightly with respect to the other. This can reduce force aliasing errors and increase the accuracy of the method, but also doubles the amount of work required. For high relative accuracy, using staggered PPPM allows one to half the mesh size in each dimension as compared to regular PPPM, which can give around a 4x speedup in the kspace time. However, for low relative accuracy, using staggered PPPM gives little benefit and can be up to 2x slower in the kspace time. For example, the rhodopsin benchmark was run on a single processor, and results for kspace time vs. relative accuracy for the different methods are shown in the figure below. For this system, staggered PPPM (using ik differentiation) becomes useful when using a relative accuracy of slightly greater than 1e-5 and above. :c,image(JPG/rhodo_staggered.jpg) IMPORTANT NOTE: Using staggered PPPM may not give the same increase in accuracy of energy and pressure as it does in forces, so some caution must be used if energy and/or pressure are quantities of interest, such as when using a barostat. :line 5.3 Packages with optimized styles :h4,link(acc_3) Accelerated versions of various "pair_style"_pair_style.html, "fixes"_fix.html, "computes"_compute.html, and other commands have been added to LAMMPS, which will typically run faster than the standard non-accelerated versions, if you have the appropriate hardware on your system. The accelerated styles have the same name as the standard styles, except that a suffix is appended. Otherwise, the syntax for the command is identical, their functionality is the same, and the numerical results it produces should also be identical, except for precision and round-off issues. 
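+
+In practice, switching to an accelerated style is a one-word change in
+the input script. As a minimal sketch (assuming LAMMPS was built with
+the GPU package, which also needs the additional setup commands
+described in the GPU section below), the two commands take identical
+arguments:
+
+pair_style lj/cut 2.5          # standard CPU style
+pair_style lj/cut/gpu 2.5      # accelerated variant, same arguments :pre
+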
-For example, all of these variants of the basic Lennard-Jones pair -style exist in LAMMPS: +For example, all of these styles are variants of the basic +Lennard-Jones pair style "pair_style lj/cut"_pair_lj.html: -"pair_style lj/cut"_pair_lj.html -"pair_style lj/cut/opt"_pair_lj.html -"pair_style lj/cut/omp"_pair_lj.html +"pair_style lj/cut/cuda"_pair_lj.html "pair_style lj/cut/gpu"_pair_lj.html -"pair_style lj/cut/cuda"_pair_lj.html :ul +"pair_style lj/cut/kk"_pair_lj.html +"pair_style lj/cut/omp"_pair_lj.html +"pair_style lj/cut/opt"_pair_lj.html :ul Assuming you have built LAMMPS with the appropriate package, these styles can be invoked by specifying them explicitly in your input script. Or you can use the "-suffix command-line switch"_Section_start.html#start_7 to invoke the accelerated versions automatically, without changing your input script. The "suffix"_suffix.html command allows you to set a suffix explicitly and -to turn off/on the comand-line switch setting, both from within your -input script. +to turn off and back on the command-line switch setting, both from +within your input script. -Styles with an "opt" suffix are part of the OPT package and typically -speed-up the pairwise calculations of your simulation by 5-25%. +Styles with a "cuda" or "gpu" suffix are part of the USER-CUDA or GPU +packages, and can be run on NVIDIA GPUs associated with your CPUs. +The speed-up due to GPU usage depends on a variety of factors, as +discussed below. + +Styles with a "kk" suffix are part of the KOKKOS package, and can be +run using OpenMP, pthreads, or on an NVIDIA GPU. The speed-up depends +on a variety of factors, as discussed below. Styles with an "omp" suffix are part of the USER-OMP package and allow a pair-style to be run in multi-threaded mode using OpenMP. This can be useful on nodes with high-core counts when using less MPI processes than cores is advantageous, e.g. when running with PPPM so that FFTs are run on fewer MPI processors or when the many MPI tasks would overload the available bandwidth for communication. -Styles with a "gpu" or "cuda" suffix are part of the GPU or USER-CUDA -packages, and can be run on NVIDIA GPUs associated with your CPUs. -The speed-up due to GPU usage depends on a variety of factors, as -discussed below. +Styles with an "opt" suffix are part of the OPT package and typically +speed-up the pairwise calculations of your simulation by 5-25%. To see what styles are currently available in each of the accelerated packages, see "Section_commands 5"_Section_commands.html#cmd_5 of the manual. A list of accelerated styles is included in the pair, fix, -compute, and kspace sections. +compute, and kspace sections. The doc page for each individual style +(e.g. "pair lj/cut"_pair_lj.html or "fix nve"_fix_nve.html) will also +list any accelerated variants available for that style. The following sections explain: what hardware and software the accelerated styles require -how to build LAMMPS with the accelerated packages in place +how to build LAMMPS with the accelerated package in place what changes (if any) are needed in your input scripts guidelines for best performance speed-ups you can expect :ul The final section compares and contrasts the GPU and USER-CUDA -packages, since they are both designed to use NVIDIA GPU hardware. +packages, since they are both designed to use NVIDIA hardware. :line 5.4 OPT package :h4,link(acc_4) The OPT package was developed by James Fischer (High Performance Technologies), David Richie, and Vincent Natoli (Stone Ridge Technologies).
It contains a handful of pair styles whose compute() methods were rewritten in C++ templated form to reduce the overhead due to if tests and other conditional code. The procedure for building LAMMPS with the OPT package is simple. It is the same as for any other package which has no additional library dependencies: make yes-opt make machine :pre -If your input script uses one of the OPT pair styles, -you can run it as follows: +If your input script uses one of the OPT pair styles, you can run it +as follows: lmp_machine -sf opt -in in.script mpirun -np 4 lmp_machine -sf opt -in in.script :pre You should see a reduction in the "Pair time" printed out at the end of the run. On most machines and problems, this will typically be a 5 to 20% savings. :line 5.5 USER-OMP package :h4,link(acc_5) -The USER-OMP package was developed by Axel Kohlmeyer at Temple University. -It provides multi-threaded versions of most pair styles, all dihedral -styles and a few fixes in LAMMPS. The package currently uses the OpenMP -interface which requires using a specific compiler flag in the makefile -to enable multiple threads; without this flag the corresponding pair -styles will still be compiled and work, but do not support multi-threading. +The USER-OMP package was developed by Axel Kohlmeyer at Temple +University. It provides multi-threaded versions of most pair styles, +all dihedral styles, and a few fixes in LAMMPS. The package currently +uses the OpenMP interface which requires using a specific compiler +flag in the makefile to enable multiple threads; without this flag the +corresponding pair styles will still be compiled and work, but do not +support multi-threading. [Building LAMMPS with the USER-OMP package:] The procedure for building LAMMPS with the USER-OMP package is simple. You have to edit your machine specific makefile to add the flag to enable OpenMP support to the CCFLAGS and LINKFLAGS variables. For the GNU compilers for example this flag is called {-fopenmp}. Check your compiler documentation to find out which flag you need to add. The rest of the compilation is the same as for any other package which has no additional library dependencies: make yes-user-omp make machine :pre Please note that this will only install accelerated versions of styles that are already installed, so you want to install this package as the last package, or else you may be missing some accelerated styles. If you plan to uninstall some package, you should first uninstall the USER-OMP package then the other package and then re-install USER-OMP, to make sure that there are no orphaned {omp} style files present, which would lead to compilation errors. If your input script uses one of regular styles that are also exist as an OpenMP version in the USER-OMP package you can run it as follows: env OMP_NUM_THREADS=4 lmp_serial -sf omp -in in.script env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script :pre The value of the environment variable OMP_NUM_THREADS determines how -many threads per MPI task are launched. All three examples above use -a total of 4 CPU cores. For different MPI implementations the method -to pass the OMP_NUM_THREADS environment variable to all processes is -different. Two different variants, one for MPICH and OpenMPI, respectively -are shown above. Please check the documentation of your MPI installation -for additional details. 
Alternatively, the value provided by OMP_NUM_THREADS -can be overridded with the "package omp"_package.html command. -Depending on which styles are accelerated in your input, you should -see a reduction in the "Pair time" and/or "Bond time" and "Loop time" -printed out at the end of the run. The optimal ratio of MPI to OpenMP -can vary a lot and should always be confirmed through some benchmark -runs for the current system and on the current machine. +many threads per MPI task are launched. All three examples above use a +total of 4 CPU cores. For different MPI implementations the method to +pass the OMP_NUM_THREADS environment variable to all processes is +different. Two different variants, one for MPICH and OpenMPI, +respectively are shown above. Please check the documentation of your +MPI installation for additional details. Alternatively, the value +provided by OMP_NUM_THREADS can be overridded with the "package +omp"_package.html command. Depending on which styles are accelerated +in your input, you should see a reduction in the "Pair time" and/or +"Bond time" and "Loop time" printed out at the end of the run. The +optimal ratio of MPI to OpenMP can vary a lot and should always be +confirmed through some benchmark runs for the current system and on +the current machine. [Restrictions:] Only a few of the pair styles in the USER-OMP package support the "inner", "middle", "outer" options for r-RESPA integration, even if the regular version does. For those only the "pair" option is supported. When using styles from the GPU package, they can only be used on the outermost RESPA level. [Parallel efficiency and performance tips:] In most simple cases the MPI parallelization in LAMMPS is more efficient than multi-threading implemented in the USER-OMP package. Also the parallel efficiency varies between individual styles. On the other hand, in many cases you still want to use the {omp} version - even when compiling or running without OpenMP support - since they all contain optimizations similar to those in the OPT package, which can result in serial speedup. -Using multi-threading is most effective under the following circumstances: +Using multi-threading is most effective under the following +circumstances: -Individual compute nodes have a significant number of CPU cores -but the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx -(Clovertown) and 54xx (Harpertown) quad core processors. Running -one MPI task per CPU core will result in significant performance -degradation, so that running with 4 or even only 2 MPI tasks per -nodes is faster. Running in hybrid MPI+OpenMP mode will reduce the -inter-node communication bandwidth contention in the same way, -but offers and additional speedup from utilizing the otherwise -idle CPU cores. :ulb,l +Individual compute nodes have a significant number of CPU cores but +the CPU itself has limited memory bandwidth, e.g. Intel Xeon 53xx +(Clovertown) and 54xx (Harpertown) quad core processors. Running one +MPI task per CPU core will result in significant performance +degradation, so that running with 4 or even only 2 MPI tasks per nodes +is faster. Running in hybrid MPI+OpenMP mode will reduce the +inter-node communication bandwidth contention in the same way, but +offers and additional speedup from utilizing the otherwise idle CPU +cores. :ulb,l The interconnect used for MPI communication is not able to provide -sufficient bandwidth for a large number of MPI tasks per node. 
-This applies for example to running over gigabit ethernet or -on Cray XT4 or XT5 series supercomputers. Same as in the aforementioned -case this effect worsens with using an increasing number of nodes. :l - -The input is a system that has an inhomogeneous particle density -which cannot be mapped well to the domain decomposition scheme -that LAMMPS employs. While this can be to some degree alleviated -through using the "processors"_processors.html keyword, multi-threading -provides a parallelism that parallelizes over the number of particles -not their distribution in space. :l +sufficient bandwidth for a large number of MPI tasks per node. This +applies for example to running over gigabit ethernet or on Cray XT4 or +XT5 series supercomputers. Same as in the aforementioned case this +effect worsens with using an increasing number of nodes. :l + +The input is a system that has an inhomogeneous particle density which +cannot be mapped well to the domain decomposition scheme that LAMMPS +employs. While this can be to some degree alleviated through using the +"processors"_processors.html keyword, multi-threading provides a +parallelism that parallelizes over the number of particles not their +distribution in space. :l Finally, multi-threaded styles can improve performance when running LAMMPS in "capability mode", i.e. near the point where the MPI -parallelism scales out. This can happen in particular when using -as kspace style for long-range electrostatics. Here the scaling -of the kspace style is the performance limiting factor and using -multi-threaded styles allows to operate the kspace style at the -limit of scaling and then increase performance parallelizing -the real space calculations with hybrid MPI+OpenMP. Sometimes -additional speedup can be achived by increasing the real-space -coulomb cutoff and thus reducing the work in the kspace part. :l,ule - -The best parallel efficiency from {omp} styles is typically -achieved when there is at least one MPI task per physical -processor, i.e. socket or die. +parallelism scales out. This can happen in particular when using as +kspace style for long-range electrostatics. Here the scaling of the +kspace style is the performance limiting factor and using +multi-threaded styles allows to operate the kspace style at the limit +of scaling and then increase performance parallelizing the real space +calculations with hybrid MPI+OpenMP. Sometimes additional speedup can +be achived by increasing the real-space coulomb cutoff and thus +reducing the work in the kspace part. :l,ule + +The best parallel efficiency from {omp} styles is typically achieved +when there is at least one MPI task per physical processor, +i.e. socket or die. Using threads on hyper-threading enabled cores is usually counterproductive, as the cost in additional memory bandwidth -requirements is not offset by the gain in CPU utilization -through hyper-threading. +requirements is not offset by the gain in CPU utilization through +hyper-threading. A description of the multi-threading strategy and some performance -examples are "presented here"_http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1 +examples are "presented +here"_http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1 :line 5.6 GPU package :h4,link(acc_6) The GPU package was developed by Mike Brown at ORNL and his collaborators. 
It provides GPU versions of several pair styles, including the 3-body Stillinger-Weber pair style, and for long-range Coulombics via the PPPM command. It has the following features: The package is designed to exploit common GPU hardware configurations where one or more GPUs are coupled with many cores of a multi-core CPUs, e.g. within a node of a parallel machine. :ulb,l Atom-based data (e.g. coordinates, forces) moves back-and-forth between the CPU(s) and GPU every timestep. :l Neighbor lists can be constructed on the CPU or on the GPU :l The charge assignement and force interpolation portions of PPPM can be run on the GPU. The FFT portion, which requires MPI communication between processors, runs on the CPU. :l Asynchronous force computations can be performed simultaneously on the CPU(s) and GPU. :l +It allows for GPU computations to be performed in single or double +precision, or in mixed-mode precision, where pairwise forces are +computed in single precision, but accumulated into double-precision +force vectors. :l + LAMMPS-specific code is in the GPU package. It makes calls to a generic GPU library in the lib/gpu directory. This library provides NVIDIA support as well as more general OpenCL support, so that the same functionality can eventually be supported on a variety of GPU hardware. :l,ule - - -NOTE: - discuss 3 precisions - if change, also have to re-link with LAMMPS - always use newton off - expt with differing numbers of CPUs vs GPU - can't tell what is fastest - give command line switches in examples - - -I am not very clear to the meaning of "Max Mem / Proc" -in the "GPU Time Info (average)". -Is it the maximal of GPU memory used by one CPU core? - -It is the maximum memory used at one time on the GPU for data storage by -a single MPI process. - Mike - - [Hardware and software requirements:] -To use this package, you currently need to have specific NVIDIA -hardware and install specific NVIDIA CUDA software on your system: +To use this package, you currently need to have an NVIDIA GPU and +install the NVIDIA Cuda software on your system: -Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 +Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/cards/0 Go to http://www.nvidia.com/object/cuda_get.html Install a driver and toolkit appropriate for your system (SDK is not necessary) Follow the instructions in lammps/lib/gpu/README to build the library (see below) Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties :ul [Building LAMMPS with the GPU package:] As with other packages that include a separately compiled library, you need to first build the GPU library, before building LAMMPS itself. General instructions for doing this are in "this section"_Section_start.html#start_3 of the manual. For this package, -do the following, using a Makefile in lib/gpu appropriate for your -system: +use a Makefile in lib/gpu appropriate for your system. + +Before building the library, you can set the precision it will use by +editing the CUDA_PREC setting in the Makefile you are using, as +follows: + +CUDA_PREC = -D_SINGLE_SINGLE # Single precision for all calculations +CUDA_PREC = -D_DOUBLE_DOUBLE # Double precision for all calculations +CUDA_PREC = -D_SINGLE_DOUBLE # Accumulation of forces, etc, in double :pre + +The last setting is the mixed mode referred to above. Note that your +GPU must support double precision to use either the 2nd or 3rd of +these settings.
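+
+If you change the precision after an initial build, the library must
+be rebuilt from scratch and LAMMPS re-linked against it, as noted
+further below. A sketch of that sequence (assuming the Makefile.linux
+machine file and a LAMMPS build target named "machine"; the initial
+build itself is described next):
+
+cd lammps/lib/gpu
+# edit the Makefile so that CUDA_PREC = -D_SINGLE_DOUBLE (mixed precision)
+make -f Makefile.linux clean
+make -f Makefile.linux
+cd ../../src
+make machine           # re-links LAMMPS against the new libgpu.a :pre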
+ +To build the library, then type: cd lammps/lib/gpu make -f Makefile.linux (see further instructions in lammps/lib/gpu/README) :pre If you are successful, you will produce the file lib/libgpu.a. Now you are ready to build LAMMPS with the GPU package installed: cd lammps/src make yes-gpu make machine :pre Note that the lo-level Makefile (e.g. src/MAKE/Makefile.linux) has these settings: gpu_SYSINC, gpu_SYSLIB, gpu_SYSPATH. These need to be set appropriately to include the paths and settings for the CUDA system software on your machine. See src/MAKE/Makefile.g++ for an example. -[GPU configuration] +Also note that if you change the GPU library precision, you need to +re-build the entire library. You should do a "clean" first, +e.g. "make -f Makefile.linux clean". Then you must also re-build +LAMMPS if the library precision has changed, so that it re-links with +the new library. -When using GPUs, you are restricted to one physical GPU per LAMMPS -process, which is an MPI process running on a single core or -processor. Multiple MPI processes (CPU cores) can share a single GPU, -and in many cases it will be more efficient to run this way. +[Running an input script:] -[Input script requirements:] +The examples/gpu and bench/GPU directories have scripts that can be +run with the GPU package, as well as detailed instructions on how to +run them. -Additional input script requirements to run pair or PPPM styles with a -{gpu} suffix are as follows: +The total number of MPI tasks used by LAMMPS (one or multiple per +compute node) is set in the usual manner via the mpirun or mpiexec +commands, and is independent of the GPU package. -To invoke specific styles from the GPU package, you can either append -"gpu" to the style name (e.g. pair_style lj/cut/gpu), or use the -"-suffix command-line switch"_Section_start.html#start_7, or use the -"suffix"_suffix.html command. :ulb,l +When using the GPU package, you cannot assign more than one physical +GPU to an MPI task. However multiple MPI tasks can share the same +GPU, and in many cases it will be more efficient to run this way. -The "newton pair"_newton.html setting must be {off}. :l - -The "package gpu"_package.html command must be used near the beginning -of your script to control the GPU selection and initialization -settings. It also has an option to enable asynchronous splitting of -force computations between the CPUs and GPUs. :l,ule - -As an example, if you have two GPUs per node and 8 CPU cores per node, -and would like to run on 4 nodes (32 cores) with dynamic balancing of -force calculation across CPU and GPU cores, you could specify - -package gpu force/neigh 0 1 -1 :pre +Input script requirements to run using pair or PPPM styles with a +{gpu} suffix are as follows: -In this case, all CPU cores and GPU devices on the nodes would be -utilized. Each GPU device would be shared by 4 CPU cores. The CPU -cores would perform force calculations for some fraction of the -particles at the same time the GPUs performed force calculation for -the other particles. +To invoke specific styles from the GPU package, either append "gpu" to +the style name (e.g. pair_style lj/cut/gpu), or use the "-suffix +command-line switch"_Section_start.html#start_7, or use the +"suffix"_suffix.html command in the input script. :ulb,l + +The "newton pair"_newton.html setting in the input script must be +{off}. 
:l + +Unless the "-suffix gpu command-line +switch"_Section_start.html#start_7 is used, the "package +gpu"_package.html command must be used near the beginning of the +script to control the GPU selection and initialization settings. It +also has an option to enable asynchronous splitting of force +computations between the CPUs and GPUs. :l,ule + +The default for the "package gpu"_package.html command is to have all +the MPI tasks on the compute node use a single GPU. If you have +multiple GPUs per node, then be sure to create one or more MPI tasks +per GPU, and use the first/last settings in the "package +gpu"_package.html command to include all the GPU IDs on the node. +E.g. first = 0, last = 1, for 2 GPUs. For example, on an 8-core 2-GPU +compute node, if you assign 8 MPI tasks to the node, the following +command in the input script + +package gpu force/neigh 0 1 -1 + +would specify that each GPU is shared by 4 MPI tasks. The final -1 will +dynamically balance force calculations across the CPU cores and GPUs. +I.e. each CPU core will perform force calculations for some small +fraction of the particles, at the same time the GPUs perform force +calculations for the majority of the particles. [Timing output:] As described by the "package gpu"_package.html command, GPU accelerated pair styles can perform computations asynchronously with CPU computations. The "Pair" time reported by LAMMPS will be the maximum of the time required to complete the CPU pair style computations and the time required to complete the GPU pair style computations. Any time spent for GPU-enabled pair styles for computations that run simultaneously with "bond"_bond_style.html, "angle"_angle_style.html, "dihedral"_dihedral_style.html, "improper"_improper_style.html, and "long-range"_kspace_style.html calculations will not be included in the "Pair" time. When the {mode} setting for the package gpu command is force/neigh, the time for neighbor list calculations on the GPU will be added into the "Pair" time, not the "Neigh" time. An additional breakdown of the times required for various tasks on the GPU (data copy, neighbor calculations, force computations, etc) are output only with the LAMMPS screen output (not in the log file) at the end of each run. These timings represent total time spent on the GPU for each routine, regardless of asynchronous CPU calculations. +The output section "GPU Time Info (average)" reports "Max Mem / Proc". +This is the maximum memory used at one time on the GPU for data +storage by a single MPI process. + [Performance tips:] -Generally speaking, for best performance, you should use multiple CPUs -per GPU, as provided my most multi-core CPU/GPU configurations. +You should experiment with how many MPI tasks per GPU to use to see +what gives the best performance for your problem. This is a function +of your problem size and what pair style you are using. Likewise, you +should also experiment with the precision setting for the GPU library +to see if single or mixed precision will give accurate results, since +they will typically be faster. + +Using multiple MPI tasks per GPU will often give the best performance, +as allowed by most multi-core CPU/GPU configurations. -Because of the large number of cores within each GPU device, it may be -more efficient to run on fewer processes per GPU when the number of -particles per MPI process is small (100's of particles); this can be -necessary to keep the GPU cores busy. +If the number of particles per MPI task is small (e.g.
100s of +particles), it can be more efficient to run with fewer MPI tasks per +GPU, even if you do not use all the cores on the compute node. -See the lammps/lib/gpu/README file for instructions on how to build -the GPU library for single, mixed, or double precision. The latter -requires that your GPU card support double precision. +The "Benchmark page"_http://lammps.sandia.gov/bench.html of the LAMMPS +web site gives GPU performance on a desktop machine and the Titan HPC +platform at ORNL for several of the LAMMPS benchmarks, as a function +of problem size and number of compute nodes. :line 5.7 USER-CUDA package :h4,link(acc_7) The USER-CUDA package was developed by Christian Trott at U Technology Ilmenau in Germany. It provides NVIDIA GPU versions of many pair styles, many fixes, a few computes, and for long-range Coulombics via the PPPM command. It has the following features: The package is designed to allow an entire LAMMPS calculation, for many timesteps, to run entirely on the GPU (except for inter-processor MPI communication), so that atom-based data (e.g. coordinates, forces) do not have to move back-and-forth between the CPU and GPU. :ulb,l The speed-up advantage of this approach is typically better when the number of atoms per GPU is large :l Data will stay on the GPU until a timestep where a non-GPU-ized fix or compute is invoked. Whenever a non-GPU operation occurs (fix, compute, output), data automatically moves back to the CPU as needed. This may incur a performance penalty, but should otherwise work transparently. :l Neighbor lists for GPU-ized pair styles are constructed on the GPU. :l The package only supports use of a single CPU (core) with each GPU. :l,ule [Hardware and software requirements:] To use this package, you need to have specific NVIDIA hardware and install specific NVIDIA CUDA software on your system. Your NVIDIA GPU needs to support Compute Capability 1.3. This list may help you to find out the Compute Capability of your card: http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units Install the Nvidia Cuda Toolkit in version 3.2 or higher and the corresponding GPU drivers. The Nvidia Cuda SDK is not required for LAMMPSCUDA but we recommend it be installed. You can then make sure that its sample projects can be compiled without problems. [Building LAMMPS with the USER-CUDA package:] As with other packages that include a separately compiled library, you need to first build the USER-CUDA library, before building LAMMPS itself. General instructions for doing this are in "this section"_Section_start.html#start_3 of the manual. For this package, do the following, using settings in the lib/cuda Makefiles appropriate for your system: Go to the lammps/lib/cuda directory :ulb,l If your {CUDA} toolkit is not installed in the default system directoy {/usr/local/cuda} edit the file {lib/cuda/Makefile.common} accordingly. :l Type "make OPTIONS", where {OPTIONS} are one or more of the following options. The settings will be written to the {lib/cuda/Makefile.defaults} and used in the next step. :l {precision=N} to set the precision level N = 1 for single precision (default) N = 2 for double precision N = 3 for positions in double precision N = 4 for positions and velocities in double precision {arch=M} to set GPU compute capability M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default) M = 21 for CC2.1 (GF104/114, e.g. GTX560, GTX460, GTX450) M = 13 for CC1.3 (GF200, e.g.
C1060, GTX285) {prec_timer=0/1} to use hi-precision timers 0 = do not use them (default) 1 = use these timers this is usually only useful for Mac machines {dbg=0/1} to activate debug mode 0 = no debug mode (default) 1 = yes debug mode this is only useful for developers {cufft=1} to determine usage of CUDA FFT library 0 = no CUFFT support (default) in the future other CUDA-enabled FFT libraries might be supported :pre Type "make" to build the library. If you are successful, you will produce the file lib/libcuda.a. :l,ule Now you are ready to build LAMMPS with the USER-CUDA package installed: cd lammps/src make yes-user-cuda make machine :pre Note that the LAMMPS build references the lib/cuda/Makefile.common file to extract CUDA-specific settings. So it is important that you have first built the cuda library (in lib/cuda) using settings appropriate to your system. [Input script requirements:] Additional input script requirements to run styles with a {cuda} suffix are as follows: To invoke specific styles from the USER-CUDA package, you can either append "cuda" to the style name (e.g. pair_style lj/cut/cuda), or use the "-suffix command-line switch"_Section_start.html#start_7, or use the "suffix"_suffix.html command. One exception is that the "kspace_style pppm/cuda"_kspace_style.html command has to be requested explicitly. :ulb,l To use the USER-CUDA package with its default settings, no additional command is needed in your input script. This is because when LAMMPS starts up, it detects if it has been built with the USER-CUDA package. See the "-cuda command-line switch"_Section_start.html#start_7 for more details. :l To change settings for the USER-CUDA package at run-time, the "package cuda"_package.html command can be used near the beginning of your input script. See the "package"_package.html command doc page for details. :l,ule [Performance tips:] The USER-CUDA package offers more speed-up relative to CPU performance when the number of atoms per GPU is large, e.g. on the order of tens or hundreds of 1000s. As noted above, this package will continue to run a simulation entirely on the GPU(s) (except for inter-processor MPI communication), for multiple timesteps, until a CPU calculation is required, either by a fix or compute that is non-GPU-ized, or until output is performed (thermo or dump snapshot or restart file). The less often this occurs, the faster your simulation will run. +:line + +5.8 KOKKOS package :h4,link(acc_8) + +The KOKKOS package contains versions of pair, fix, and atom styles +that use data structures, methods, and macros provided by the Kokkos +library, which is included with LAMMPS in lib/kokkos. + +"Kokkos"_http://trilinos.sandia.gov/packages/kokkos is a C++ library +that provides two key abstractions for an application like LAMMPS. +First, it allows a single implementation of an application kernel +(e.g. a pair style) to run efficiently on different kinds of hardware +(GPU, Intel Phi, many-core chip). + +Second, it provides data abstractions to adjust (at compile time) the +memory layout of basic data structures like 2d and 3d arrays and allow +the transparent utilization of special hardware load and store units. +Such data structures are used in LAMMPS to store atom coordinates or +forces or neighbor lists. The layout is chosen to optimize +performance on different platforms. Again, this operation is hidden +from the developer, and does not affect how the single implementation +of the kernel is coded.
+ +These abstractions are set at build time, when LAMMPS is compiled with +the KOKKOS package installed. This is done by selecting a "host" and +"device" to build for, compatible with the compute nodes in your +machine. Note that if you are running on a desktop machine, you +typically have one compute node. On a cluster or supercomputer there +may be dozens or 1000s of compute nodes. The procedure for building +and running with the Kokkos library is the same, no matter how many +nodes you run on. + +All Kokkos operations occur within the context of an individual MPI +task running on a single node of the machine. The total number of MPI +tasks used by LAMMPS (one or multiple per compute node) is set in the +usual manner via the mpirun or mpiexec commands, and is independent of +Kokkos. + +Kokkos provides support for one or two modes of execution per MPI +task. This means that some computational tasks (pairwise +interactions, neighbor list builds, time integration, etc) are +parallelized in one or the other of the two modes. The first mode is +called the "host" and is one or more threads running on one or more +physical CPUs (within the node). Currently, both multi-core CPUs and +an Intel Phi processor (running in native mode) are supported. The +second mode is called the "device" and is an accelerator chip of some +kind. Currently only an NVIDIA GPU is supported. If your compute +node does not have a GPU, then there is only one mode of execution, +i.e. the host and device are the same. + +IMPORTANT NOTE: Currently, if using GPUs, you should set the number +of MPI tasks per compute node to be equal to the number of GPUs per +compute node. In the future Kokkos will support assigning one GPU to +multiple MPI tasks or using multiple GPUs per MPI task. Currently +Kokkos does not support AMD GPUs due to limits in the available +backend programming models (in particular, relatively extensive C++ +support is required for the kernel language). This is expected to +change in the future. + +Here are several examples of how to build LAMMPS and run a simulation +using the KOKKOS package for typical compute node configurations. +Note that the -np setting for the mpirun command in these examples is +for a run on a single node. To scale these examples up to run on a +system with N compute nodes, simply multiply the -np setting by N; a +scaled-up example is shown below. + +All the build steps are performed from within the src directory. All +the run steps are performed in the bench directory using the in.lj +input script. It is assumed the LAMMPS executable has been copied to +that directory or whatever directory the runs are being performed in. +Details of the various options are discussed below.
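+ +For example (an illustrative sketch only; the exact mpirun options +for a multi-node run depend on your cluster and batch system), copying +a build that produces the lmp_g++ executable used below into the bench +directory, and then scaling the two-task, six-threads-per-task run +from 1 compute node up to 16 compute nodes, might look like this: + +cp lmp_g++ ../bench +cd ../bench +mpirun -np 2 lmp_g++ -k on t 6 -sf kk < in.lj    # 1 node, 2 MPI tasks/node +mpirun -np 32 lmp_g++ -k on t 6 -sf kk < in.lj   # 16 nodes, 2 MPI tasks/node :pre +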
+ +[Compute node(s) = dual hex-core CPUs and no GPU:] + +make yes-kokkos # install the KOKKOS package +make g++ OMP=yes # build with OpenMP, no CUDA :pre + +mpirun -np 12 lmp_g++ -k off < in.lj # MPI-only mode with no Kokkos +mpirun -np 12 lmp_g++ -sf kk < in.lj # MPI-only mode with Kokkos +mpirun -np 1 lmp_g++ -k on t 12 -sf kk < in.lj # one MPI task, 12 threads +mpirun -np 2 lmp_g++ -k on t 6 -sf kk < in.lj # two MPI tasks, 6 threads/task :pre + +[Compute node(s) = Intel Phi with 61 cores:] + +make yes-kokkos +make g++ OMP=yes MIC=yes # build with OpenMP for Phi :pre + +mpirun -np 12 lmp_g++ -k on t 20 -sf kk < in.lj # 12*20 = 240 total cores +mpirun -np 15 lmp_g++ -k on t 16 -sf kk < in.lj +mpirun -np 30 lmp_g++ -k on t 8 -sf kk < in.lj +mpirun -np 1 lmp_g++ -k on t 240 -sf kk < in.lj :pre + +[Compute node(s) = dual hex-core CPUs and a single GPU:] + +make yes-kokkos +make cuda CUDA=yes # build for GPU, use src/MAKE/Makefile.cuda :pre + +mpirun -np 1 lmp_cuda -k on t 6 -sf kk < in.lj :pre + +[Compute node(s) = dual 8-core CPUs and 2 GPUs:] + +make yes-kokkos +make cuda CUDA=yes :pre + +mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk < in.lj # use both GPUs, one per MPI task :pre + +[Building LAMMPS with the KOKKOS package:] + +A summary of the build process is given here. More details and all +the available make variable options are given in "this +section"_Section_start.html#start_3_4 of the manual. + +From the src directory, type + +make yes-kokkos :pre + +to include the KOKKOS package. Then perform a normal LAMMPS build, +with additional make variable specifications to choose the host and +device you will run the resulting executable on, e.g. + +make g++ OMP=yes +make cuda CUDA=yes :pre + +As illustrated above, the most important variables to set are OMP, +CUDA, and MIC. The default settings are OMP=yes, CUDA=no, MIC=no. +Setting OMP to {yes} will use OpenMP for threading on the host, as +well as on the device (if no GPU is present). Setting CUDA to {yes} +will use one or more GPUs as the device. Setting MIC=yes is necessary +when building for an Intel Phi processor. + +Note that to use a GPU, you must use a low-level Makefile, +e.g. src/MAKE/Makefile.cuda as included in the LAMMPS distro, which +uses the NVIDIA "nvcc" compiler. You must check that the CCFLAGS -arch +setting is appropriate for your NVIDIA hardware and installed +software. Typical values for -arch are given in "this +section"_Section_start.html#start_3_4 of the manual, as well as other +settings that must be included in the low-level Makefile, if you create +your own. + +[Input scripts and use of command-line switches -kokkos and -suffix:] + +To use any Kokkos-enabled style provided in the KOKKOS package, you +must use a Kokkos-enabled atom style. LAMMPS will give an error if +you do not do this. + +There are two command-line switches relevant to using Kokkos, -k or +-kokkos, and -sf or -suffix. They are described in detail in "this +section"_Section_start.html#start_7 of the manual. + +Here are common options to use: + +-k off : runs an executable built with the KOKKOS package, as + if Kokkos were not installed. :ulb,l + +-sf kk : enables automatic use of Kokkos versions of atom, pair, +fix, compute styles if they exist. This can also be done with more +precise control by using the "suffix"_suffix.html command or appending +"kk" to styles within the input script, e.g. "pair_style lj/cut/kk". :l + +-k on t Nt : specifies how many threads per MPI task to use within a + compute node.
For good performance, the product of MPI tasks * + threads/task should not exceed the number of physical CPU or Intel + Phi cores. :l + +-k on g Ng : specifies how many GPUs per compute node are available. +The default is 1, so this should be specified if you have 2 or more +GPUs per compute node. :l,ule + +[Use of package command options:] + +Using the "package kokkos"_package.html command in an input script +allows choice of options for neighbor lists and communication. See +the "package"_package.html command doc page for details and default +settings. + +Experimenting with different styles of neighbor lists or inter-node +communication can provide a speed-up for specific calculations. + +[Running on a multi-core CPU:] + +Build with OMP=yes (the default) and CUDA=no (the default). + +If N is the number of physical cores/node, then the number of MPI +tasks/node * number of threads/task should not exceed N, and should +typically equal N. Note that the default threads/task is 1, as set by +the "t" keyword of the -k "command-line +switch"_Section_start.html#start_7. If you do not change this, no +additional parallelism (beyond MPI) will be invoked on the host +CPU(s). + +You can compare the performance running in different modes: + +run with 1 MPI task/node and N threads/task +run with N MPI tasks/node and 1 thread/task +run with settings in between these extremes :ul + +Examples of mpirun commands in these modes, for nodes with dual +hex-core CPUs and no GPU, are shown above. + +[Running on GPUs:] + +Build with CUDA=yes, using src/MAKE/Makefile.cuda. Insure the setting +for CUDA_PATH in lib/kokkos/Makefile.lammps is correct for your Cuda +software installation. Insure the -arch setting in +src/MAKE/Makefile.cuda is correct for your GPU hardware/software (see +"this section"_Section_start.html#start_3_4 of the manual for details). + +The -np setting of the mpirun command should set the number of MPI +tasks/node to be equal to the # of physical GPUs on the node. + +Use the "-kokkos command-line switch"_Section_start.html#start_7 to +specify the number of GPUs per node, and the number of threads per MPI +task. As above for multi-core CPUs (and no GPU), if N is the number +of physical cores/node, then the number of MPI tasks/node * number of +threads/task should not exceed N. With one GPU (and one MPI task) it +may be faster to use fewer than all the available cores, by setting +threads/task to a smaller value. This is because using all the cores +on a dual-socket node will incur extra cost to copy memory from the +2nd socket to the GPU. + +Examples of mpirun commands that follow these rules, for nodes with +dual hex-core CPUs and one or two GPUs, are shown above. + +[Running on an Intel Phi:] + +Kokkos only uses Intel Phi processors in their "native" mode, i.e. +not hosted by a CPU. + +Build with OMP=yes (the default) and MIC=yes. The latter +insures code is correctly compiled for the Intel Phi. The +OMP setting means OpenMP will be used for parallelization +on the Phi, which is currently the best option within +Kokkos. In the future, other options may be added. + +Current-generation Intel Phi chips have either 61 or 57 cores. One +core should be excluded to run the OS, leaving 60 or 56 cores. Each +core is hyperthreaded, so there are effectively N = 240 (4*60) or N = +224 (4*56) cores to run on. + +The -np setting of the mpirun command sets the number of MPI +tasks/node. The "-k on t Nt" command-line switch sets the number of +threads/task as Nt.
The product of these 2 values should be N, i.e. +240 or 224. Also, the number of threads/task should be a multiple of +4 so that logical threads from more than one MPI task do not run on +the same physical core. + +Examples of mpirun commands that follow these rules, for Intel Phi +nodes with 61 cores, are shown above. + +[Examples and benchmarks:] + +The examples/kokkos and bench/KOKKOS directories have scripts that can +be run with the KOKKOS package, as well as detailed instructions on +how to run them. + +IMPORTANT NOTE: The bench/KOKKOS directory does not yet exist. It +will be added later. + +[Additional performance issues:] + +When using threads (OpenMP or pthreads), it is important for +performance to bind the threads to physical cores, so they do not +migrate during a simulation. The same is true for MPI tasks, but the +default binding rules implemented for various MPI versions do not +account for thread binding. + +Thus if you use more than one thread per MPI task, you should insure +MPI tasks are bound to CPU sockets. Furthermore, use thread affinity +environment variables from the OpenMP runtime when using OpenMP and +compile with hwloc support when using pthreads. With OpenMP 3.1 (gcc +4.7 or later, Intel 12 or later) setting the environment variable +OMP_PROC_BIND=true should be sufficient. A typical mpirun command +should set these flags: + +OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ... +Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... :pre + +When using a GPU, you will achieve the best performance if your input +script does not use any fix or compute styles which are not yet +Kokkos-enabled. This allows data to stay on the GPU for multiple +timesteps, without being copied back to the host CPU. Invoking a +non-Kokkos fix or compute, or performing I/O for +"thermo"_thermo_style.html or "dump"_dump.html output will cause data +to be copied back to the CPU. + +You cannot yet assign multiple MPI tasks to the same GPU with the +KOKKOS package. We plan to support this in the future, similar to the +GPU package in LAMMPS. + +You cannot yet use both the host (multi-threaded) and device (GPU) +together to compute pairwise interactions with the KOKKOS package. We +hope to support this in the future, similar to the GPU package in +LAMMPS. + :line :line -5.8 Comparison of GPU and USER-CUDA packages :h4,link(acc_8) +5.9 Comparison of GPU and USER-CUDA packages :h4,link(acc_9) Both the GPU and USER-CUDA packages accelerate a LAMMPS calculation using NVIDIA hardware, but they do it in different ways. As a consequence, for a particular simulation on specific hardware, one package may be faster than the other. We give guidelines below, but the best way to determine which package is faster for your input script is to try both of them on your machine. See the benchmarking section below for examples where this has been done. [Guidelines for using each package optimally:] The GPU package allows you to assign multiple CPUs (cores) to a single GPU (a common configuration for "hybrid" nodes that contain multicore CPU(s) and GPU(s)) and works effectively in this mode. The USER-CUDA package does not allow this; you can only use one CPU per GPU. :ulb,l The GPU package moves per-atom data (coordinates, forces) back-and-forth between the CPU and GPU every timestep. The USER-CUDA package only does this on timesteps when a CPU calculation is required (e.g. to invoke a fix or compute that is non-GPU-ized).
Hence, if you can formulate your input script to only use GPU-ized fixes and computes, and avoid doing I/O too often (thermo output, dump file snapshots, restart files), then the data transfer cost of the USER-CUDA package can be very low, causing it to run faster than the GPU package. :l The GPU package is often faster than the USER-CUDA package, if the number of atoms per GPU is "small". The crossover point, in terms of atoms/GPU at which the USER-CUDA package becomes faster depends strongly on the pair style. For example, for a simple Lennard Jones system the crossover (in single precision) is often about 50K-100K atoms per GPU. When performing double precision calculations the crossover point can be significantly smaller. :l Both packages compute bonded interactions (bonds, angles, etc) on the CPU. This means a model with bonds will force the USER-CUDA package to transfer per-atom data back-and-forth between the CPU and GPU every timestep. If the GPU package is running with several MPI processes assigned to one GPU, the cost of computing the bonded interactions is spread across more CPUs and hence the GPU package can run faster. :l When using the GPU package with multiple CPUs assigned to one GPU, its performance depends to some extent on high bandwidth between the CPUs and the GPU. Hence its performance is affected if full 16 PCIe lanes are not available for each GPU. In HPC environments this can be the case if S2050/70 servers are used, where two devices generally share one PCIe 2.0 16x slot. Also many multi-GPU mainboards do not provide full 16 lanes to each of the PCIe 2.0 16x slots. :l,ule [Differences between the two packages:] The GPU package accelerates only pair force, neighbor list, and PPPM calculations. The USER-CUDA package currently supports a wider range of pair styles and can also accelerate many fix styles and some compute styles, as well as neighbor list and PPPM calculations. :ulb,l The USER-CUDA package does not support acceleration for minimization. :l The USER-CUDA package does not support hybrid pair styles. :l The USER-CUDA package can order atoms in the neighbor list differently from run to run resulting in a different order for force accumulation. :l The USER-CUDA package has a limit on the number of atom types that can be used in a simulation. :l The GPU package requires neighbor lists to be built on the CPU when using exclusion lists or a triclinic simulation box. :l The GPU package uses more GPU memory than the USER-CUDA package. This is generally not a problem since typical runs are computation-limited rather than memory-limited. :l,ule [Examples:] The LAMMPS distribution has two directories with sample input scripts for the GPU and USER-CUDA packages. lammps/examples/gpu = GPU package files lammps/examples/USER/cuda = USER-CUDA package files :ul These contain input scripts for identical systems, so they can be used to benchmark the performance of both packages on your system. diff --git a/doc/Section_commands.html b/doc/Section_commands.html index 085743c91..c8f264577 100644 --- a/doc/Section_commands.html +++ b/doc/Section_commands.html @@ -1,723 +1,723 @@ <HTML> <CENTER><A HREF = "Section_start.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> - <A HREF = "Section_packages.html">Next Section</A> </CENTER> <HR> <H3>3. 
Commands </H3> <P>This section describes how a LAMMPS input script is formatted and the input script commands used to define a LAMMPS simulation. </P> 3.1 <A HREF = "#cmd_1">LAMMPS input script</A><BR> 3.2 <A HREF = "#cmd_2">Parsing rules</A><BR> 3.3 <A HREF = "#cmd_3">Input script structure</A><BR> 3.4 <A HREF = "#cmd_4">Commands listed by category</A><BR> 3.5 <A HREF = "#cmd_5">Commands listed alphabetically</A> <BR> <HR> <HR> <A NAME = "cmd_1"></A><H4>3.1 LAMMPS input script </H4> <P>LAMMPS executes by reading commands from an input script (text file), one line at a time. When the input script ends, LAMMPS exits. Each command causes LAMMPS to take some action. It may set an internal variable, read in a file, or run a simulation. Most commands have default settings, which means you only need to use the command if you wish to change the default. </P> <P>In many cases, the ordering of commands in an input script is not important. However the following rules apply: </P> <P>(1) LAMMPS does not read your entire input script and then perform a simulation with all the settings. Rather, the input script is read one line at a time and each command takes effect when it is read. Thus this sequence of commands: </P> <PRE>timestep 0.5 run 100 run 100 </PRE> <P>does something different than this sequence: </P> <PRE>run 100 timestep 0.5 run 100 </PRE> <P>In the first case, the specified timestep (0.5 fmsec) is used for two simulations of 100 timesteps each. In the 2nd case, the default timestep (1.0 fmsec) is used for the 1st 100 step simulation and a 0.5 fmsec timestep is used for the 2nd one. </P> <P>(2) Some commands are only valid when they follow other commands. For example you cannot set the temperature of a group of atoms until atoms have been defined and a group command is used to define which atoms belong to the group. </P> <P>(3) Sometimes command B will use values that can be set by command A. This means command A must precede command B in the input script if it is to have the desired effect. For example, the <A HREF = "read_data.html">read_data</A> command initializes the system by setting up the simulation box and assigning atoms to processors. If default values are not desired, the <A HREF = "processors.html">processors</A> and <A HREF = "boundary.html">boundary</A> commands need to be used before read_data to tell LAMMPS how to map processors to the simulation box. </P> <P>Many input script errors are detected by LAMMPS and an ERROR or WARNING message is printed. <A HREF = "Section_errors.html">This section</A> gives more information on what errors mean. The documentation for each command lists restrictions on how the command can be used. </P> <HR> <A NAME = "cmd_2"></A><H4>3.2 Parsing rules </H4> <P>Each non-blank line in the input script is treated as a command. LAMMPS commands are case sensitive. Command names are lower-case, as are specified command arguments. Upper case letters may be used in file names or user-chosen ID strings. </P> <P>Here is how each line in the input script is parsed by LAMMPS: </P> <P>(1) If the last printable character on the line is a "&" character (with no surrounding quotes), the command is assumed to continue on the next line. The next line is concatenated to the previous line by removing the "&" character and newline. This allows long commands to be continued across two or more lines. </P> <P>(2) All characters from the first "#" character onward are treated as comment and discarded. See an exception in (6).
Note that a comment after a trailing "&" character will prevent the command from continuing on the next line. Also note that for multi-line commands a single leading "#" will comment out the entire command. </P> <P>(3) The line is searched repeatedly for $ characters, which indicate variables that are replaced with a text string. See an exception in (6). </P> <P>If the $ is followed by curly brackets, then the variable name is the text inside the curly brackets. If no curly brackets follow the $, then the variable name is the single character immediately following the $. Thus ${myTemp} and $x refer to variable names "myTemp" and "x". </P> <P>How the variable is converted to a text string depends on what style of variable it is; see the <A HREF = "variable.html">variable</A> doc page for details. It can be a variable that stores multiple text strings, and returns one of them. The returned text string can be multiple "words" (space separated) which will then be interpreted as multiple arguments in the input command. The variable can also store a numeric formula which will be evaluated and its numeric result returned as a string. </P> <P>As a special case, if the $ is followed by parentheses, then the text inside the parentheses is treated as an "immediate" variable and evaluated as an <A HREF = "variable.html">equal-style variable</A>. This is a way to use numeric formulas in an input script without having to assign them to variable names. For example, these 3 input script lines: </P> <PRE>variable X equal (xlo+xhi)/2+sqrt(v_area) region 1 block $X 2 INF INF EDGE EDGE variable X delete </PRE> <P>can be replaced by </P> <PRE>region 1 block $((xlo+xhi)/2+sqrt(v_area)) 2 INF INF EDGE EDGE </PRE> <P>so that you do not have to define (or discard) a temporary variable X. </P> <P>Note that neither the curly-bracket nor the immediate form of variables can contain nested $ characters for other variables to substitute for. Thus you cannot do this: </P> <PRE>variable a equal 2 variable b2 equal 4 print "B2 = ${b$a}" </PRE> <P>Nor can you specify $($x-1.0) for an immediate variable, but you could use $(v_x-1.0), since the latter is valid syntax for an <A HREF = "variable.html">equal-style variable</A>. </P> <P>See the <A HREF = "variable.html">variable</A> command for more details of how strings are assigned to variables and evaluated, and how they can be used in input script commands. </P> <P>(4) The line is broken into "words" separated by whitespace (tabs, spaces). Note that words can thus contain letters, digits, underscores, or punctuation characters. </P> <P>(5) The first word is the command name. All successive words in the line are arguments. </P> <P>(6) If you want text with spaces to be treated as a single argument, it can be enclosed in either double or single quotes. A long single argument enclosed in quotes can even span multiple lines if the "&" character is used, as described above. E.g. </P> <PRE>print "Volume = $v" print 'Volume = $v' variable a string "red green blue & purple orange cyan" if "$<I>steps</I> > 1000" then quit </PRE> <P>The quotes are removed when the single argument is stored internally. </P> <P>See the <A HREF = "dump_modify.html">dump modify format</A> or <A HREF = "print.html">print</A> or <A HREF = "if.html">if</A> commands for examples. A "#" or "$" character that is between quotes will not be treated as a comment indicator in (2) or substituted for as a variable in (3). </P> <P>IMPORTANT NOTE: If the argument is itself a command that requires a quoted argument (e.g.
using a <A HREF = "print.html">print</A> command as part of an <A HREF = "if.html">if</A> or <A HREF = "run.html">run every</A> command), then the double and single quotes can be nested in the usual manner. See the doc pages for those commands for examples. Only one level of nesting is allowed, but that should be sufficient for most use cases. </P> <HR> <H4><A NAME = "cmd_3"></A>3.3 Input script structure </H4> <P>This section describes the structure of a typical LAMMPS input script. The "examples" directory in the LAMMPS distribution contains many sample input scripts; the corresponding problems are discussed in <A HREF = "Section_example.html">Section_example</A>, and animated on the <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A>. </P> <P>A LAMMPS input script typically has 4 parts: </P> <OL><LI>Initialization <LI>Atom definition <LI>Settings <LI>Run a simulation </OL> <P>The last 2 parts can be repeated as many times as desired. I.e. run a simulation, change some settings, run some more, etc. Each of the 4 parts is now described in more detail. Remember that almost all the commands need only be used if a non-default value is desired. </P> <P>(1) Initialization </P> <P>Set parameters that need to be defined before atoms are created or read-in from a file. </P> <P>The relevant commands are <A HREF = "units.html">units</A>, <A HREF = "dimension.html">dimension</A>, <A HREF = "newton.html">newton</A>, <A HREF = "processors.html">processors</A>, <A HREF = "boundary.html">boundary</A>, <A HREF = "atom_style.html">atom_style</A>, <A HREF = "atom_modify.html">atom_modify</A>. </P> <P>If force-field parameters appear in the files that will be read, these commands tell LAMMPS what kinds of force fields are being used: <A HREF = "pair_style.html">pair_style</A>, <A HREF = "bond_style.html">bond_style</A>, <A HREF = "angle_style.html">angle_style</A>, <A HREF = "dihedral_style.html">dihedral_style</A>, <A HREF = "improper_style.html">improper_style</A>. </P> <P>(2) Atom definition </P> <P>There are 3 ways to define atoms in LAMMPS. Read them in from a data or restart file via the <A HREF = "read_data.html">read_data</A> or <A HREF = "read_restart.html">read_restart</A> commands. These files can contain molecular topology information. Or create atoms on a lattice (with no molecular topology), using these commands: <A HREF = "lattice.html">lattice</A>, <A HREF = "region.html">region</A>, <A HREF = "create_box.html">create_box</A>, <A HREF = "create_atoms.html">create_atoms</A>. The entire set of atoms can be duplicated to make a larger simulation using the <A HREF = "replicate.html">replicate</A> command. </P> <P>(3) Settings </P> <P>Once atoms and molecular topology are defined, a variety of settings can be specified: force field coefficients, simulation parameters, output options, etc. </P> <P>Force field coefficients are set by these commands (they can also be set in the read-in files): <A HREF = "pair_coeff.html">pair_coeff</A>, <A HREF = "bond_coeff.html">bond_coeff</A>, <A HREF = "angle_coeff.html">angle_coeff</A>, <A HREF = "dihedral_coeff.html">dihedral_coeff</A>, <A HREF = "improper_coeff.html">improper_coeff</A>, <A HREF = "kspace_style.html">kspace_style</A>, <A HREF = "dielectric.html">dielectric</A>, <A HREF = "special_bonds.html">special_bonds</A>.
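</P> <P>For example (an illustrative snippet only, not tied to any particular system), the pair coefficients for a two-atom-type Lennard-Jones model might be set as follows: </P> <PRE>pair_style lj/cut 2.5
pair_coeff 1 1 1.0 1.0
pair_coeff 2 2 1.5 0.8
pair_coeff 1 2 1.2 0.9
</PRE> <P>The other coeff commands follow the same pattern: the type (or pair of types) being parameterized is given first, followed by the style-specific parameters; see each command's doc page for the required arguments.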
</P> <P>Various simulation parameters are set by these commands: <A HREF = "neighbor.html">neighbor</A>, <A HREF = "neigh_modify.html">neigh_modify</A>, <A HREF = "group.html">group</A>, <A HREF = "timestep.html">timestep</A>, <A HREF = "reset_timestep.html">reset_timestep</A>, <A HREF = "run_style.html">run_style</A>, <A HREF = "min_style.html">min_style</A>, <A HREF = "min_modify.html">min_modify</A>. </P> <P>Fixes impose a variety of boundary conditions, time integration, and diagnostic options. The <A HREF = "fix.html">fix</A> command comes in many flavors. </P> <P>Various computations can be specified for execution during a simulation using the <A HREF = "compute.html">compute</A>, <A HREF = "compute_modify.html">compute_modify</A>, and <A HREF = "variable.html">variable</A> commands. </P> <P>Output options are set by the <A HREF = "thermo.html">thermo</A>, <A HREF = "dump.html">dump</A>, and <A HREF = "restart.html">restart</A> commands. </P> <P>(4) Run a simulation </P> <P>A molecular dynamics simulation is run using the <A HREF = "run.html">run</A> command. Energy minimization (molecular statics) is performed using the <A HREF = "minimize.html">minimize</A> command. A parallel tempering (replica-exchange) simulation can be run using the <A HREF = "temper.html">temper</A> command. </P> <HR> <A NAME = "cmd_4"></A><H4>3.4 Commands listed by category </H4> <P>This section lists all LAMMPS commands, grouped by category. The <A HREF = "#cmd_5">next section</A> lists the same commands alphabetically. Note that some style options for some commands are part of specific LAMMPS packages, which means they cannot be used unless the package was included when LAMMPS was built. Not all packages are included in a default LAMMPS build. These dependencies are listed as Restrictions in the command's documentation. 
</P> <P>Initialization: </P> <P><A HREF = "atom_modify.html">atom_modify</A>, <A HREF = "atom_style.html">atom_style</A>, <A HREF = "boundary.html">boundary</A>, <A HREF = "dimension.html">dimension</A>, <A HREF = "newton.html">newton</A>, <A HREF = "processors.html">processors</A>, <A HREF = "units.html">units</A> </P> <P>Atom definition: </P> <P><A HREF = "create_atoms.html">create_atoms</A>, <A HREF = "create_box.html">create_box</A>, <A HREF = "lattice.html">lattice</A>, <A HREF = "read_data.html">read_data</A>, <A HREF = "read_dump.html">read_dump</A>, <A HREF = "read_restart.html">read_restart</A>, <A HREF = "region.html">region</A>, <A HREF = "replicate.html">replicate</A> </P> <P>Force fields: </P> <P><A HREF = "angle_coeff.html">angle_coeff</A>, <A HREF = "angle_style.html">angle_style</A>, <A HREF = "bond_coeff.html">bond_coeff</A>, <A HREF = "bond_style.html">bond_style</A>, <A HREF = "dielectric.html">dielectric</A>, <A HREF = "dihedral_coeff.html">dihedral_coeff</A>, <A HREF = "dihedral_style.html">dihedral_style</A>, <A HREF = "improper_coeff.html">improper_coeff</A>, <A HREF = "improper_style.html">improper_style</A>, <A HREF = "kspace_modify.html">kspace_modify</A>, <A HREF = "kspace_style.html">kspace_style</A>, <A HREF = "pair_coeff.html">pair_coeff</A>, <A HREF = "pair_modify.html">pair_modify</A>, <A HREF = "pair_style.html">pair_style</A>, <A HREF = "pair_write.html">pair_write</A>, <A HREF = "special_bonds.html">special_bonds</A> </P> <P>Settings: </P> <P><A HREF = "comm_style.html">comm_style</A>, <A HREF = "group.html">group</A>, <A HREF = "mass.html">mass</A>, <A HREF = "min_modify.html">min_modify</A>, <A HREF = "min_style.html">min_style</A>, <A HREF = "neigh_modify.html">neigh_modify</A>, <A HREF = "neighbor.html">neighbor</A>, <A HREF = "reset_timestep.html">reset_timestep</A>, <A HREF = "run_style.html">run_style</A>, <A HREF = "set.html">set</A>, <A HREF = "timestep.html">timestep</A>, <A HREF = "velocity.html">velocity</A> </P> <P>Fixes: </P> <P><A HREF = "fix.html">fix</A>, <A HREF = "fix_modify.html">fix_modify</A>, <A HREF = "unfix.html">unfix</A> </P> <P>Computes: </P> <P><A HREF = "compute.html">compute</A>, <A HREF = "compute_modify.html">compute_modify</A>, <A HREF = "uncompute.html">uncompute</A> </P> <P>Output: </P> <P><A HREF = "dump.html">dump</A>, <A HREF = "dump_image.html">dump image</A>, <A HREF = "dump_modify.html">dump_modify</A>, <A HREF = "dump_image.html">dump movie</A>, <A HREF = "restart.html">restart</A>, <A HREF = "thermo.html">thermo</A>, <A HREF = "thermo_modify.html">thermo_modify</A>, <A HREF = "thermo_style.html">thermo_style</A>, <A HREF = "undump.html">undump</A>, <A HREF = "write_data.html">write_data</A>, <A HREF = "write_dump.html">write_dump</A>, <A HREF = "write_restart.html">write_restart</A> </P> <P>Actions: </P> <P><A HREF = "delete_atoms.html">delete_atoms</A>, <A HREF = "delete_bonds.html">delete_bonds</A>, <A HREF = "displace_atoms.html">displace_atoms</A>, <A HREF = "change_box.html">change_box</A>, <A HREF = "minimize.html">minimize</A>, <A HREF = "neb.html">neb</A> <A HREF = "prd.html">prd</A>, <A HREF = "rerun.html">rerun</A>, <A HREF = "run.html">run</A>, <A HREF = "temper.html">temper</A> </P> <P>Miscellaneous: </P> <P><A HREF = "clear.html">clear</A>, <A HREF = "echo.html">echo</A>, <A HREF = "if.html">if</A>, <A HREF = "include.html">include</A>, <A HREF = "jump.html">jump</A>, <A HREF = "label.html">label</A>, <A HREF = "log.html">log</A>, <A HREF = "next.html">next</A>, <A HREF = "print.html">print</A>, <A 
HREF = "shell.html">shell</A>, <A HREF = "variable.html">variable</A> </P> <HR> <H4><A NAME = "cmd_5"></A><A NAME = "comm"></A>3.5 Individual commands </H4> <P>This section lists all LAMMPS commands alphabetically, with a separate listing below of styles within certain commands. The <A HREF = "#cmd_4">previous section</A> lists the same commands, grouped by category. Note that some style options for some commands are part of specific LAMMPS packages, which means they cannot be used unless the package was included when LAMMPS was built. Not all packages are included in a default LAMMPS build. These dependencies are listed as Restrictions in the command's documentation. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "angle_coeff.html">angle_coeff</A></TD><TD ><A HREF = "angle_style.html">angle_style</A></TD><TD ><A HREF = "atom_modify.html">atom_modify</A></TD><TD ><A HREF = "atom_style.html">atom_style</A></TD><TD ><A HREF = "balance.html">balance</A></TD><TD ><A HREF = "bond_coeff.html">bond_coeff</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "bond_style.html">bond_style</A></TD><TD ><A HREF = "boundary.html">boundary</A></TD><TD ><A HREF = "box.html">box</A></TD><TD ><A HREF = "change_box.html">change_box</A></TD><TD ><A HREF = "clear.html">clear</A></TD><TD ><A HREF = "comm_modify.html">comm_modify</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "comm_style.html">comm_style</A></TD><TD ><A HREF = "compute.html">compute</A></TD><TD ><A HREF = "compute_modify.html">compute_modify</A></TD><TD ><A HREF = "create_atoms.html">create_atoms</A></TD><TD ><A HREF = "create_box.html">create_box</A></TD><TD ><A HREF = "delete_atoms.html">delete_atoms</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "delete_bonds.html">delete_bonds</A></TD><TD ><A HREF = "dielectric.html">dielectric</A></TD><TD ><A HREF = "dihedral_coeff.html">dihedral_coeff</A></TD><TD ><A HREF = "dihedral_style.html">dihedral_style</A></TD><TD ><A HREF = "dimension.html">dimension</A></TD><TD ><A HREF = "displace_atoms.html">displace_atoms</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "dump.html">dump</A></TD><TD ><A HREF = "dump_image.html">dump image</A></TD><TD ><A HREF = "dump_modify.html">dump_modify</A></TD><TD ><A HREF = "dump_image.html">dump movie</A></TD><TD ><A HREF = "echo.html">echo</A></TD><TD ><A HREF = "fix.html">fix</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_modify.html">fix_modify</A></TD><TD ><A HREF = "group.html">group</A></TD><TD ><A HREF = "if.html">if</A></TD><TD ><A HREF = "improper_coeff.html">improper_coeff</A></TD><TD ><A HREF = "improper_style.html">improper_style</A></TD><TD ><A HREF = "include.html">include</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "jump.html">jump</A></TD><TD ><A HREF = "kspace_modify.html">kspace_modify</A></TD><TD ><A HREF = "kspace_style.html">kspace_style</A></TD><TD ><A HREF = "label.html">label</A></TD><TD ><A HREF = "lattice.html">lattice</A></TD><TD ><A HREF = "log.html">log</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "mass.html">mass</A></TD><TD ><A HREF = "minimize.html">minimize</A></TD><TD ><A HREF = "min_modify.html">min_modify</A></TD><TD ><A HREF = "min_style.html">min_style</A></TD><TD ><A HREF = "molecule.html">molecule</A></TD><TD ><A HREF = "neb.html">neb</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "neigh_modify.html">neigh_modify</A></TD><TD ><A HREF = "neighbor.html">neighbor</A></TD><TD ><A HREF = "newton.html">newton</A></TD><TD ><A HREF = "next.html">next</A></TD><TD ><A HREF = 
"package.html">package</A></TD><TD ><A HREF = "pair_coeff.html">pair_coeff</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_modify.html">pair_modify</A></TD><TD ><A HREF = "pair_style.html">pair_style</A></TD><TD ><A HREF = "pair_write.html">pair_write</A></TD><TD ><A HREF = "partition.html">partition</A></TD><TD ><A HREF = "prd.html">prd</A></TD><TD ><A HREF = "print.html">print</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "processors.html">processors</A></TD><TD ><A HREF = "quit.html">quit</A></TD><TD ><A HREF = "read_data.html">read_data</A></TD><TD ><A HREF = "read_dump.html">read_dump</A></TD><TD ><A HREF = "read_restart.html">read_restart</A></TD><TD ><A HREF = "region.html">region</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "replicate.html">replicate</A></TD><TD ><A HREF = "rerun.html">rerun</A></TD><TD ><A HREF = "reset_timestep.html">reset_timestep</A></TD><TD ><A HREF = "restart.html">restart</A></TD><TD ><A HREF = "run.html">run</A></TD><TD ><A HREF = "run_style.html">run_style</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "set.html">set</A></TD><TD ><A HREF = "shell.html">shell</A></TD><TD ><A HREF = "special_bonds.html">special_bonds</A></TD><TD ><A HREF = "suffix.html">suffix</A></TD><TD ><A HREF = "tad.html">tad</A></TD><TD ><A HREF = "temper.html">temper</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "thermo.html">thermo</A></TD><TD ><A HREF = "thermo_modify.html">thermo_modify</A></TD><TD ><A HREF = "thermo_style.html">thermo_style</A></TD><TD ><A HREF = "timestep.html">timestep</A></TD><TD ><A HREF = "uncompute.html">uncompute</A></TD><TD ><A HREF = "undump.html">undump</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "unfix.html">unfix</A></TD><TD ><A HREF = "units.html">units</A></TD><TD ><A HREF = "variable.html">variable</A></TD><TD ><A HREF = "velocity.html">velocity</A></TD><TD ><A HREF = "write_data.html">write_data</A></TD><TD ><A HREF = "write_dump.html">write_dump</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "write_restart.html">write_restart</A> </TD></TR></TABLE></DIV> <P>These are commands contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "group2ndx.html">group2ndx</A> </TD></TR></TABLE></DIV> <HR> <H4>Fix styles </H4> <P>See the <A HREF = "fix.html">fix</A> command for one-line descriptions of each style or click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "fix_adapt.html">adapt</A></TD><TD ><A HREF = "fix_addforce.html">addforce</A></TD><TD ><A HREF = "fix_append_atoms.html">append/atoms</A></TD><TD ><A HREF = "fix_aveforce.html">aveforce</A></TD><TD ><A HREF = "fix_ave_atom.html">ave/atom</A></TD><TD ><A HREF = "fix_ave_correlate.html">ave/correlate</A></TD><TD ><A HREF = "fix_ave_histo.html">ave/histo</A></TD><TD ><A HREF = "fix_ave_spatial.html">ave/spatial</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_ave_time.html">ave/time</A></TD><TD ><A HREF = "fix_balance.html">balance</A></TD><TD ><A HREF = "fix_bond_break.html">bond/break</A></TD><TD ><A HREF = "fix_bond_create.html">bond/create</A></TD><TD ><A HREF = "fix_bond_swap.html">bond/swap</A></TD><TD ><A HREF = "fix_box_relax.html">box/relax</A></TD><TD ><A HREF = "fix_deform.html">deform</A></TD><TD ><A HREF = "fix_deposit.html">deposit</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_drag.html">drag</A></TD><TD ><A HREF = "fix_dt_reset.html">dt/reset</A></TD><TD ><A HREF = "fix_efield.html">efield</A></TD><TD ><A HREF = "fix_enforce2d.html">enforce2d</A></TD><TD ><A HREF = "fix_evaporate.html">evaporate</A></TD><TD ><A HREF = "fix_external.html">external</A></TD><TD ><A HREF = "fix_freeze.html">freeze</A></TD><TD ><A HREF = "fix_gcmc.html">gcmc</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_gld.html">gld</A></TD><TD ><A HREF = "fix_gravity.html">gravity</A></TD><TD ><A HREF = "fix_heat.html">heat</A></TD><TD ><A HREF = "fix_indent.html">indent</A></TD><TD ><A HREF = "fix_langevin.html">langevin</A></TD><TD ><A HREF = "fix_lineforce.html">lineforce</A></TD><TD ><A HREF = "fix_momentum.html">momentum</A></TD><TD ><A HREF = "fix_move.html">move</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_msst.html">msst</A></TD><TD ><A HREF = "fix_neb.html">neb</A></TD><TD ><A HREF = "fix_nh.html">nph</A></TD><TD ><A HREF = "fix_nphug.html">nphug</A></TD><TD ><A HREF = "fix_nph_asphere.html">nph/asphere</A></TD><TD ><A HREF = "fix_nph_sphere.html">nph/sphere</A></TD><TD ><A HREF = "fix_nh.html">npt</A></TD><TD ><A HREF = "fix_npt_asphere.html">npt/asphere</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_npt_sphere.html">npt/sphere</A></TD><TD ><A HREF = "fix_nve.html">nve</A></TD><TD ><A HREF = "fix_nve_asphere.html">nve/asphere</A></TD><TD ><A HREF = "fix_nve_asphere_noforce.html">nve/asphere/noforce</A></TD><TD ><A HREF = "fix_nve_body.html">nve/body</A></TD><TD ><A HREF = "fix_nve_limit.html">nve/limit</A></TD><TD ><A HREF = "fix_nve_line.html">nve/line</A></TD><TD ><A HREF = "fix_nve_noforce.html">nve/noforce</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_nve_sphere.html">nve/sphere</A></TD><TD ><A HREF = "fix_nve_tri.html">nve/tri</A></TD><TD ><A HREF = "fix_nh.html">nvt</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere</A></TD><TD ><A HREF = "fix_oneway.html">oneway</A></TD><TD ><A HREF = "fix_orient_fcc.html">orient/fcc</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_planeforce.html">planeforce</A></TD><TD ><A HREF = "fix_poems.html">poems</A></TD><TD ><A HREF = 
"fix_pour.html">pour</A></TD><TD ><A HREF = "fix_press_berendsen.html">press/berendsen</A></TD><TD ><A HREF = "fix_print.html">print</A></TD><TD ><A HREF = "fix_property_atom.html">property/atom</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb</A></TD><TD ><A HREF = "fix_reax_bonds.html">reax/bonds</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_recenter.html">recenter</A></TD><TD ><A HREF = "fix_restrain.html">restrain</A></TD><TD ><A HREF = "fix_rigid.html">rigid</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nph</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/small/nph</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/npt</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nve</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nvt</A></TD><TD ><A HREF = "fix_setforce.html">setforce</A></TD><TD ><A HREF = "fix_shake.html">shake</A></TD><TD ><A HREF = "fix_spring.html">spring</A></TD><TD ><A HREF = "fix_spring_rg.html">spring/rg</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_spring_self.html">spring/self</A></TD><TD ><A HREF = "fix_srd.html">srd</A></TD><TD ><A HREF = "fix_store_force.html">store/force</A></TD><TD ><A HREF = "fix_store_state.html">store/state</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen</A></TD><TD ><A HREF = "fix_temp_csvr.html">temp/csvr</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale</A></TD><TD ><A HREF = "fix_thermal_conductivity.html">thermal/conductivity</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_tmd.html">tmd</A></TD><TD ><A HREF = "fix_ttm.html">ttm</A></TD><TD ><A HREF = "fix_tune_kspace.html">tune/kspace</A></TD><TD ><A HREF = "fix_vector.html">vector</A></TD><TD ><A HREF = "fix_viscosity.html">viscosity</A></TD><TD ><A HREF = "fix_viscous.html">viscous</A></TD><TD ><A HREF = "fix_wall.html">wall/colloid</A></TD><TD ><A HREF = "fix_wall_gran.html">wall/gran</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_wall.html">wall/harmonic</A></TD><TD ><A HREF = "fix_wall.html">wall/lj1043</A></TD><TD ><A HREF = "fix_wall.html">wall/lj126</A></TD><TD ><A HREF = "fix_wall.html">wall/lj93</A></TD><TD ><A HREF = "fix_wall_piston.html">wall/piston</A></TD><TD ><A HREF = "fix_wall_reflect.html">wall/reflect</A></TD><TD ><A HREF = "fix_wall_region.html">wall/region</A></TD><TD ><A HREF = "fix_wall_srd.html">wall/srd</A> </TD></TR></TABLE></DIV> <P>These are fix styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "fix_adapt_fep.html">adapt/fep</A></TD><TD ><A HREF = "fix_addtorque.html">addtorque</A></TD><TD ><A HREF = "fix_atc.html">atc</A></TD><TD ><A HREF = "fix_colvars.html">colvars</A></TD><TD ><A HREF = "fix_imd.html">imd</A></TD><TD ><A HREF = "fix_langevin_eff.html">langevin/eff</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_lb_fluid.html">lb/fluid</A></TD><TD ><A HREF = "fix_lb_momentum.html">lb/momentum</A></TD><TD ><A HREF = "fix_lb_pc.html">lb/pc</A></TD><TD ><A HREF = "fix_lb_rigid_pc_sphere.html">lb/rigid/pc/sphere</A></TD><TD ><A HREF = "fix_lb_viscous.html">lb/viscous</A></TD><TD ><A HREF = "fix_meso.html">meso</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_meso_stationary.html">meso/stationary</A></TD><TD ><A HREF = "fix_nh_eff.html">nph/eff</A></TD><TD ><A HREF = "fix_nh_eff.html">npt/eff</A></TD><TD ><A HREF = "fix_nve_eff.html">nve/eff</A></TD><TD ><A HREF = "fix_nh_eff.html">nvt/eff</A></TD><TD ><A HREF = "fix_nvt_sllod_eff.html">nvt/sllod/eff</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_phonon.html">phonon</A></TD><TD ><A HREF = "fix_qeq_reax.html">qeq/reax</A></TD><TD ><A HREF = "fix_qmmm.html">qmmm</A></TD><TD ><A HREF = "fix_reax_bonds.html">reax/c/bonds</A></TD><TD ><A HREF = "fix_reaxc_species.html">reax/c/species</A></TD><TD ><A HREF = "fix_smd.html">smd</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_temp_rescale_eff.html">temp/rescale/eff</A></TD><TD ><A HREF = "fix_ti_rs.html">ti/rs</A></TD><TD ><A HREF = "fix_ti_spring.html">ti/spring</A> </TD></TR></TABLE></DIV> <P>These are accelerated fix styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "fix_freeze.html">freeze/cuda</A></TD><TD ><A HREF = "fix_addforce.html">addforce/cuda</A></TD><TD ><A HREF = "fix_aveforce.html">aveforce/cuda</A></TD><TD ><A HREF = "fix_enforce2d.html">enforce2d/cuda</A></TD><TD ><A HREF = "fix_gravity.html">gravity/cuda</A></TD><TD ><A HREF = "fix_gravity.html">gravity/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "fix_nh.html">nph/omp</A></TD><TD ><A HREF = "fix_nphug.html">nphug/omp</A></TD><TD ><A HREF = "fix_nph_asphere.html">nph/asphere/omp</A></TD><TD ><A HREF = "fix_nph_sphere.html">nph/sphere/omp</A></TD><TD ><A HREF = "fix_nh.html">npt/cuda</A></TD><TD ><A HREF = "fix_nh.html">npt/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "fix_npt_asphere.html">npt/asphere/omp</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere/omp</A></TD><TD ><A HREF = "fix_nh.html">nve/cuda</A></TD><TD ><A HREF = "fix_nve.html">nve/omp</A></TD><TD ><A HREF = "fix_nve_sphere.html">nve/sphere/omp</A></TD><TD ><A HREF = "fix_nh.html">nvt/cuda</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "fix_nh.html">nvt/omp</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere/omp</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod/omp</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere/omp</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/nph/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/omp</A></TD><TD ><A HREF = 
"fix_setforce.html">setforce/cuda</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "fix_shake.html">shake/cuda</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/limit/cuda</A></TD><TD ><A HREF = "fix_viscous.html">viscous/cuda</A> +<TR ALIGN="center"><TD ><A HREF = "fix_npt_asphere.html">npt/asphere/omp</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere/omp</A></TD><TD ><A HREF = "fix_nve.html">nve/cuda</A></TD><TD ><A HREF = "fix_nve.html">nve/kk</A></TD><TD ><A HREF = "fix_nve.html">nve/omp</A></TD><TD ><A HREF = "fix_nve_sphere.html">nve/sphere/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "fix_nh.html">nvt/cuda</A></TD><TD ><A HREF = "fix_nh.html">nvt/omp</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere/omp</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod/omp</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere/omp</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nph/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt/omp</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "fix_setforce.html">setforce/cuda</A></TD><TD ><A HREF = "fix_shake.html">shake/cuda</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/cuda</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale/limit/cuda</A></TD><TD ><A HREF = "fix_viscous.html">viscous/cuda</A> </TD></TR></TABLE></DIV> <HR> <H4>Compute styles </H4> <P>See the <A HREF = "compute.html">compute</A> command for one-line descriptions of each style or click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "compute_angle_local.html">angle/local</A></TD><TD ><A HREF = "compute_atom_molecule.html">atom/molecule</A></TD><TD ><A HREF = "compute_body_local.html">body/local</A></TD><TD ><A HREF = "compute_bond_local.html">bond/local</A></TD><TD ><A HREF = "compute_centro_atom.html">centro/atom</A></TD><TD ><A HREF = "compute_cluster_atom.html">cluster/atom</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_cna_atom.html">cna/atom</A></TD><TD ><A HREF = "compute_com.html">com</A></TD><TD ><A HREF = "compute_com_molecule.html">com/molecule</A></TD><TD ><A HREF = "compute_contact_atom.html">contact/atom</A></TD><TD ><A HREF = "compute_coord_atom.html">coord/atom</A></TD><TD ><A HREF = "compute_damage_atom.html">damage/atom</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_dihedral_local.html">dihedral/local</A></TD><TD ><A HREF = "compute_dilatation_atom.html">dilatation/atom</A></TD><TD ><A HREF = "compute_displace_atom.html">displace/atom</A></TD><TD ><A HREF = "compute_erotate_asphere.html">erotate/asphere</A></TD><TD ><A HREF = "compute_erotate_rigid.html">erotate/rigid</A></TD><TD ><A HREF = "compute_erotate_sphere.html">erotate/sphere</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_erotate_sphere_atom.html">erotate/sphere/atom</A></TD><TD ><A HREF = "compute_event_displace.html">event/displace</A></TD><TD ><A HREF = "compute_group_group.html">group/group</A></TD><TD ><A HREF = "compute_gyration.html">gyration</A></TD><TD ><A 
HREF = "compute_gyration_molecule.html">gyration/molecule</A></TD><TD ><A HREF = "compute_heat_flux.html">heat/flux</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_improper_local.html">improper/local</A></TD><TD ><A HREF = "compute_inertia_molecule.html">inertia/molecule</A></TD><TD ><A HREF = "compute_ke.html">ke</A></TD><TD ><A HREF = "compute_ke_atom.html">ke/atom</A></TD><TD ><A HREF = "compute_ke_rigid.html">ke/rigid</A></TD><TD ><A HREF = "compute_msd.html">msd</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_msd_molecule.html">msd/molecule</A></TD><TD ><A HREF = "compute_msd_nongauss.html">msd/nongauss</A></TD><TD ><A HREF = "compute_pair.html">pair</A></TD><TD ><A HREF = "compute_pair_local.html">pair/local</A></TD><TD ><A HREF = "compute_pe.html">pe</A></TD><TD ><A HREF = "compute_pe_atom.html">pe/atom</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_plasticity_atom.html">plasticity/atom</A></TD><TD ><A HREF = "compute_pressure.html">pressure</A></TD><TD ><A HREF = "compute_property_atom.html">property/atom</A></TD><TD ><A HREF = "compute_property_local.html">property/local</A></TD><TD ><A HREF = "compute_property_molecule.html">property/molecule</A></TD><TD ><A HREF = "compute_rdf.html">rdf</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_reduce.html">reduce</A></TD><TD ><A HREF = "compute_reduce.html">reduce/region</A></TD><TD ><A HREF = "compute_slice.html">slice</A></TD><TD ><A HREF = "compute_stress_atom.html">stress/atom</A></TD><TD ><A HREF = "compute_temp.html">temp</A></TD><TD ><A HREF = "compute_temp_asphere.html">temp/asphere</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_temp_com.html">temp/com</A></TD><TD ><A HREF = "compute_temp_deform.html">temp/deform</A></TD><TD ><A HREF = "compute_temp_partial.html">temp/partial</A></TD><TD ><A HREF = "compute_temp_profile.html">temp/profile</A></TD><TD ><A HREF = "compute_temp_ramp.html">temp/ramp</A></TD><TD ><A HREF = "compute_temp_region.html">temp/region</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_temp_sphere.html">temp/sphere</A></TD><TD ><A HREF = "compute_ti.html">ti</A></TD><TD ><A HREF = "compute_vacf.html">vacf</A></TD><TD ><A HREF = "compute_voronoi_atom.html">voronoi/atom</A> </TD></TR></TABLE></DIV> <P>These are compute styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "compute_ackland_atom.html">ackland/atom</A></TD><TD ><A HREF = "compute_basal_atom.html">basal/atom</A></TD><TD ><A HREF = "compute_fep.html">fep</A></TD><TD ><A HREF = "compute_ke_eff.html">ke/eff</A></TD><TD ><A HREF = "compute_ke_atom_eff.html">ke/atom/eff</A></TD><TD ><A HREF = "compute_meso_e_atom.html">meso_e/atom</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "compute_meso_rho_atom.html">meso_rho/atom</A></TD><TD ><A HREF = "compute_meso_t_atom.html">meso_t/atom</A></TD><TD ><A HREF = "compute_temp_eff.html">temp/eff</A></TD><TD ><A HREF = "compute_temp_deform_eff.html">temp/deform/eff</A></TD><TD ><A HREF = "compute_temp_region_eff.html">temp/region/eff</A></TD><TD ><A HREF = "compute_temp_rotate.html">temp/rotate</A> </TD></TR></TABLE></DIV> <P>These are accelerated compute styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "compute_pe.html">pe/cuda</A></TD><TD ><A HREF = "compute_pressure.html">pressure/cuda</A></TD><TD ><A HREF = "compute_temp.html">temp/cuda</A></TD><TD ><A HREF = "compute_temp_partial.html">temp/partial/cuda</A> </TD></TR></TABLE></DIV> <HR> <H4>Pair_style potentials </H4> <P>See the <A HREF = "pair_style.html">pair_style</A> command for an overview of pair potentials. Click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "pair_none.html">none</A></TD><TD ><A HREF = "pair_hybrid.html">hybrid</A></TD><TD ><A HREF = "pair_hybrid.html">hybrid/overlay</A></TD><TD ><A HREF = "pair_adp.html">adp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_airebo.html">airebo</A></TD><TD ><A HREF = "pair_beck.html">beck</A></TD><TD ><A HREF = "pair_body.html">body</A></TD><TD ><A HREF = "pair_bop.html">bop</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_born.html">born</A></TD><TD ><A HREF = "pair_born.html">born/coul/long</A></TD><TD ><A HREF = "pair_born.html">born/coul/msm</A></TD><TD ><A HREF = "pair_born.html">born/coul/wolf</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_brownian.html">brownian</A></TD><TD ><A HREF = "pair_brownian.html">brownian/poly</A></TD><TD ><A HREF = "pair_buck.html">buck</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/cut</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_buck.html">buck/coul/long</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/msm</A></TD><TD ><A HREF = "pair_buck_long.html">buck/long/coul/long</A></TD><TD ><A HREF = "pair_colloid.html">colloid</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_comb.html">comb</A></TD><TD ><A HREF = "pair_comb.html">comb3</A></TD><TD ><A HREF = "pair_coul.html">coul/cut</A></TD><TD ><A HREF = "pair_coul.html">coul/debye</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_coul.html">coul/dsf</A></TD><TD ><A HREF = "pair_coul.html">coul/long</A></TD><TD ><A HREF = "pair_coul.html">coul/msm</A></TD><TD ><A HREF = "pair_coul.html">coul/wolf</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_dpd.html">dpd</A></TD><TD ><A HREF = "pair_dpd.html">dpd/tstat</A></TD><TD ><A HREF = "pair_dsmc.html">dsmc</A></TD><TD ><A HREF = "pair_eam.html">eam</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_eam.html">eam/alloy</A></TD><TD ><A HREF = "pair_eam.html">eam/fs</A></TD><TD ><A HREF = "pair_eim.html">eim</A></TD><TD ><A HREF = "pair_gauss.html">gauss</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_gayberne.html">gayberne</A></TD><TD ><A HREF = "pair_gran.html">gran/hertz/history</A></TD><TD ><A HREF = "pair_gran.html">gran/hooke</A></TD><TD ><A HREF = "pair_gran.html">gran/hooke/history</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_hbond_dreiding.html">hbond/dreiding/lj</A></TD><TD ><A HREF = "pair_hbond_dreiding.html">hbond/dreiding/morse</A></TD><TD ><A HREF = "pair_kim.html">kim</A></TD><TD ><A HREF = "pair_lcbop.html">lcbop</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_line_lj.html">line/lj</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/charmm</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/charmm/implicit</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/long</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/msm</A></TD><TD ><A HREF = "pair_class2.html">lj/class2</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/coul/cut</A></TD><TD ><A HREF = 
"pair_class2.html">lj/class2/coul/long</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/cut</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/dsf</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/coul/long</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/msm</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/long</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/cut</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs</A></TD><TD ><A HREF = "pair_lj_long.html">lj/long/coul/long</A></TD><TD ><A HREF = "pair_dipole.html">lj/long/dipole/long</A></TD><TD ><A HREF = "pair_lj_long.html">lj/long/tip4p/long</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj_smooth.html">lj/smooth</A></TD><TD ><A HREF = "pair_lj_smooth_linear.html">lj/smooth/linear</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lubricate.html">lubricate/poly</A></TD><TD ><A HREF = "pair_lubricateU.html">lubricateU</A></TD><TD ><A HREF = "pair_lubricateU.html">lubricateU/poly</A></TD><TD ><A HREF = "pair_meam.html">meam</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_mie.html">mie/cut</A></TD><TD ><A HREF = "pair_morse.html">morse</A></TD><TD ><A HREF = "pair_nb3b_harmonic.html">nb3b/harmonic</A></TD><TD ><A HREF = "pair_nm.html">nm/cut</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_nm.html">nm/cut/coul/cut</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/long</A></TD><TD ><A HREF = "pair_peri.html">peri/eps</A></TD><TD ><A HREF = "pair_peri.html">peri/lps</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_peri.html">peri/pmb</A></TD><TD ><A HREF = "pair_peri.html">peri/ves</A></TD><TD ><A HREF = "pair_reax.html">reax</A></TD><TD ><A HREF = "pair_airebo.html">rebo</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_resquared.html">resquared</A></TD><TD ><A HREF = "pair_soft.html">soft</A></TD><TD ><A HREF = "pair_sw.html">sw</A></TD><TD ><A HREF = "pair_table.html">table</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_tersoff.html">tersoff</A></TD><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod</A></TD><TD ><A HREF = "pair_tersoff_zbl.html">tersoff/zbl</A></TD><TD ><A HREF = "pair_coul.html">tip4p/cut</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_coul.html">tip4p/long</A></TD><TD ><A HREF = "pair_tri_lj.html">tri/lj</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_zbl.html">zbl</A> </TD></TR></TABLE></DIV> <P>These are pair styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "pair_awpmd.html">awpmd/cut</A></TD><TD ><A HREF = "pair_lj_soft.html">coul/cut/soft</A></TD><TD ><A HREF = "pair_coul_diel.html">coul/diel</A></TD><TD ><A HREF = "pair_lj_soft.html">coul/long/soft</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_eam.html">eam/cd</A></TD><TD ><A HREF = "pair_edip.html">edip</A></TD><TD ><A HREF = "pair_eff.html">eff/cut</A></TD><TD ><A HREF = "pair_gauss.html">gauss/cut</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_list.html">list</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/cut/soft</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/long/soft</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/soft</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/tip4p/long/soft</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/msm</A></TD><TD ><A HREF = "pair_lj_sf.html">lj/sf</A></TD><TD ><A HREF = "pair_meam_spline.html">meam/spline</A></TD><TD ><A HREF = "pair_meam_sw_spline.html">meam/sw/spline</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_reax_c.html">reax/c</A></TD><TD ><A HREF = "pair_sph_heatconduction.html">sph/heatconduction</A></TD><TD ><A HREF = "pair_sph_idealgas.html">sph/idealgas</A></TD><TD ><A HREF = "pair_sph_lj.html">sph/lj</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_sph_rhosum.html">sph/rhosum</A></TD><TD ><A HREF = "pair_sph_taitwater.html">sph/taitwater</A></TD><TD ><A HREF = "pair_sph_taitwater_morris.html">sph/taitwater/morris</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/table</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">tip4p/long/soft</A> </TD></TR></TABLE></DIV> <P>These are accelerated pair styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "pair_adp.html">adp/omp</A></TD><TD ><A HREF = "pair_airebo.html">airebo/omp</A></TD><TD ><A HREF = "pair_beck.html">beck/gpu</A></TD><TD ><A HREF = "pair_beck.html">beck/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_born.html">born/coul/long/cuda</A></TD><TD ><A HREF = "pair_born.html">born/coul/long/gpu</A></TD><TD ><A HREF = "pair_born.html">born/coul/long/omp</A></TD><TD ><A HREF = "pair_born.html">born/coul/msm/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_born.html">born/coul/wolf/gpu</A></TD><TD ><A HREF = "pair_born.html">born/coul/wolf/omp</A></TD><TD ><A HREF = "pair_born.html">born/gpu</A></TD><TD ><A HREF = "pair_born.html">born/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_brownian.html">brownian/omp</A></TD><TD ><A HREF = "pair_brownian.html">brownian/poly/omp</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/cut/cuda</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/cut/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_buck.html">buck/coul/cut/omp</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/long/cuda</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/long/gpu</A></TD><TD ><A HREF = "pair_buck.html">buck/coul/long/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_buck.html">buck/coul/msm/omp</A></TD><TD ><A HREF = "pair_buck.html">buck/cuda</A></TD><TD ><A HREF = "pair_buck_long.html">buck/long/coul/long/omp</A></TD><TD ><A HREF = "pair_buck.html">buck/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_buck.html">buck/omp</A></TD><TD ><A HREF = "pair_colloid.html">colloid/gpu</A></TD><TD ><A HREF = "pair_colloid.html">colloid/omp</A></TD><TD ><A HREF = "pair_comb.html">comb/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_coul.html">coul/cut/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">coul/cut/soft/omp</A></TD><TD ><A HREF = "pair_coul.html">coul/debye/omp</A></TD><TD ><A HREF = "pair_coul.html">coul/dsf/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_coul.html">coul/dsf/omp</A></TD><TD ><A HREF = "pair_coul.html">coul/long/gpu</A></TD><TD ><A HREF = "pair_coul.html">coul/long/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">coul/long/soft/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_coul.html">coul/msm/omp</A></TD><TD ><A HREF = "pair_coul.html">coul/wolf</A></TD><TD ><A HREF = "pair_dpd.html">dpd/omp</A></TD><TD ><A HREF = "pair_dpd.html">dpd/tstat/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_eam.html">eam/alloy/cuda</A></TD><TD ><A HREF = "pair_eam.html">eam/alloy/gpu</A></TD><TD ><A HREF = "pair_eam.html">eam/alloy/omp</A></TD><TD ><A HREF = "pair_eam.html">eam/alloy/opt</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_eam.html">eam/cd/omp</A></TD><TD ><A HREF = "pair_eam.html">eam/cuda</A></TD><TD ><A HREF = "pair_eam.html">eam/fs/cuda</A></TD><TD ><A HREF = "pair_eam.html">eam/fs/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_eam.html">eam/fs/omp</A></TD><TD ><A HREF = "pair_eam.html">eam/fs/opt</A></TD><TD ><A HREF = "pair_eam.html">eam/gpu</A></TD><TD ><A HREF = "pair_eam.html">eam/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_eam.html">eam/opt</A></TD><TD ><A HREF = "pair_edip.html">edip/omp</A></TD><TD ><A HREF = "pair_eim.html">eim/omp</A></TD><TD ><A HREF = "pair_gauss.html">gauss/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_gauss.html">gauss/omp</A></TD><TD ><A HREF = "pair_gayberne.html">gayberne/gpu</A></TD><TD ><A HREF = 
"pair_gayberne.html">gayberne/omp</A></TD><TD ><A HREF = "pair_gran.html">gran/hertz/history/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_gran.html">gran/hooke/cuda</A></TD><TD ><A HREF = "pair_gran.html">gran/hooke/history/omp</A></TD><TD ><A HREF = "pair_gran.html">gran/hooke/omp</A></TD><TD ><A HREF = "pair_hbond_dreiding.html">hbond/dreiding/lj/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_hbond_dreiding.html">hbond/dreiding/morse/omp</A></TD><TD ><A HREF = "pair_line_lj.html">line/lj/omp</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/charmm/cuda</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/charmm/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/charmm/implicit/cuda</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/charmm/implicit/omp</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/long/cuda</A></TD><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/long/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_charmm.html">lj/charmm/coul/long/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/charmm/coul/long/soft</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/charmm/coul/long/soft/omp</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/coul/cut/cuda</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_class2.html">lj/class2/coul/cut/omp</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/coul/long/cuda</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/coul/long/gpu</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/coul/long/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_class2.html">lj/class2/coul/msm/omp</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/cuda</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/gpu</A></TD><TD ><A HREF = "pair_class2.html">lj/class2/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj_long.html">lj/long/coul/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/cut/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/cut/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/cut/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/cut/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/debye/omp</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/coul/dsf/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/dsf/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/gpu</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/long/opt</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/msm/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/coul/msm/opt</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/long/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/cuda</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/gpu</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/gpu</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/experimental/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/gpu</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/opt</A></TD><TD ><A 
HREF = "pair_lj_soft.html">lj/cut/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/cut/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/tip4p/long/soft/omp</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/cuda</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_lj_expand.html">lj/expand/gpu</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/omp</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/cuda</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_gromacs.html">lj/gromacs/cuda</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/gpu</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/omp</A></TD><TD ><A HREF = "pair_lj_long.html">lj/long/coul/long/opt</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/gpu</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/omp</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/gpu</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/msm/omp</A></TD><TD ><A HREF = "pair_lj_sf.html">lj/sf/omp</A></TD><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/cuda</A></TD><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_lj_smooth_linear.html">lj/smooth/linear/omp</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/cuda</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/gpu</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_lubricate.html">lubricate/omp</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate/poly/omp</A></TD><TD ><A HREF = "pair_meam_spline.html">meam/spline/omp</A></TD><TD ><A HREF = "pair_mie.html">mie/cut/gpu</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_morse.html">morse/cuda</A></TD><TD ><A HREF = "pair_morse.html">morse/gpu</A></TD><TD ><A HREF = "pair_morse.html">morse/omp</A></TD><TD ><A HREF = "pair_morse.html">morse/opt</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_nb3b_harmonic.html">nb3b/harmonic/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/cut/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/long/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_peri.html">peri/lps/omp</A></TD><TD ><A HREF = "pair_peri.html">peri/pmb/omp</A></TD><TD ><A HREF = "pair_airebo.html">rebo/omp</A></TD><TD ><A HREF = "pair_resquared.html">resquared/gpu</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_resquared.html">resquared/omp</A></TD><TD ><A HREF = "pair_soft.html">soft/gpu</A></TD><TD ><A HREF = "pair_soft.html">soft/omp</A></TD><TD ><A HREF = "pair_sw.html">sw/cuda</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_sw.html">sw/gpu</A></TD><TD ><A HREF = "pair_sw.html">sw/omp</A></TD><TD ><A HREF = "pair_table.html">table/gpu</A></TD><TD ><A HREF = "pair_table.html">table/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_tersoff.html">tersoff/cuda</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/omp</A></TD><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod/omp</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/table/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_tersoff_zbl.html">tersoff/zbl/omp</A></TD><TD ><A HREF = "pair_coul.html">tip4p/cut/omp</A></TD><TD ><A 
HREF = "pair_coul.html">tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">tip4p/long/soft/omp</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_tri_lj.html">tri/lj/omp</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa/gpu</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa/omp</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/gpu</A></TD></TR> -<TR ALIGN="center"><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/omp</A></TD><TD ><A HREF = "pair_zbl.html">zbl/omp</A> +<TR ALIGN="center"><TD ><A HREF = "pair_lj_soft.html">lj/cut/coul/long/soft/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/cuda</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/kk</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/gpu</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/cut/omp</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/gpu</A></TD><TD ><A HREF = "pair_dipole.html">lj/cut/dipole/sf/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/experimental/cuda</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/gpu</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/soft/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/cut/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj.html">lj/cut/tip4p/long/opt</A></TD><TD ><A HREF = "pair_lj_soft.html">lj/cut/tip4p/long/soft/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_lj_expand.html">lj/expand/cuda</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/gpu</A></TD><TD ><A HREF = "pair_lj_expand.html">lj/expand/omp</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/cuda</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_gromacs.html">lj/gromacs/coul/gromacs/omp</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/cuda</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/gpu</A></TD><TD ><A HREF = "pair_gromacs.html">lj/gromacs/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_lj_long.html">lj/long/coul/long/opt</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/gpu</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/omp</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/gpu</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/long/omp</A></TD><TD ><A HREF = "pair_sdk.html">lj/sdk/coul/msm/omp</A></TD><TD ><A HREF = "pair_lj_sf.html">lj/sf/omp</A></TD><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/cuda</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_lj_smooth.html">lj/smooth/omp</A></TD><TD ><A HREF = "pair_lj_smooth_linear.html">lj/smooth/linear/omp</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/cuda</A></TD><TD ><A HREF = "pair_lj96.html">lj96/cut/gpu</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_lj96.html">lj96/cut/omp</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate/omp</A></TD><TD ><A HREF = "pair_lubricate.html">lubricate/poly/omp</A></TD><TD ><A HREF = "pair_meam_spline.html">meam/spline/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_mie.html">mie/cut/gpu</A></TD><TD ><A HREF = "pair_morse.html">morse/cuda</A></TD><TD ><A HREF = "pair_morse.html">morse/gpu</A></TD><TD ><A HREF = "pair_morse.html">morse/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_morse.html">morse/opt</A></TD><TD ><A HREF = "pair_nb3b_harmonic.html">nb3b/harmonic/omp</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/omp</A></TD><TD 
><A HREF = "pair_nm.html">nm/cut/coul/cut/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_nm.html">nm/cut/coul/long/omp</A></TD><TD ><A HREF = "pair_peri.html">peri/lps/omp</A></TD><TD ><A HREF = "pair_peri.html">peri/pmb/omp</A></TD><TD ><A HREF = "pair_airebo.html">rebo/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_resquared.html">resquared/gpu</A></TD><TD ><A HREF = "pair_resquared.html">resquared/omp</A></TD><TD ><A HREF = "pair_soft.html">soft/gpu</A></TD><TD ><A HREF = "pair_soft.html">soft/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_sw.html">sw/cuda</A></TD><TD ><A HREF = "pair_sw.html">sw/gpu</A></TD><TD ><A HREF = "pair_sw.html">sw/omp</A></TD><TD ><A HREF = "pair_table.html">table/gpu</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_table.html">table/kk</A></TD><TD ><A HREF = "pair_table.html">table/omp</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/cuda</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod/omp</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff/table/omp</A></TD><TD ><A HREF = "pair_tersoff_zbl.html">tersoff/zbl/omp</A></TD><TD ><A HREF = "pair_coul.html">tip4p/cut/omp</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_coul.html">tip4p/long/omp</A></TD><TD ><A HREF = "pair_lj_soft.html">tip4p/long/soft/omp</A></TD><TD ><A HREF = "pair_tri_lj.html">tri/lj/omp</A></TD><TD ><A HREF = "pair_yukawa.html">yukawa/gpu</A></TD></TR> +<TR ALIGN="center"><TD ><A HREF = "pair_yukawa.html">yukawa/omp</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/gpu</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid/omp</A></TD><TD ><A HREF = "pair_zbl.html">zbl/omp</A> </TD></TR></TABLE></DIV> <HR> <H4>Bond_style potentials </H4> <P>See the <A HREF = "bond_style.html">bond_style</A> command for an overview of bond potentials. Click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "bond_none.html">none</A></TD><TD WIDTH="100"><A HREF = "bond_hybrid.html">hybrid</A></TD><TD WIDTH="100"><A HREF = "bond_class2.html">class2</A></TD><TD WIDTH="100"><A HREF = "bond_fene.html">fene</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "bond_fene_expand.html">fene/expand</A></TD><TD WIDTH="100"><A HREF = "bond_harmonic.html">harmonic</A></TD><TD WIDTH="100"><A HREF = "bond_morse.html">morse</A></TD><TD WIDTH="100"><A HREF = "bond_nonlinear.html">nonlinear</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "bond_quartic.html">quartic</A></TD><TD WIDTH="100"><A HREF = "bond_table.html">table</A> </TD></TR></TABLE></DIV> <P>These are bond styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "bond_harmonic_shift.html">harmonic/shift</A></TD><TD ><A HREF = "bond_harmonic_shift_cut.html">harmonic/shift/cut</A> </TD></TR></TABLE></DIV> <P>These are accelerated bond styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "bond_class2.html">class2/omp</A></TD><TD WIDTH="100"><A HREF = "bond_fene.html">fene/omp</A></TD><TD WIDTH="100"><A HREF = "bond_fene_expand.html">fene/expand/omp</A></TD><TD WIDTH="100"><A HREF = "bond_harmonic.html">harmonic/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "bond_harmonic_shift.html">harmonic/shift/omp</A></TD><TD WIDTH="100"><A HREF = "bond_harmonic_shift_cut.html">harmonic/shift/cut/omp</A></TD><TD WIDTH="100"><A HREF = "bond_morse.html">morse/omp</A></TD><TD WIDTH="100"><A HREF = "bond_nonlinear.html">nonlinear/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "bond_quartic.html">quartic/omp</A></TD><TD WIDTH="100"><A HREF = "bond_table.html">table/omp</A> </TD></TR></TABLE></DIV> <HR> <H4>Angle_style potentials </H4> <P>See the <A HREF = "angle_style.html">angle_style</A> command for an overview of angle potentials. Click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "angle_none.html">none</A></TD><TD WIDTH="100"><A HREF = "angle_hybrid.html">hybrid</A></TD><TD WIDTH="100"><A HREF = "angle_charmm.html">charmm</A></TD><TD WIDTH="100"><A HREF = "angle_class2.html">class2</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "angle_cosine.html">cosine</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_delta.html">cosine/delta</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_periodic.html">cosine/periodic</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_squared.html">cosine/squared</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "angle_harmonic.html">harmonic</A></TD><TD WIDTH="100"><A HREF = "angle_table.html">table</A> </TD></TR></TABLE></DIV> <P>These are angle styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "angle_sdk.html">sdk</A></TD><TD ><A HREF = "angle_cosine_shift.html">cosine/shift</A></TD><TD ><A HREF = "angle_cosine_shift_exp.html">cosine/shift/exp</A></TD><TD ><A HREF = "angle_dipole.html">dipole</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "angle_fourier.html">fourier</A></TD><TD ><A HREF = "angle_fourier_simple.html">fourier/simple</A></TD><TD ><A HREF = "angle_quartic.html">quartic</A> </TD></TR></TABLE></DIV> <P>These are accelerated angle styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "angle_charmm.html">charmm/omp</A></TD><TD WIDTH="100"><A HREF = "angle_class2.html">class2/omp</A></TD><TD WIDTH="100"><A HREF = "angle_cosine.html">cosine/omp</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_delta.html">cosine/delta/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "angle_cosine_periodic.html">cosine/periodic/omp</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_shift.html">cosine/shift/omp</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_shift_exp.html">cosine/shift/exp/omp</A></TD><TD WIDTH="100"><A HREF = "angle_cosine_squared.html">cosine/squared/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "angle_dipole.html">dipole/omp</A><A HREF = "angle_fourier.html">fourier/omp</A></TD><TD WIDTH="100"><A HREF = "angle_fourier_simple.html">fourier/simple/omp</A></TD><TD WIDTH="100"><A HREF = "angle_harmonic.html">harmonic/omp</A></TD><TD WIDTH="100"><A HREF = "angle_quartic.html">quartic/omp</A><A HREF = "angle_table.html">table/omp</A> </TD></TR></TABLE></DIV> <HR> <H4>Dihedral_style potentials </H4> <P>See the <A HREF = "dihedral_style.html">dihedral_style</A> command for an overview of dihedral potentials. Click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "dihedral_none.html">none</A></TD><TD WIDTH="100"><A HREF = "dihedral_hybrid.html">hybrid</A></TD><TD WIDTH="100"><A HREF = "dihedral_charmm.html">charmm</A></TD><TD WIDTH="100"><A HREF = "dihedral_class2.html">class2</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "dihedral_harmonic.html">harmonic</A></TD><TD WIDTH="100"><A HREF = "dihedral_helix.html">helix</A></TD><TD WIDTH="100"><A HREF = "dihedral_multi_harmonic.html">multi/harmonic</A></TD><TD WIDTH="100"><A HREF = "dihedral_opls.html">opls</A> </TD></TR></TABLE></DIV> <P>These are dihedral styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "dihedral_cosine_shift_exp.html">cosine/shift/exp</A></TD><TD ><A HREF = "dihedral_fourier.html">fourier</A></TD><TD ><A HREF = "dihedral_nharmonic.html">nharmonic</A></TD><TD ><A HREF = "dihedral_quadratic.html">quadratic</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "dihedral_table.html">table</A> </TD></TR></TABLE></DIV> <P>These are accelerated dihedral styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "dihedral_charmm.html">charmm/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_class2.html">class2/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_cosine_shift_exp.html">cosine/shift/exp/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_fourier.html">fourier/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "dihedral_harmonic.html">harmonic/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_helix.html">helix/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_multi_harmonic.html">multi/harmonic/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_nharmonic.html">nharmonic/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "dihedral_opls.html">opls/omp</A><A HREF = "dihedral_quadratic.html">quadratic/omp</A></TD><TD WIDTH="100"><A HREF = "dihedral_table.html">table/omp</A> </TD></TR></TABLE></DIV> <HR> <H4>Improper_style potentials </H4> <P>See the <A HREF = "improper_style.html">improper_style</A> command for an overview of improper potentials. Click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "improper_none.html">none</A></TD><TD WIDTH="100"><A HREF = "improper_hybrid.html">hybrid</A></TD><TD WIDTH="100"><A HREF = "improper_class2.html">class2</A></TD><TD WIDTH="100"><A HREF = "improper_cvff.html">cvff</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "improper_harmonic.html">harmonic</A></TD><TD WIDTH="100"><A HREF = "improper_umbrella.html">umbrella</A> </TD></TR></TABLE></DIV> <P>These are improper styles contributed by users, which can be used if <A HREF = "Section_start.html#start_3">LAMMPS is built with the appropriate package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "improper_cossq.html">cossq</A></TD><TD ><A HREF = "improper_fourier.html">fourier</A></TD><TD ><A HREF = "improper_ring.html">ring</A> </TD></TR></TABLE></DIV> <P>These are accelerated improper styles, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "improper_class2.html">class2/omp</A></TD><TD WIDTH="100"><A HREF = "improper_cossq.html">cossq/omp</A></TD><TD WIDTH="100"><A HREF = "improper_cvff.html">cvff/omp</A></TD><TD WIDTH="100"><A HREF = "improper_fourier.html">fourier/omp</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "improper_harmonic.html">harmonic/omp</A></TD><TD WIDTH="100"><A HREF = "improper_ring.html">ring/omp</A></TD><TD WIDTH="100"><A HREF = "improper_umbrella.html">umbrella/omp</A> </TD></TR></TABLE></DIV> <HR> <H4>Kspace solvers </H4> <P>See the <A HREF = "kspace_style.html">kspace_style</A> command for an overview of Kspace solvers. 
Click on the style itself for a full description: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD WIDTH="100"><A HREF = "kspace_style.html">ewald</A></TD><TD WIDTH="100"><A HREF = "kspace_style.html">ewald/disp</A></TD><TD WIDTH="100"><A HREF = "kspace_style.html">msm</A></TD><TD WIDTH="100"><A HREF = "kspace_style.html">msm/cg</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "kspace_style.html">pppm</A></TD><TD WIDTH="100"><A HREF = "kspace_style.html">pppm/cg</A></TD><TD WIDTH="100"><A HREF = "kspace_style.html">pppm/disp</A></TD><TD WIDTH="100"><A HREF = "kspace_style.html">pppm/disp/tip4p</A></TD></TR> <TR ALIGN="center"><TD WIDTH="100"><A HREF = "kspace_style.html">pppm/tip4p</A> </TD></TR></TABLE></DIV> <P>These are accelerated Kspace solvers, which can be used if LAMMPS is built with the <A HREF = "Section_accelerate.html">appropriate accelerated package</A>. </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD ><A HREF = "kspace_style.html">ewald/omp</A></TD><TD ><A HREF = "kspace_style.html">msm/omp</A></TD><TD ><A HREF = "kspace_style.html">msm/cg/omp</A></TD><TD ><A HREF = "kspace_style.html">pppm/cuda</A></TD></TR> <TR ALIGN="center"><TD ><A HREF = "kspace_style.html">pppm/gpu</A></TD><TD ><A HREF = "kspace_style.html">pppm/omp</A></TD><TD ><A HREF = "kspace_style.html">pppm/cg/omp</A></TD><TD ><A HREF = "kspace_style.html">pppm/tip4p/omp</A> </TD></TR></TABLE></DIV> </HTML> diff --git a/doc/Section_commands.txt b/doc/Section_commands.txt index 9af13f7e6..3c65f7b19 100644 --- a/doc/Section_commands.txt +++ b/doc/Section_commands.txt @@ -1,1252 +1,1255 @@ "Previous Section"_Section_start.html - "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc - "Next Section"_Section_packages.html :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line 3. Commands :h3 This section describes how a LAMMPS input script is formatted and the input script commands used to define a LAMMPS simulation. 3.1 "LAMMPS input script"_#cmd_1 3.2 "Parsing rules"_#cmd_2 3.3 "Input script structure"_#cmd_3 3.4 "Commands listed by category"_#cmd_4 3.5 "Commands listed alphabetically"_#cmd_5 :all(b) :line :line 3.1 LAMMPS input script :link(cmd_1),h4 LAMMPS executes by reading commands from an input script (text file), one line at a time. When the input script ends, LAMMPS exits. Each command causes LAMMPS to take some action. It may set an internal variable, read in a file, or run a simulation. Most commands have default settings, which means you only need to use the command if you wish to change the default. In many cases, the ordering of commands in an input script is not important. However, the following rules apply: (1) LAMMPS does not read your entire input script and then perform a simulation with all the settings. Rather, the input script is read one line at a time and each command takes effect when it is read. Thus this sequence of commands: timestep 0.5 run 100 run 100 :pre does something different than this sequence: run 100 timestep 0.5 run 100 :pre In the first case, the specified timestep (0.5 fmsec) is used for two simulations of 100 timesteps each. In the 2nd case, the default timestep (1.0 fmsec) is used for the 1st 100-step simulation and a 0.5 fmsec timestep is used for the 2nd one. (2) Some commands are only valid when they follow other commands.
For example, you cannot set the temperature of a group of atoms until atoms have been defined and a group command is used to define which atoms belong to the group. (3) Sometimes command B will use values that can be set by command A. This means command A must precede command B in the input script if it is to have the desired effect. For example, the "read_data"_read_data.html command initializes the system by setting up the simulation box and assigning atoms to processors. If default values are not desired, the "processors"_processors.html and "boundary"_boundary.html commands need to be used before read_data to tell LAMMPS how to map processors to the simulation box. Many input script errors are detected by LAMMPS and an ERROR or WARNING message is printed. "This section"_Section_errors.html gives more information on what errors mean. The documentation for each command lists restrictions on how the command can be used. :line 3.2 Parsing rules :link(cmd_2),h4 Each non-blank line in the input script is treated as a command. LAMMPS commands are case sensitive. Command names are lower-case, as are specified command arguments. Upper case letters may be used in file names or user-chosen ID strings. Here is how each line in the input script is parsed by LAMMPS: (1) If the last printable character on the line is a "&" character (with no surrounding quotes), the command is assumed to continue on the next line. The next line is concatenated to the previous line by removing the "&" character and newline. This allows long commands to be continued across two or more lines. (2) All characters from the first "#" character onward are treated as a comment and discarded. See an exception in (6). Note that a comment after a trailing "&" character will prevent the command from continuing on the next line. Also note that for multi-line commands, a single leading "#" will comment out the entire command. (3) The line is searched repeatedly for $ characters, which indicate variables that are replaced with a text string. See an exception in (6). If the $ is followed by curly brackets, then the variable name is the text inside the curly brackets. If no curly brackets follow the $, then the variable name is the single character immediately following the $. Thus $\{myTemp\} and $x refer to variable names "myTemp" and "x". How the variable is converted to a text string depends on what style of variable it is; see the "variable"_variable.html doc page for details. It can be a variable that stores multiple text strings, and returns one of them. The returned text string can be multiple "words" (space separated) which will then be interpreted as multiple arguments in the input command. The variable can also store a numeric formula which will be evaluated and its numeric result returned as a string. As a special case, if the $ is followed by parentheses, then the text inside the parentheses is treated as an "immediate" variable and evaluated as an "equal-style variable"_variable.html. This is a way to use numeric formulas in an input script without having to assign them to variable names. For example, these 3 input script lines: variable X equal (xlo+xhi)/2+sqrt(v_area) region 1 block $X 2 INF INF EDGE EDGE variable X delete :pre can be replaced by region 1 block $((xlo+xhi)/2+sqrt(v_area)) 2 INF INF EDGE EDGE :pre so that you do not have to define (or discard) a temporary variable X. Note that neither the curly-bracket nor the immediate form of variables can contain nested $ characters for other variables to substitute for.
Thus you cannot do this: variable a equal 2 variable b2 equal 4 print "B2 = $\{b$a\}" :pre Nor can you specify this $($x-1.0) for an immediate variable, but you could use $(v_x-1.0), since the latter is valid syntax for an "equal-style variable"_variable.html. See the "variable"_variable.html command for more details of how strings are assigned to variables and evaluated, and how they can be used in input script commands. (4) The line is broken into "words" separated by whitespace (tabs, spaces). Note that words can thus contain letters, digits, underscores, or punctuation characters. (5) The first word is the command name. All successive words in the line are arguments. (6) If you want text with spaces to be treated as a single argument, it can be enclosed in either double or single quotes. A long single argument enclosed in quotes can even span multiple lines if the "&" character is used, as described above. E.g. print "Volume = $v" print 'Volume = $v' variable a string "red green blue & purple orange cyan" if "${steps} > 1000" then quit :pre The quotes are removed when the single argument is stored internally. See the "dump modify format"_dump_modify.html, "print"_print.html, or "if"_if.html commands for examples. A "#" or "$" character that is between quotes will not be treated as a comment indicator in (2) or substituted for as a variable in (3). IMPORTANT NOTE: If the argument is itself a command that requires a quoted argument (e.g. using a "print"_print.html command as part of an "if"_if.html or "run every"_run.html command), then the double and single quotes can be nested in the usual manner. See the doc pages for those commands for examples. Only one level of nesting is allowed, but that should be sufficient for most use cases. :line 3.3 Input script structure :h4,link(cmd_3) This section describes the structure of a typical LAMMPS input script. The "examples" directory in the LAMMPS distribution contains many sample input scripts; the corresponding problems are discussed in "Section_example"_Section_example.html, and animated on the "LAMMPS WWW Site"_lws. A LAMMPS input script typically has 4 parts: Initialization Atom definition Settings Run a simulation :ol The last 2 parts can be repeated as many times as desired. I.e. run a simulation, change some settings, run some more, etc. Each of the 4 parts is now described in more detail. Remember that almost all the commands need only be used if a non-default value is desired. (1) Initialization Set parameters that need to be defined before atoms are created or read in from a file. The relevant commands are "units"_units.html, "dimension"_dimension.html, "newton"_newton.html, "processors"_processors.html, "boundary"_boundary.html, "atom_style"_atom_style.html, "atom_modify"_atom_modify.html. If force-field parameters appear in the files that will be read, these commands tell LAMMPS what kinds of force fields are being used: "pair_style"_pair_style.html, "bond_style"_bond_style.html, "angle_style"_angle_style.html, "dihedral_style"_dihedral_style.html, "improper_style"_improper_style.html. (2) Atom definition There are 3 ways to define atoms in LAMMPS. Read them in from a data or restart file via the "read_data"_read_data.html or "read_restart"_read_restart.html commands. These files can contain molecular topology information. Or create atoms on a lattice (with no molecular topology), using these commands: "lattice"_lattice.html, "region"_region.html, "create_box"_create_box.html, "create_atoms"_create_atoms.html.
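For example, a minimal lattice-based setup (sketched here only as an illustration; the lattice style, scale factor, and region bounds are arbitrary choices, similar to the melt problem in the examples directory) might look like this:

lattice fcc 0.8442
region mybox block 0 10 0 10 0 10
create_box 1 mybox
create_atoms 1 box :pre

Here "mybox" is just a user-chosen region ID; see the doc pages linked above for the full syntax of each command.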
The entire set of atoms can be duplicated to make a larger simulation using the "replicate"_replicate.html command. (3) Settings Once atoms and molecular topology are defined, a variety of settings can be specified: force field coefficients, simulation parameters, output options, etc. Force field coefficients are set by these commands (they can also be set in the read-in files): "pair_coeff"_pair_coeff.html, "bond_coeff"_bond_coeff.html, "angle_coeff"_angle_coeff.html, "dihedral_coeff"_dihedral_coeff.html, "improper_coeff"_improper_coeff.html, "kspace_style"_kspace_style.html, "dielectric"_dielectric.html, "special_bonds"_special_bonds.html. Various simulation parameters are set by these commands: "neighbor"_neighbor.html, "neigh_modify"_neigh_modify.html, "group"_group.html, "timestep"_timestep.html, "reset_timestep"_reset_timestep.html, "run_style"_run_style.html, "min_style"_min_style.html, "min_modify"_min_modify.html. Fixes impose a variety of boundary conditions, time integration, and diagnostic options. The "fix"_fix.html command comes in many flavors. Various computations can be specified for execution during a simulation using the "compute"_compute.html, "compute_modify"_compute_modify.html, and "variable"_variable.html commands. Output options are set by the "thermo"_thermo.html, "dump"_dump.html, and "restart"_restart.html commands. (4) Run a simulation A molecular dynamics simulation is run using the "run"_run.html command. Energy minimization (molecular statics) is performed using the "minimize"_minimize.html command. A parallel tempering (replica-exchange) simulation can be run using the "temper"_temper.html command. :line 3.4 Commands listed by category :link(cmd_4),h4 This section lists all LAMMPS commands, grouped by category. The "next section"_#cmd_5 lists the same commands alphabetically. Note that some style options for some commands are part of specific LAMMPS packages, which means they cannot be used unless the package was included when LAMMPS was built. Not all packages are included in a default LAMMPS build. These dependencies are listed as Restrictions in the command's documentation. 
Initialization: "atom_modify"_atom_modify.html, "atom_style"_atom_style.html, "boundary"_boundary.html, "dimension"_dimension.html, "newton"_newton.html, "processors"_processors.html, "units"_units.html Atom definition: "create_atoms"_create_atoms.html, "create_box"_create_box.html, "lattice"_lattice.html, "read_data"_read_data.html, "read_dump"_read_dump.html, "read_restart"_read_restart.html, "region"_region.html, "replicate"_replicate.html Force fields: "angle_coeff"_angle_coeff.html, "angle_style"_angle_style.html, "bond_coeff"_bond_coeff.html, "bond_style"_bond_style.html, "dielectric"_dielectric.html, "dihedral_coeff"_dihedral_coeff.html, "dihedral_style"_dihedral_style.html, "improper_coeff"_improper_coeff.html, "improper_style"_improper_style.html, "kspace_modify"_kspace_modify.html, "kspace_style"_kspace_style.html, "pair_coeff"_pair_coeff.html, "pair_modify"_pair_modify.html, "pair_style"_pair_style.html, "pair_write"_pair_write.html, "special_bonds"_special_bonds.html Settings: "comm_style"_comm_style.html, "group"_group.html, "mass"_mass.html, "min_modify"_min_modify.html, "min_style"_min_style.html, "neigh_modify"_neigh_modify.html, "neighbor"_neighbor.html, "reset_timestep"_reset_timestep.html, "run_style"_run_style.html, "set"_set.html, "timestep"_timestep.html, "velocity"_velocity.html Fixes: "fix"_fix.html, "fix_modify"_fix_modify.html, "unfix"_unfix.html Computes: "compute"_compute.html, "compute_modify"_compute_modify.html, "uncompute"_uncompute.html Output: "dump"_dump.html, "dump image"_dump_image.html, "dump_modify"_dump_modify.html, "dump movie"_dump_image.html, "restart"_restart.html, "thermo"_thermo.html, "thermo_modify"_thermo_modify.html, "thermo_style"_thermo_style.html, "undump"_undump.html, "write_data"_write_data.html, "write_dump"_write_dump.html, "write_restart"_write_restart.html Actions: "delete_atoms"_delete_atoms.html, "delete_bonds"_delete_bonds.html, "displace_atoms"_displace_atoms.html, "change_box"_change_box.html, "minimize"_minimize.html, "neb"_neb.html, "prd"_prd.html, "rerun"_rerun.html, "run"_run.html, "temper"_temper.html Miscellaneous: "clear"_clear.html, "echo"_echo.html, "if"_if.html, "include"_include.html, "jump"_jump.html, "label"_label.html, "log"_log.html, "next"_next.html, "print"_print.html, "shell"_shell.html, "variable"_variable.html :line 3.5 Individual commands :h4,link(cmd_5),link(comm) This section lists all LAMMPS commands alphabetically, with a separate listing below of styles within certain commands. The "previous section"_#cmd_4 lists the same commands, grouped by category. Note that some style options for some commands are part of specific LAMMPS packages, which means they cannot be used unless the package was included when LAMMPS was built. Not all packages are included in a default LAMMPS build. These dependencies are listed as Restrictions in the command's documentation.
"angle_coeff"_angle_coeff.html, "angle_style"_angle_style.html, "atom_modify"_atom_modify.html, "atom_style"_atom_style.html, "balance"_balance.html, "bond_coeff"_bond_coeff.html, "bond_style"_bond_style.html, "boundary"_boundary.html, "box"_box.html, "change_box"_change_box.html, "clear"_clear.html, "comm_modify"_comm_modify.html, "comm_style"_comm_style.html, "compute"_compute.html, "compute_modify"_compute_modify.html, "create_atoms"_create_atoms.html, "create_box"_create_box.html, "delete_atoms"_delete_atoms.html, "delete_bonds"_delete_bonds.html, "dielectric"_dielectric.html, "dihedral_coeff"_dihedral_coeff.html, "dihedral_style"_dihedral_style.html, "dimension"_dimension.html, "displace_atoms"_displace_atoms.html, "dump"_dump.html, "dump image"_dump_image.html, "dump_modify"_dump_modify.html, "dump movie"_dump_image.html, "echo"_echo.html, "fix"_fix.html, "fix_modify"_fix_modify.html, "group"_group.html, "if"_if.html, "improper_coeff"_improper_coeff.html, "improper_style"_improper_style.html, "include"_include.html, "jump"_jump.html, "kspace_modify"_kspace_modify.html, "kspace_style"_kspace_style.html, "label"_label.html, "lattice"_lattice.html, "log"_log.html, "mass"_mass.html, "minimize"_minimize.html, "min_modify"_min_modify.html, "min_style"_min_style.html, "molecule"_molecule.html, "neb"_neb.html, "neigh_modify"_neigh_modify.html, "neighbor"_neighbor.html, "newton"_newton.html, "next"_next.html, "package"_package.html, "pair_coeff"_pair_coeff.html, "pair_modify"_pair_modify.html, "pair_style"_pair_style.html, "pair_write"_pair_write.html, "partition"_partition.html, "prd"_prd.html, "print"_print.html, "processors"_processors.html, "quit"_quit.html, "read_data"_read_data.html, "read_dump"_read_dump.html, "read_restart"_read_restart.html, "region"_region.html, "replicate"_replicate.html, "rerun"_rerun.html, "reset_timestep"_reset_timestep.html, "restart"_restart.html, "run"_run.html, "run_style"_run_style.html, "set"_set.html, "shell"_shell.html, "special_bonds"_special_bonds.html, "suffix"_suffix.html, "tad"_tad.html, "temper"_temper.html, "thermo"_thermo.html, "thermo_modify"_thermo_modify.html, "thermo_style"_thermo_style.html, "timestep"_timestep.html, "uncompute"_uncompute.html, "undump"_undump.html, "unfix"_unfix.html, "units"_units.html, "variable"_variable.html, "velocity"_velocity.html, "write_data"_write_data.html, "write_dump"_write_dump.html, "write_restart"_write_restart.html :tb(c=6,ea=c) These are commands contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. 
"group2ndx"_group2ndx.html :tb(c=1,ea=c) :line Fix styles :h4 See the "fix"_fix.html command for one-line descriptions of each style or click on the style itself for a full description: "adapt"_fix_adapt.html, "addforce"_fix_addforce.html, "append/atoms"_fix_append_atoms.html, "aveforce"_fix_aveforce.html, "ave/atom"_fix_ave_atom.html, "ave/correlate"_fix_ave_correlate.html, "ave/histo"_fix_ave_histo.html, "ave/spatial"_fix_ave_spatial.html, "ave/time"_fix_ave_time.html, "balance"_fix_balance.html, "bond/break"_fix_bond_break.html, "bond/create"_fix_bond_create.html, "bond/swap"_fix_bond_swap.html, "box/relax"_fix_box_relax.html, "deform"_fix_deform.html, "deposit"_fix_deposit.html, "drag"_fix_drag.html, "dt/reset"_fix_dt_reset.html, "efield"_fix_efield.html, "enforce2d"_fix_enforce2d.html, "evaporate"_fix_evaporate.html, "external"_fix_external.html, "freeze"_fix_freeze.html, "gcmc"_fix_gcmc.html, "gld"_fix_gld.html, "gravity"_fix_gravity.html, "heat"_fix_heat.html, "indent"_fix_indent.html, "langevin"_fix_langevin.html, "lineforce"_fix_lineforce.html, "momentum"_fix_momentum.html, "move"_fix_move.html, "msst"_fix_msst.html, "neb"_fix_neb.html, "nph"_fix_nh.html, "nphug"_fix_nphug.html, "nph/asphere"_fix_nph_asphere.html, "nph/sphere"_fix_nph_sphere.html, "npt"_fix_nh.html, "npt/asphere"_fix_npt_asphere.html, "npt/sphere"_fix_npt_sphere.html, "nve"_fix_nve.html, "nve/asphere"_fix_nve_asphere.html, "nve/asphere/noforce"_fix_nve_asphere_noforce.html, "nve/body"_fix_nve_body.html, "nve/limit"_fix_nve_limit.html, "nve/line"_fix_nve_line.html, "nve/noforce"_fix_nve_noforce.html, "nve/sphere"_fix_nve_sphere.html, "nve/tri"_fix_nve_tri.html, "nvt"_fix_nh.html, "nvt/asphere"_fix_nvt_asphere.html, "nvt/sllod"_fix_nvt_sllod.html, "nvt/sphere"_fix_nvt_sphere.html, "oneway"_fix_oneway.html, "orient/fcc"_fix_orient_fcc.html, "planeforce"_fix_planeforce.html, "poems"_fix_poems.html, "pour"_fix_pour.html, "press/berendsen"_fix_press_berendsen.html, "print"_fix_print.html, "property/atom"_fix_property_atom.html, "qeq/comb"_fix_qeq_comb.html, "reax/bonds"_fix_reax_bonds.html, "recenter"_fix_recenter.html, "restrain"_fix_restrain.html, "rigid"_fix_rigid.html, "rigid/nph"_fix_rigid.html, "rigid/npt"_fix_rigid.html, "rigid/nve"_fix_rigid.html, "rigid/nvt"_fix_rigid.html, "rigid/small"_fix_rigid.html, "rigid/small/nph"_fix_rigid.html, "rigid/small/npt"_fix_rigid.html, "rigid/small/nve"_fix_rigid.html, "rigid/small/nvt"_fix_rigid.html, "setforce"_fix_setforce.html, "shake"_fix_shake.html, "spring"_fix_spring.html, "spring/rg"_fix_spring_rg.html, "spring/self"_fix_spring_self.html, "srd"_fix_srd.html, "store/force"_fix_store_force.html, "store/state"_fix_store_state.html, "temp/berendsen"_fix_temp_berendsen.html, "temp/csvr"_fix_temp_csvr.html, "temp/rescale"_fix_temp_rescale.html, "thermal/conductivity"_fix_thermal_conductivity.html, "tmd"_fix_tmd.html, "ttm"_fix_ttm.html, "tune/kspace"_fix_tune_kspace.html, "vector"_fix_vector.html, "viscosity"_fix_viscosity.html, "viscous"_fix_viscous.html, "wall/colloid"_fix_wall.html, "wall/gran"_fix_wall_gran.html, "wall/harmonic"_fix_wall.html, "wall/lj1043"_fix_wall.html, "wall/lj126"_fix_wall.html, "wall/lj93"_fix_wall.html, "wall/piston"_fix_wall_piston.html, "wall/reflect"_fix_wall_reflect.html, "wall/region"_fix_wall_region.html, "wall/srd"_fix_wall_srd.html :tb(c=8,ea=c) These are fix styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. 
"adapt/fep"_fix_adapt_fep.html, "addtorque"_fix_addtorque.html, "atc"_fix_atc.html, "colvars"_fix_colvars.html, "imd"_fix_imd.html, "langevin/eff"_fix_langevin_eff.html, "lb/fluid"_fix_lb_fluid.html, "lb/momentum"_fix_lb_momentum.html, "lb/pc"_fix_lb_pc.html, "lb/rigid/pc/sphere"_fix_lb_rigid_pc_sphere.html, "lb/viscous"_fix_lb_viscous.html, "meso"_fix_meso.html, "meso/stationary"_fix_meso_stationary.html, "nph/eff"_fix_nh_eff.html, "npt/eff"_fix_nh_eff.html, "nve/eff"_fix_nve_eff.html, "nvt/eff"_fix_nh_eff.html, "nvt/sllod/eff"_fix_nvt_sllod_eff.html, "phonon"_fix_phonon.html, "qeq/reax"_fix_qeq_reax.html, "qmmm"_fix_qmmm.html, "reax/c/bonds"_fix_reax_bonds.html, "reax/c/species"_fix_reaxc_species.html, "smd"_fix_smd.html, "temp/rescale/eff"_fix_temp_rescale_eff.html, "ti/rs"_fix_ti_rs.html, "ti/spring"_fix_ti_spring.html :tb(c=6,ea=c) These are accelerated fix styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "freeze/cuda"_fix_freeze.html, "addforce/cuda"_fix_addforce.html, "aveforce/cuda"_fix_aveforce.html, "enforce2d/cuda"_fix_enforce2d.html, "gravity/cuda"_fix_gravity.html, "gravity/omp"_fix_gravity.html, "nph/omp"_fix_nh.html, "nphug/omp"_fix_nphug.html, "nph/asphere/omp"_fix_nph_asphere.html, "nph/sphere/omp"_fix_nph_sphere.html, "npt/cuda"_fix_nh.html, "npt/omp"_fix_nh.html, "npt/asphere/omp"_fix_npt_asphere.html, "npt/sphere/omp"_fix_npt_sphere.html, -"nve/cuda"_fix_nh.html, +"nve/cuda"_fix_nve.html, +"nve/kk"_fix_nve.html, "nve/omp"_fix_nve.html, "nve/sphere/omp"_fix_nve_sphere.html, "nvt/cuda"_fix_nh.html, "nvt/omp"_fix_nh.html, "nvt/asphere/omp"_fix_nvt_asphere.html, "nvt/sllod/omp"_fix_nvt_sllod.html, "nvt/sphere/omp"_fix_nvt_sphere.html, "qeq/comb/omp"_fix_qeq_comb.html, "rigid/omp"_fix_rigid.html, "rigid/nph/omp"_fix_rigid.html, "rigid/npt/omp"_fix_rigid.html, "rigid/nve/omp"_fix_rigid.html, "rigid/nvt/omp"_fix_rigid.html, "rigid/small/omp"_fix_rigid.html, "setforce/cuda"_fix_setforce.html, "shake/cuda"_fix_shake.html, "temp/berendsen/cuda"_fix_temp_berendsen.html, "temp/rescale/cuda"_fix_temp_rescale.html, "temp/rescale/limit/cuda"_fix_temp_rescale.html, "viscous/cuda"_fix_viscous.html :tb(c=6,ea=c) :line Compute styles :h4 See the "compute"_compute.html command for one-line descriptions of each style or click on the style itself for a full description: "angle/local"_compute_angle_local.html, "atom/molecule"_compute_atom_molecule.html, "body/local"_compute_body_local.html, "bond/local"_compute_bond_local.html, "centro/atom"_compute_centro_atom.html, "cluster/atom"_compute_cluster_atom.html, "cna/atom"_compute_cna_atom.html, "com"_compute_com.html, "com/molecule"_compute_com_molecule.html, "contact/atom"_compute_contact_atom.html, "coord/atom"_compute_coord_atom.html, "damage/atom"_compute_damage_atom.html, "dihedral/local"_compute_dihedral_local.html, "dilatation/atom"_compute_dilatation_atom.html, "displace/atom"_compute_displace_atom.html, "erotate/asphere"_compute_erotate_asphere.html, "erotate/rigid"_compute_erotate_rigid.html, "erotate/sphere"_compute_erotate_sphere.html, "erotate/sphere/atom"_compute_erotate_sphere_atom.html, "event/displace"_compute_event_displace.html, "group/group"_compute_group_group.html, "gyration"_compute_gyration.html, "gyration/molecule"_compute_gyration_molecule.html, "heat/flux"_compute_heat_flux.html, "improper/local"_compute_improper_local.html, "inertia/molecule"_compute_inertia_molecule.html, "ke"_compute_ke.html, "ke/atom"_compute_ke_atom.html, 
"ke/rigid"_compute_ke_rigid.html, "msd"_compute_msd.html, "msd/molecule"_compute_msd_molecule.html, "msd/nongauss"_compute_msd_nongauss.html, "pair"_compute_pair.html, "pair/local"_compute_pair_local.html, "pe"_compute_pe.html, "pe/atom"_compute_pe_atom.html, "plasticity/atom"_compute_plasticity_atom.html, "pressure"_compute_pressure.html, "property/atom"_compute_property_atom.html, "property/local"_compute_property_local.html, "property/molecule"_compute_property_molecule.html, "rdf"_compute_rdf.html, "reduce"_compute_reduce.html, "reduce/region"_compute_reduce.html, "slice"_compute_slice.html, "stress/atom"_compute_stress_atom.html, "temp"_compute_temp.html, "temp/asphere"_compute_temp_asphere.html, "temp/com"_compute_temp_com.html, "temp/deform"_compute_temp_deform.html, "temp/partial"_compute_temp_partial.html, "temp/profile"_compute_temp_profile.html, "temp/ramp"_compute_temp_ramp.html, "temp/region"_compute_temp_region.html, "temp/sphere"_compute_temp_sphere.html, "ti"_compute_ti.html, "vacf"_compute_vacf.html, "voronoi/atom"_compute_voronoi_atom.html :tb(c=6,ea=c) These are compute styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. "ackland/atom"_compute_ackland_atom.html, "basal/atom"_compute_basal_atom.html, "fep"_compute_fep.html, "ke/eff"_compute_ke_eff.html, "ke/atom/eff"_compute_ke_atom_eff.html, "meso_e/atom"_compute_meso_e_atom.html, "meso_rho/atom"_compute_meso_rho_atom.html, "meso_t/atom"_compute_meso_t_atom.html, "temp/eff"_compute_temp_eff.html, "temp/deform/eff"_compute_temp_deform_eff.html, "temp/region/eff"_compute_temp_region_eff.html, "temp/rotate"_compute_temp_rotate.html :tb(c=6,ea=c) These are accelerated compute styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "pe/cuda"_compute_pe.html, "pressure/cuda"_compute_pressure.html, "temp/cuda"_compute_temp.html, "temp/partial/cuda"_compute_temp_partial.html :tb(c=6,ea=c) :line Pair_style potentials :h4 See the "pair_style"_pair_style.html command for an overview of pair potentials. 
Click on the style itself for a full description: "none"_pair_none.html, "hybrid"_pair_hybrid.html, "hybrid/overlay"_pair_hybrid.html, "adp"_pair_adp.html, "airebo"_pair_airebo.html, "beck"_pair_beck.html, "body"_pair_body.html, "bop"_pair_bop.html, "born"_pair_born.html, "born/coul/long"_pair_born.html, "born/coul/msm"_pair_born.html, "born/coul/wolf"_pair_born.html, "brownian"_pair_brownian.html, "brownian/poly"_pair_brownian.html, "buck"_pair_buck.html, "buck/coul/cut"_pair_buck.html, "buck/coul/long"_pair_buck.html, "buck/coul/msm"_pair_buck.html, "buck/long/coul/long"_pair_buck_long.html, "colloid"_pair_colloid.html, "comb"_pair_comb.html, "comb3"_pair_comb.html, "coul/cut"_pair_coul.html, "coul/debye"_pair_coul.html, "coul/dsf"_pair_coul.html, "coul/long"_pair_coul.html, "coul/msm"_pair_coul.html, "coul/wolf"_pair_coul.html, "dpd"_pair_dpd.html, "dpd/tstat"_pair_dpd.html, "dsmc"_pair_dsmc.html, "eam"_pair_eam.html, "eam/alloy"_pair_eam.html, "eam/fs"_pair_eam.html, "eim"_pair_eim.html, "gauss"_pair_gauss.html, "gayberne"_pair_gayberne.html, "gran/hertz/history"_pair_gran.html, "gran/hooke"_pair_gran.html, "gran/hooke/history"_pair_gran.html, "hbond/dreiding/lj"_pair_hbond_dreiding.html, "hbond/dreiding/morse"_pair_hbond_dreiding.html, "kim"_pair_kim.html, "lcbop"_pair_lcbop.html, "line/lj"_pair_line_lj.html, "lj/charmm/coul/charmm"_pair_charmm.html, "lj/charmm/coul/charmm/implicit"_pair_charmm.html, "lj/charmm/coul/long"_pair_charmm.html, "lj/charmm/coul/msm"_pair_charmm.html, "lj/class2"_pair_class2.html, "lj/class2/coul/cut"_pair_class2.html, "lj/class2/coul/long"_pair_class2.html, "lj/cut"_pair_lj.html, "lj/cut/coul/cut"_pair_lj.html, "lj/cut/coul/debye"_pair_lj.html, "lj/cut/coul/dsf"_pair_lj.html, "lj/cut/coul/long"_pair_lj.html, "lj/cut/coul/msm"_pair_lj.html, "lj/cut/dipole/cut"_pair_dipole.html, "lj/cut/dipole/long"_pair_dipole.html, "lj/cut/tip4p/cut"_pair_lj.html, "lj/cut/tip4p/long"_pair_lj.html, "lj/expand"_pair_lj_expand.html, "lj/gromacs"_pair_gromacs.html, "lj/gromacs/coul/gromacs"_pair_gromacs.html, "lj/long/coul/long"_pair_lj_long.html, "lj/long/dipole/long"_pair_dipole.html, "lj/long/tip4p/long"_pair_lj_long.html, "lj/smooth"_pair_lj_smooth.html, "lj/smooth/linear"_pair_lj_smooth_linear.html, "lj96/cut"_pair_lj96.html, "lubricate"_pair_lubricate.html, "lubricate/poly"_pair_lubricate.html, "lubricateU"_pair_lubricateU.html, "lubricateU/poly"_pair_lubricateU.html, "meam"_pair_meam.html, "mie/cut"_pair_mie.html, "morse"_pair_morse.html, "nb3b/harmonic"_pair_nb3b_harmonic.html, "nm/cut"_pair_nm.html, "nm/cut/coul/cut"_pair_nm.html, "nm/cut/coul/long"_pair_nm.html, "peri/eps"_pair_peri.html, "peri/lps"_pair_peri.html, "peri/pmb"_pair_peri.html, "peri/ves"_pair_peri.html, "reax"_pair_reax.html, "rebo"_pair_airebo.html, "resquared"_pair_resquared.html, "soft"_pair_soft.html, "sw"_pair_sw.html, "table"_pair_table.html, "tersoff"_pair_tersoff.html, "tersoff/mod"_pair_tersoff_mod.html, "tersoff/zbl"_pair_tersoff_zbl.html, "tip4p/cut"_pair_coul.html, "tip4p/long"_pair_coul.html, "tri/lj"_pair_tri_lj.html, "yukawa"_pair_yukawa.html, "yukawa/colloid"_pair_yukawa_colloid.html, "zbl"_pair_zbl.html :tb(c=4,ea=c) These are pair styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. 
"awpmd/cut"_pair_awpmd.html, "coul/cut/soft"_pair_lj_soft.html, "coul/diel"_pair_coul_diel.html, "coul/long/soft"_pair_lj_soft.html, "eam/cd"_pair_eam.html, "edip"_pair_edip.html, "eff/cut"_pair_eff.html, "gauss/cut"_pair_gauss.html, "list"_pair_list.html, "lj/cut/coul/cut/soft"_pair_lj_soft.html, "lj/cut/coul/long/soft"_pair_lj_soft.html, "lj/cut/dipole/sf"_pair_dipole.html, "lj/cut/soft"_pair_lj_soft.html, "lj/cut/tip4p/long/soft"_pair_lj_soft.html, "lj/sdk"_pair_sdk.html, "lj/sdk/coul/long"_pair_sdk.html, "lj/sdk/coul/msm"_pair_sdk.html, "lj/sf"_pair_lj_sf.html, "meam/spline"_pair_meam_spline.html, "meam/sw/spline"_pair_meam_sw_spline.html, "reax/c"_pair_reax_c.html, "sph/heatconduction"_pair_sph_heatconduction.html, "sph/idealgas"_pair_sph_idealgas.html, "sph/lj"_pair_sph_lj.html, "sph/rhosum"_pair_sph_rhosum.html, "sph/taitwater"_pair_sph_taitwater.html, "sph/taitwater/morris"_pair_sph_taitwater_morris.html, "tersoff/table"_pair_tersoff.html, "tip4p/long/soft"_pair_lj_soft.html :tb(c=4,ea=c) These are accelerated pair styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "adp/omp"_pair_adp.html, "airebo/omp"_pair_airebo.html, "beck/gpu"_pair_beck.html, "beck/omp"_pair_beck.html, "born/coul/long/cuda"_pair_born.html, "born/coul/long/gpu"_pair_born.html, "born/coul/long/omp"_pair_born.html, "born/coul/msm/omp"_pair_born.html, "born/coul/wolf/gpu"_pair_born.html, "born/coul/wolf/omp"_pair_born.html, "born/gpu"_pair_born.html, "born/omp"_pair_born.html, "brownian/omp"_pair_brownian.html, "brownian/poly/omp"_pair_brownian.html, "buck/coul/cut/cuda"_pair_buck.html, "buck/coul/cut/gpu"_pair_buck.html, "buck/coul/cut/omp"_pair_buck.html, "buck/coul/long/cuda"_pair_buck.html, "buck/coul/long/gpu"_pair_buck.html, "buck/coul/long/omp"_pair_buck.html, "buck/coul/msm/omp"_pair_buck.html, "buck/cuda"_pair_buck.html, "buck/long/coul/long/omp"_pair_buck_long.html, "buck/gpu"_pair_buck.html, "buck/omp"_pair_buck.html, "colloid/gpu"_pair_colloid.html, "colloid/omp"_pair_colloid.html, "comb/omp"_pair_comb.html, "coul/cut/omp"_pair_coul.html, "coul/cut/soft/omp"_pair_lj_soft.html, "coul/debye/omp"_pair_coul.html, "coul/diel/omp"_pair_diel.html, "coul/dsf/gpu"_pair_coul.html, "coul/dsf/omp"_pair_coul.html, "coul/long/gpu"_pair_coul.html, "coul/long/omp"_pair_coul.html, "coul/long/soft/omp"_pair_lj_soft.html, "coul/msm/omp"_pair_coul.html, "coul/wolf"_pair_coul.html, "coul/cut/soft/omp"_pair_lj_soft.html, "coul/long/soft/omp"_pair_lj_soft.html, "dpd/omp"_pair_dpd.html, "dpd/tstat/omp"_pair_dpd.html, "eam/alloy/cuda"_pair_eam.html, "eam/alloy/gpu"_pair_eam.html, "eam/alloy/omp"_pair_eam.html, "eam/alloy/opt"_pair_eam.html, "eam/cd/omp"_pair_eam.html, "eam/cuda"_pair_eam.html, "eam/fs/cuda"_pair_eam.html, "eam/fs/gpu"_pair_eam.html, "eam/fs/omp"_pair_eam.html, "eam/fs/opt"_pair_eam.html, "eam/gpu"_pair_eam.html, "eam/omp"_pair_eam.html, "eam/opt"_pair_eam.html, "edip/omp"_pair_edip.html, "eim/omp"_pair_eim.html, "gauss/gpu"_pair_gauss.html, "gauss/omp"_pair_gauss.html, "gayberne/gpu"_pair_gayberne.html, "gayberne/omp"_pair_gayberne.html, "gran/hertz/history/omp"_pair_gran.html, "gran/hooke/cuda"_pair_gran.html, "gran/hooke/history/omp"_pair_gran.html, "gran/hooke/omp"_pair_gran.html, "hbond/dreiding/lj/omp"_pair_hbond_dreiding.html, "hbond/dreiding/morse/omp"_pair_hbond_dreiding.html, "line/lj/omp"_pair_line_lj.html, "lj/charmm/coul/charmm/cuda"_pair_charmm.html, "lj/charmm/coul/charmm/omp"_pair_charmm.html, 
"lj/charmm/coul/charmm/implicit/cuda"_pair_charmm.html, "lj/charmm/coul/charmm/implicit/omp"_pair_charmm.html, "lj/charmm/coul/long/cuda"_pair_charmm.html, "lj/charmm/coul/long/gpu"_pair_charmm.html, "lj/charmm/coul/long/omp"_pair_charmm.html, "lj/charmm/coul/long/soft"_pair_lj_soft.html, "lj/charmm/coul/long/soft/omp"_pair_lj_soft.html, "lj/class2/coul/cut/cuda"_pair_class2.html, "lj/class2/coul/cut/omp"_pair_class2.html, "lj/class2/coul/long/cuda"_pair_class2.html, "lj/class2/coul/long/gpu"_pair_class2.html, "lj/class2/coul/long/omp"_pair_class2.html, "lj/class2/coul/msm/omp"_pair_class2.html, "lj/class2/cuda"_pair_class2.html, "lj/class2/gpu"_pair_class2.html, "lj/class2/omp"_pair_class2.html, "lj/long/coul/long/omp"_pair_lj_long.html, "lj/cut/coul/cut/cuda"_pair_lj.html, "lj/cut/coul/cut/gpu"_pair_lj.html, "lj/cut/coul/cut/omp"_pair_lj.html, "lj/cut/coul/cut/soft/omp"_pair_lj_soft.html, "lj/cut/coul/debye/cuda"_pair_lj.html, "lj/cut/coul/debye/gpu"_pair_lj.html, "lj/cut/coul/debye/omp"_pair_lj.html, "lj/cut/coul/dsf/gpu"_pair_lj.html, "lj/cut/coul/dsf/omp"_pair_lj.html, "lj/cut/coul/long/cuda"_pair_lj.html, "lj/cut/coul/long/gpu"_pair_lj.html, "lj/cut/coul/long/omp"_pair_lj.html, "lj/cut/coul/long/opt"_pair_lj.html, "lj/cut/coul/msm/gpu"_pair_lj.html, "lj/cut/coul/msm/opt"_pair_lj.html, "lj/cut/coul/long/soft/omp"_pair_lj_soft.html, "lj/cut/cuda"_pair_lj.html, +"lj/cut/kk"_pair_lj.html, "lj/cut/dipole/cut/gpu"_pair_dipole.html, "lj/cut/dipole/cut/omp"_pair_dipole.html, "lj/cut/dipole/sf/gpu"_pair_dipole.html, "lj/cut/dipole/sf/omp"_pair_dipole.html, "lj/cut/experimental/cuda"_pair_lj.html, "lj/cut/gpu"_pair_lj.html, "lj/cut/omp"_pair_lj.html, "lj/cut/opt"_pair_lj.html, "lj/cut/soft/omp"_pair_lj_soft.html, "lj/cut/tip4p/cut/omp"_pair_lj.html, "lj/cut/tip4p/long/omp"_pair_lj.html, "lj/cut/tip4p/long/opt"_pair_lj.html, "lj/cut/tip4p/long/soft/omp"_pair_lj_soft.html, "lj/expand/cuda"_pair_lj_expand.html, "lj/expand/gpu"_pair_lj_expand.html, "lj/expand/omp"_pair_lj_expand.html, "lj/gromacs/coul/gromacs/cuda"_pair_gromacs.html, "lj/gromacs/coul/gromacs/omp"_pair_gromacs.html, "lj/gromacs/cuda"_pair_gromacs.html, "lj/gromacs/gpu"_pair_gromacs.html, "lj/gromacs/omp"_pair_gromacs.html, "lj/long/coul/long/opt"_pair_lj_long.html, "lj/sdk/gpu"_pair_sdk.html, "lj/sdk/omp"_pair_sdk.html, "lj/sdk/coul/long/gpu"_pair_sdk.html, "lj/sdk/coul/long/omp"_pair_sdk.html, "lj/sdk/coul/msm/omp"_pair_sdk.html, "lj/sf/omp"_pair_lj_sf.html, "lj/smooth/cuda"_pair_lj_smooth.html, "lj/smooth/omp"_pair_lj_smooth.html, "lj/smooth/linear/omp"_pair_lj_smooth_linear.html, "lj96/cut/cuda"_pair_lj96.html, "lj96/cut/gpu"_pair_lj96.html, "lj96/cut/omp"_pair_lj96.html, "lubricate/omp"_pair_lubricate.html, "lubricate/poly/omp"_pair_lubricate.html, "meam/spline/omp"_pair_meam_spline.html, "mie/cut/gpu"_pair_mie.html, "morse/cuda"_pair_morse.html, "morse/gpu"_pair_morse.html, "morse/omp"_pair_morse.html, "morse/opt"_pair_morse.html, "nb3b/harmonic/omp"_pair_nb3b_harmonic.html, "nm/cut/omp"_pair_nm.html, "nm/cut/coul/cut/omp"_pair_nm.html, "nm/cut/coul/long/omp"_pair_nm.html, "peri/lps/omp"_pair_peri.html, "peri/pmb/omp"_pair_peri.html, "rebo/omp"_pair_airebo.html, "resquared/gpu"_pair_resquared.html, "resquared/omp"_pair_resquared.html, "soft/gpu"_pair_soft.html, "soft/omp"_pair_soft.html, "sw/cuda"_pair_sw.html, "sw/gpu"_pair_sw.html, "sw/omp"_pair_sw.html, "table/gpu"_pair_table.html, +"table/kk"_pair_table.html, "table/omp"_pair_table.html, "tersoff/cuda"_pair_tersoff.html, "tersoff/omp"_pair_tersoff.html, 
"tersoff/mod/omp"_pair_tersoff_mod.html, "tersoff/table/omp"_pair_tersoff.html, "tersoff/zbl/omp"_pair_tersoff_zbl.html, "tip4p/cut/omp"_pair_coul.html, "tip4p/long/omp"_pair_coul.html, "tip4p/long/soft/omp"_pair_lj_soft.html, "tri/lj/omp"_pair_tri_lj.html, "yukawa/gpu"_pair_yukawa.html, "yukawa/omp"_pair_yukawa.html, "yukawa/colloid/gpu"_pair_yukawa_colloid.html, "yukawa/colloid/omp"_pair_yukawa_colloid.html, "zbl/omp"_pair_zbl.html :tb(c=4,ea=c) :line Bond_style potentials :h4 See the "bond_style"_bond_style.html command for an overview of bond potentials. Click on the style itself for a full description: "none"_bond_none.html, "hybrid"_bond_hybrid.html, "class2"_bond_class2.html, "fene"_bond_fene.html, "fene/expand"_bond_fene_expand.html, "harmonic"_bond_harmonic.html, "morse"_bond_morse.html, "nonlinear"_bond_nonlinear.html, "quartic"_bond_quartic.html, "table"_bond_table.html :tb(c=4,ea=c,w=100) These are bond styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. "harmonic/shift"_bond_harmonic_shift.html, "harmonic/shift/cut"_bond_harmonic_shift_cut.html :tb(c=4,ea=c) These are accelerated bond styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "class2/omp"_bond_class2.html, "fene/omp"_bond_fene.html, "fene/expand/omp"_bond_fene_expand.html, "harmonic/omp"_bond_harmonic.html, "harmonic/shift/omp"_bond_harmonic_shift.html, "harmonic/shift/cut/omp"_bond_harmonic_shift_cut.html, "morse/omp"_bond_morse.html, "nonlinear/omp"_bond_nonlinear.html, "quartic/omp"_bond_quartic.html, "table/omp"_bond_table.html :tb(c=4,ea=c,w=100) :line Angle_style potentials :h4 See the "angle_style"_angle_style.html command for an overview of angle potentials. Click on the style itself for a full description: "none"_angle_none.html, "hybrid"_angle_hybrid.html, "charmm"_angle_charmm.html, "class2"_angle_class2.html, "cosine"_angle_cosine.html, "cosine/delta"_angle_cosine_delta.html, "cosine/periodic"_angle_cosine_periodic.html, "cosine/squared"_angle_cosine_squared.html, "harmonic"_angle_harmonic.html, "table"_angle_table.html :tb(c=4,ea=c,w=100) These are angle styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. "sdk"_angle_sdk.html, "cosine/shift"_angle_cosine_shift.html, "cosine/shift/exp"_angle_cosine_shift_exp.html, "dipole"_angle_dipole.html, "fourier"_angle_fourier.html, "fourier/simple"_angle_fourier_simple.html, "quartic"_angle_quartic.html :tb(c=4,ea=c) These are accelerated angle styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "charmm/omp"_angle_charmm.html, "class2/omp"_angle_class2.html, "cosine/omp"_angle_cosine.html, "cosine/delta/omp"_angle_cosine_delta.html, "cosine/periodic/omp"_angle_cosine_periodic.html, "cosine/shift/omp"_angle_cosine_shift.html, "cosine/shift/exp/omp"_angle_cosine_shift_exp.html, "cosine/squared/omp"_angle_cosine_squared.html, "dipole/omp"_angle_dipole.html "fourier/omp"_angle_fourier.html, "fourier/simple/omp"_angle_fourier_simple.html, "harmonic/omp"_angle_harmonic.html, "quartic/omp"_angle_quartic.html "table/omp"_angle_table.html :tb(c=4,ea=c,w=100) :line Dihedral_style potentials :h4 See the "dihedral_style"_dihedral_style.html command for an overview of dihedral potentials. 
Click on the style itself for a full description: "none"_dihedral_none.html, "hybrid"_dihedral_hybrid.html, "charmm"_dihedral_charmm.html, "class2"_dihedral_class2.html, "harmonic"_dihedral_harmonic.html, "helix"_dihedral_helix.html, "multi/harmonic"_dihedral_multi_harmonic.html, "opls"_dihedral_opls.html :tb(c=4,ea=c,w=100) These are dihedral styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. "cosine/shift/exp"_dihedral_cosine_shift_exp.html, "fourier"_dihedral_fourier.html, "nharmonic"_dihedral_nharmonic.html, "quadratic"_dihedral_quadratic.html, "table"_dihedral_table.html :tb(c=4,ea=c) These are accelerated dihedral styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "charmm/omp"_dihedral_charmm.html, "class2/omp"_dihedral_class2.html, "cosine/shift/exp/omp"_dihedral_cosine_shift_exp.html, "fourier/omp"_dihedral_fourier.html, "harmonic/omp"_dihedral_harmonic.html, "helix/omp"_dihedral_helix.html, "multi/harmonic/omp"_dihedral_multi_harmonic.html, "nharmonic/omp"_dihedral_nharmonic.html, "opls/omp"_dihedral_opls.html "quadratic/omp"_dihedral_quadratic.html, "table/omp"_dihedral_table.html :tb(c=4,ea=c,w=100) :line Improper_style potentials :h4 See the "improper_style"_improper_style.html command for an overview of improper potentials. Click on the style itself for a full description: "none"_improper_none.html, "hybrid"_improper_hybrid.html, "class2"_improper_class2.html, "cvff"_improper_cvff.html, "harmonic"_improper_harmonic.html, "umbrella"_improper_umbrella.html :tb(c=4,ea=c,w=100) These are improper styles contributed by users, which can be used if "LAMMPS is built with the appropriate package"_Section_start.html#start_3. "cossq"_improper_cossq.html, "fourier"_improper_fourier.html, "ring"_improper_ring.html :tb(c=4,ea=c) These are accelerated improper styles, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. "class2/omp"_improper_class2.html, "cossq/omp"_improper_cossq.html, "cvff/omp"_improper_cvff.html, "fourier/omp"_improper_fourier.html, "harmonic/omp"_improper_harmonic.html, "ring/omp"_improper_ring.html, "umbrella/omp"_improper_umbrella.html :tb(c=4,ea=c,w=100) :line Kspace solvers :h4 See the "kspace_style"_kspace_style.html command for an overview of Kspace solvers. Click on the style itself for a full description: "ewald"_kspace_style.html, "ewald/disp"_kspace_style.html, "msm"_kspace_style.html, "msm/cg"_kspace_style.html, "pppm"_kspace_style.html, "pppm/cg"_kspace_style.html, "pppm/disp"_kspace_style.html, "pppm/disp/tip4p"_kspace_style.html, "pppm/tip4p"_kspace_style.html :tb(c=4,ea=c,w=100) These are accelerated Kspace solvers, which can be used if LAMMPS is built with the "appropriate accelerated package"_Section_accelerate.html. 
"ewald/omp"_kspace_style.html, "msm/omp"_kspace_style.html, "msm/cg/omp"_kspace_style.html, "pppm/cuda"_kspace_style.html, "pppm/gpu"_kspace_style.html, "pppm/omp"_kspace_style.html, "pppm/cg/omp"_kspace_style.html, "pppm/tip4p/omp"_kspace_style.html :tb(c=4,ea=c) diff --git a/doc/Section_packages.html b/doc/Section_packages.html index 55e3901ea..39a853311 100644 --- a/doc/Section_packages.html +++ b/doc/Section_packages.html @@ -1,564 +1,569 @@ <HTML> <CENTER><A HREF = "Section_commands.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> - <A HREF = "Section_accelerate.html">Next Section</A> </CENTER> <HR> <H3>4. Packages </H3> <P>This section gives a quick overview of the add-on packages that extend LAMMPS functionality. </P> 4.1 <A HREF = "#pkg_1">Standard packages</A><BR> 4.2 <A HREF = "#pkg_2">User packages</A> <BR> <P>LAMMPS includes many optional packages, which are groups of files that enable a specific set of features. For example, force fields for molecular systems or granular systems are in packages. You can see the list of all packages by typing "make package" from within the src directory of the LAMMPS distribution. </P> <P>See <A HREF = "Section_start.html#start_3">Section_start 3</A> of the manual for details on how to include/exclude specific packages as part of the LAMMPS build process, and for more details about the differences between standard packages and user packages in LAMMPS. </P> <P>Below, the packages currently availabe in LAMMPS are listed. For standard packages, just a one-line description is given. For user packages, more details are provided. </P> <HR> <HR> <H4><A NAME = "pkg_1"></A>4.1 Standard packages </H4> <P>The current list of standard packages is as follows: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD >Package</TD><TD > Description</TD><TD > Author(s)</TD><TD > Doc page</TD><TD > Example</TD><TD > Library</TD></TR> <TR ALIGN="center"><TD >ASPHERE</TD><TD > aspherical particles</TD><TD > -</TD><TD > <A HREF = "Section_howto.html#howto_14">Section_howto</A></TD><TD > ellipse</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >BODY</TD><TD > body-style particles</TD><TD > -</TD><TD > <A HREF = "body.html">body</A></TD><TD > body</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >CLASS2</TD><TD > class 2 force fields</TD><TD > -</TD><TD > <A HREF = "pair_class2.html">pair_style lj/class2</A></TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >COLLOID</TD><TD > colloidal particles</TD><TD > -</TD><TD > <A HREF = "atom_style.html">atom_style colloid</A></TD><TD > colloid</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >DIPOLE</TD><TD > point dipole particles</TD><TD > -</TD><TD > <A HREF = "pair_dipole.html">pair_style dipole/cut</A></TD><TD > dipole</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >FLD</TD><TD > Fast Lubrication Dynamics</TD><TD > Kumar & Bybee & Higdon (1)</TD><TD > <A HREF = "pair_lubricateU.html">pair_style lubricateU</A></TD><TD > -</TD><TD > -</TD></TR> -<TR ALIGN="center"><TD >GPU</TD><TD > GPU-enabled potentials</TD><TD > Mike Brown (ORNL)</TD><TD > <A HREF = "Section_accelerate.html#acc_6">Section accelerate</A></TD><TD > gpu</TD><TD > lib/gpu</TD></TR> +<TR ALIGN="center"><TD >GPU</TD><TD > GPU-enabled styles</TD><TD > Mike Brown (ORNL)</TD><TD > <A HREF = "Section_accelerate.html#acc_6">Section accelerate</A></TD><TD > gpu</TD><TD > lib/gpu</TD></TR> <TR ALIGN="center"><TD 
>GRANULAR</TD><TD > granular systems</TD><TD > -</TD><TD > <A HREF = "Section_howto.html#howto_6">Section_howto</A></TD><TD > pour</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >KIM</TD><TD > openKIM potentials</TD><TD > Smirichinski & Elliot & Tadmor (3)</TD><TD > <A HREF = "pair_kim.html">pair_style kim</A></TD><TD > kim</TD><TD > KIM</TD></TR> +<TR ALIGN="center"><TD >KOKKOS</TD><TD > Kokkos-enabled styles</TD><TD > Trott & Edwards (4)</TD><TD > <A HREF = "Section_accelerate.html#acc_8">Section_accelerate</A></TD><TD > kokkos</TD><TD > lib/kokkos</TD></TR> <TR ALIGN="center"><TD >KSPACE</TD><TD > long-range Coulombic solvers</TD><TD > -</TD><TD > <A HREF = "kspace_style.html">kspace_style</A></TD><TD > peptide</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >MANYBODY</TD><TD > many-body potentials</TD><TD > -</TD><TD > <A HREF = "pair_tersoff.html">pair_style tersoff</A></TD><TD > shear</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >MEAM</TD><TD > modified EAM potential</TD><TD > Greg Wagner (Sandia)</TD><TD > <A HREF = "pair_meam.html">pair_style meam</A></TD><TD > meam</TD><TD > lib/meam</TD></TR> <TR ALIGN="center"><TD >MC</TD><TD > Monte Carlo options</TD><TD > -</TD><TD > <A HREF = "fix_gcmc.html">fix gcmc</A></TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >MOLECULE</TD><TD > molecular system force fields</TD><TD > -</TD><TD > <A HREF = "Section_howto.html#howto_3">Section_howto</A></TD><TD > peptide</TD><TD > -</TD></TR> -<TR ALIGN="center"><TD >OPT</TD><TD > optimized pair potentials</TD><TD > Fischer & Richie & Natoli (2)</TD><TD > <A HREF = "Section_accelerate.html#acc_4">Section accelerate</A></TD><TD > -</TD><TD > -</TD></TR> +<TR ALIGN="center"><TD >OPT</TD><TD > optimized pair styles</TD><TD > Fischer & Richie & Natoli (2)</TD><TD > <A HREF = "Section_accelerate.html#acc_4">Section accelerate</A></TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >PERI</TD><TD > Peridynamics models</TD><TD > Mike Parks (Sandia)</TD><TD > <A HREF = "pair_peri.html">pair_style peri</A></TD><TD > peri</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >POEMS</TD><TD > coupled rigid body motion</TD><TD > Rudra Mukherjee (JPL)</TD><TD > <A HREF = "fix_poems.html">fix poems</A></TD><TD > rigid</TD><TD > lib/poems</TD></TR> <TR ALIGN="center"><TD >REAX</TD><TD > ReaxFF potential</TD><TD > Aidan Thompson (Sandia)</TD><TD > <A HREF = "pair_reax.html">pair_style reax</A></TD><TD > reax</TD><TD > lib/reax</TD></TR> <TR ALIGN="center"><TD >REPLICA</TD><TD > multi-replica methods</TD><TD > -</TD><TD > <A HREF = "Section_howto.html#howto_5">Section_howto</A></TD><TD > tad</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >RIGID</TD><TD > rigid bodies</TD><TD > -</TD><TD > <A HREF = "fix_rigid.html">fix rigid</A></TD><TD > rigid</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >SHOCK</TD><TD > shock loading methods</TD><TD > -</TD><TD > <A HREF = "fix_msst.html">fix msst</A></TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >SRD</TD><TD > stochastic rotation dynamics</TD><TD > -</TD><TD > <A HREF = "fix_srd.html">fix srd</A></TD><TD > srd</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >VORONOI</TD><TD > Voronoi tesselations</TD><TD > Daniel Schwen (LANL)</TD><TD > <A HREF = "compute_voronoi_atom.html">compute voronoi/atom</A></TD><TD > -</TD><TD > Voro++</TD></TR> <TR ALIGN="center"><TD >XTC</TD><TD > dumps in XTC format</TD><TD > -</TD><TD > <A HREF = "dump.html">dump</A></TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD > </TD></TR></TABLE></DIV> <P>The "Authors" column lists a name(s) if a 
specific person is responible for creating and maintaining the package. </P> <P>(1) The FLD package was created by Amit Kumar and Michael Bybee from Jonathan Higdon's group at UIUC. </P> <P>(2) The OPT package was created by James Fischer (High Performance Technologies), David Richie, and Vincent Natoli (Stone Ridge Technolgy). </P> <P>(3) The KIM package was created by Valeriu Smirichinski, Ryan Elliott, and Ellad Tadmor (U Minn). </P> +<P>(4) The KOKKOS package was created primarily by Christian Trott +(Sandia). It uses the Kokkos library which was developed by Carter +Edwards, Christian, and collaborators at Sandia. +</P> <P>The "Doc page" column links to either a portion of the <A HREF = "Section_howto.html">Section_howto</A> of the manual, or an input script command implemented as part of the package. </P> <P>The "Example" column is a sub-directory in the examples directory of the distribution which has an input script that uses the package. E.g. "peptide" refers to the examples/peptide directory. </P> <P>The "Library" column lists an external library which must be built first and which LAMMPS links to when it is built. If it is listed as lib/package, then the code for the library is under the lib directory of the LAMMPS distribution. See the lib/package/README file for info on how to build the library. If it is not listed as lib/package, then it is a third-party library not included in the LAMMPS distribution. See the src/package/README or src/package/Makefile.lammps file for info on where to download the library. <A HREF = "Section_start.html#start_3_3">Section start</A> of the manual also gives details on how to build LAMMPS with both kinds of auxiliary libraries. </P> <HR> <HR> <H4><A NAME = "pkg_2"></A>4.2 User packages </H4> <P>The current list of user-contributed packages is as follows: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR ALIGN="center"><TD >Package</TD><TD > Description</TD><TD > Author(s)</TD><TD > Doc page</TD><TD > Example</TD><TD > Pic/movie</TD><TD > Library</TD></TR> <TR ALIGN="center"><TD >USER-ATC</TD><TD > atom-to-continuum coupling</TD><TD > Jones & Templeton & Zimmerman (2)</TD><TD > <A HREF = "fix_atc.html">fix atc</A></TD><TD > USER/atc</TD><TD > <A HREF = "http://lammps.sandia.gov/pictures.html#atc">atc</A></TD><TD > lib/atc</TD></TR> <TR ALIGN="center"><TD >USER-AWPMD</TD><TD > wave-packet MD</TD><TD > Ilya Valuev (JIHT)</TD><TD > <A HREF = "pair_awpmd.html">pair_style awpmd/cut</A></TD><TD > USER/awpmd</TD><TD > -</TD><TD > lib/awpmd</TD></TR> <TR ALIGN="center"><TD >USER-CG-CMM</TD><TD > coarse-graining model</TD><TD > Axel Kohlmeyer (Temple U)</TD><TD > <A HREF = "pair_sdk.html">pair_style lj/sdk</A></TD><TD > USER/cg-cmm</TD><TD > <A HREF = "http://lammps.sandia.gov/pictures.html#cg">cg</A></TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-COLVARS</TD><TD > collective variables</TD><TD > Fiorin & Henin & Kohlmeyer (3)</TD><TD > <A HREF = "fix_colvars.html">fix colvars</A></TD><TD > USER/colvars</TD><TD > <A HREF = "colvars">colvars</A></TD><TD > lib/colvars</TD></TR> <TR ALIGN="center"><TD >USER-CUDA</TD><TD > NVIDIA GPU styles</TD><TD > Christian Trott (U Tech Ilmenau)</TD><TD > <A HREF = "Section_accelerate.html#acc_7">Section accelerate</A></TD><TD > USER/cuda</TD><TD > -</TD><TD > lib/cuda</TD></TR> <TR ALIGN="center"><TD >USER-EFF</TD><TD > electron force field</TD><TD > Andres Jaramillo-Botero (Caltech)</TD><TD > <A HREF = "pair_eff.html">pair_style eff/cut</A></TD><TD > USER/eff</TD><TD > <A HREF = 
"http://lammps.sandia.gov/movies.html#eff">eff</A></TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-FEP</TD><TD > free energy perturbation</TD><TD > Agilio Padua (U Blaise Pascal Clermont-Ferrand)</TD><TD > <A HREF = "fix_adapt.html">fix adapt/fep</A></TD><TD > USER/fep</TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-LB</TD><TD > Lattice Boltzmann fluid</TD><TD > Colin Denniston (U Western Ontario)</TD><TD > <A HREF = "fix_lb_fluid.html">fix lb/fluid</A></TD><TD > USER/lb</TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-MISC</TD><TD > single-file contributions</TD><TD > USER-MISC/README</TD><TD > USER-MISC/README</TD><TD > -</TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-MOLFILE</TD><TD > <A HREF = "http://www.ks.uiuc.edu/Research/vmd">VMD</A> molfile plug-ins</TD><TD > Axel Kohlmeyer (Temple U)</TD><TD > <A HREF = "dump_molfile.html">dump molfile</A></TD><TD > -</TD><TD > -</TD><TD > VMD-MOLFILE</TD></TR> <TR ALIGN="center"><TD >USER-OMP</TD><TD > OpenMP threaded styles</TD><TD > Axel Kohlmeyer (Temple U)</TD><TD > <A HREF = "Section_accelerate.html#acc_5">Section accelerate</A></TD><TD > -</TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-PHONON</TD><TD > phonon dynamical matrix</TD><TD > Ling-Ti Kong (Shanghai Jiao Tong U)</TD><TD > <A HREF = "fix_phonon.html">fix phonon</A></TD><TD > USER/phonon</TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-QMMM</TD><TD > QM/MM coupling</TD><TD > Axel Kohlmeyer (Temple U)</TD><TD > <A HREF = "fix_qmmm.html">fix qmmm</A></TD><TD > lib/qmmm/example1</TD><TD > -</TD><TD > lib/qmmm</TD></TR> <TR ALIGN="center"><TD >USER-REAXC</TD><TD > C version of ReaxFF</TD><TD > Metin Aktulga (LBNL)</TD><TD > <A HREF = "pair_reax_c.html">pair_style reaxc</A></TD><TD > reax</TD><TD > -</TD><TD > -</TD></TR> <TR ALIGN="center"><TD >USER-SPH</TD><TD > smoothed particle hydrodynamics</TD><TD > Georg Ganzenmuller (EMI)</TD><TD > <A HREF = "USER/sph/SPH_LAMMPS_userguide.pdf">userguide.pdf</A></TD><TD > USER/sph</TD><TD > <A HREF = "http://lammps.sandia.gov/movies.html#sph">sph</A></TD><TD > -</TD></TR> <TR ALIGN="center"><TD > </TD></TR></TABLE></DIV> <P>The "Authors" column lists a name(s) if a specific person is responible for creating and maintaining the package. </P> <P>If the Library is not listed as lib/package, then it is a third-party library not included in the LAMMPS distribution. See the src/package/Makefile.lammps file for info on where to download the library from. </P> <P>(2) The ATC package was created by Reese Jones, Jeremy Templeton, and Jon Zimmerman (Sandia). </P> <P>(3) The COLVARS package was created by Axel Kohlmeyer (Temple U) using the colvars module library written by Giacomo Fiorin (Temple U) and Jerome Henin (LISM, Marseille, France). </P> <P>The "Doc page" column links to either a portion of the <A HREF = "Section_howto.html">Section_howto</A> of the manual, or an input script command implemented as part of the package, or to additional documentation provided witht he package. </P> <P>The "Example" column is a sub-directory in the examples directory of the distribution which has an input script that uses the package. E.g. "peptide" refers to the examples/peptide directory. USER/cuda refers to the examples/USER/cuda directory. </P> <P>The "Library" column lists an external library which must be built first and which LAMMPS links to when it is built. If it is listed as lib/package, then the code for the library is under the lib directory of the LAMMPS distribution. 
See the lib/package/README file for info on how to build the library. If it is not listed as lib/package, then it is a third-party library not included in the LAMMPS distribution. See the src/package/Makefile.lammps file for info on where to download the library. <A HREF = "Section_start.html#start_3_3">Section start</A> of the manual also gives details on how to build LAMMPS with both kinds of auxiliary libraries. </P> <P>More details on each package, from the USER-*/README file, are given below. </P> <HR> <H4>USER-ATC package </H4> <P>This package implements a "fix atc" command which can be used in a LAMMPS input script. This fix can be employed to either do concurrent coupling of MD with FE-based physics surrogates or on-the-fly post-processing of atomic information to continuum fields. </P> <P>See the doc page for the fix atc command to get started. At the bottom of the doc page are many links to additional documentation contained in the doc/USER/atc directory. </P> <P>There are example scripts for using this package in examples/USER/atc. </P> <P>This package uses an external library in lib/atc which must be compiled before making LAMMPS. See the lib/atc/README file and the LAMMPS manual for information on building LAMMPS with external libraries. </P> <P>The primary people who created this package are Reese Jones (rjones at sandia.gov), Jeremy Templeton (jatempl at sandia.gov) and Jon Zimmerman (jzimmer at sandia.gov) at Sandia. Contact them directly if you have questions. </P> <HR> <H4>USER-AWPMD package </H4> <P>This package contains a LAMMPS implementation of the Antisymmetrized Wave Packet Molecular Dynamics (AWPMD) method. </P> <P>See the doc page for the pair_style awpmd/cut command to get started. </P> <P>There are example scripts for using this package in examples/USER/awpmd. </P> <P>This package uses an external library in lib/awpmd which must be compiled before making LAMMPS. See the lib/awpmd/README file and the LAMMPS manual for information on building LAMMPS with external libraries. </P> <P>The person who created this package is Ilya Valuev at the JIHT in Russia (valuev at physik.hu-berlin.de). Contact him directly if you have questions. </P> <HR> <H4>USER-CG-CMM package </H4> <P>This package implements 3 commands which can be used in a LAMMPS input script: </P> <UL><LI>pair_style lj/sdk <LI>pair_style lj/sdk/coul/long <LI>angle_style sdk </UL> <P>These styles allow coarse grained MD simulations with the parametrization of Shinoda, DeVane, Klein, Mol Sim, 33, 27 (2007) (SDK), with extensions to simulate ionic liquids, electrolytes, lipids and charged amino acids. </P> <P>See the doc pages for these commands for details. </P> <P>There are example scripts for using this package in examples/USER/cg-cmm. </P> <P>This is the second generation implementation, reducing the clutter of the previous version. For many systems with electrostatics, it will be faster to use pair_style hybrid/overlay with lj/sdk and coul/long instead of the combined lj/sdk/coul/long style, since the number of charged atom types is usually small. For any other coulomb interactions this is now required. To exploit this property, the use of the kspace_style pppm/cg is recommended over regular pppm. For all new styles, input file backward compatibility is provided. The old implementation is still available through appending the /old suffix. These will be discontinued and removed after the new implementation has been fully validated. </P> <P>The current version of this package should be considered beta quality. The CG potentials work correctly for "normal" situations, but have not been tested with all kinds of potential parameters and simulation systems. </P> <P>The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. </P> <HR> <H4>USER-COLVARS package </H4> <P>This package implements the "fix colvars" command which can be used in a LAMMPS input script. </P> <P>This fix allows the use of "collective variables" to implement Adaptive Biasing Force, Metadynamics, Steered MD, Umbrella Sampling and Restraints. This code consists of two parts: </P> <UL><LI>A portable collective variable module library written and maintained by Giacomo Fiorin (ICMS, Temple University, Philadelphia, PA, USA) and Jerome Henin (LISM, CNRS, Marseille, France). This code is located in the directory lib/colvars and needs to be compiled first. <LI>The colvars fix, an interface layer that exchanges information between LAMMPS and the collective variable module. </UL> <P>See the doc page of <A HREF = "fix_colvars.html">fix colvars</A> for more details. </P> <P>There are example scripts for using this package in examples/USER/colvars. </P> <P>This is a very new interface that does not yet support all features in the module and will see future optimizations and improvements. The colvars module library is also available in NAMD and has been thoroughly used and tested there. Bugs and problems are likely due to the interface layer code. Thus the current version of this package should be considered beta quality. </P> <P>The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. </P> <HR> <H4>USER-CUDA package </H4> <P>This package provides acceleration of various LAMMPS pair styles, fix styles, compute styles, and long-range Coulombics via PPPM for NVIDIA GPUs. </P> <P>See this section of the manual to get started: </P> <P><A HREF = "Section_accelerate.html#acc_7">Section_accelerate</A> </P> <P>There are example scripts for using this package in examples/USER/cuda. </P> <P>This package uses an external library in lib/cuda which must be compiled before making LAMMPS. See the lib/cuda/README file and the LAMMPS manual for information on building LAMMPS with external libraries. </P> <P>The person who created this package is Christian Trott at the University of Technology Ilmenau, Germany (christian.trott at tu-ilmenau.de). Contact him directly if you have questions. </P> <HR> <H4>USER-EFF package </H4> <P>This package contains a LAMMPS implementation of the electron Force Field (eFF) currently under development at Caltech, as described in A. Jaramillo-Botero, J. Su, Q. An, and W.A. Goddard III, JCC, 2010. The eFF potential was first introduced by Su and Goddard, in 2007. </P> <P>eFF can be viewed as an approximation to QM wave packet dynamics and Fermionic molecular dynamics, combining the ability of electronic structure methods to describe atomic structure, bonding, and chemistry in materials, and of plasma methods to describe nonequilibrium dynamics of large systems with a large number of highly excited electrons.
We classify it as a mixed QM-classical approach rather than a conventional force field method, which introduces QM-based terms (a spin-dependent repulsion term to account for the Pauli exclusion principle and the electron wavefunction kinetic energy associated with the Heisenberg principle) that reduce, along with classical electrostatic terms between nuclei and electrons, to the sum of a set of effective pairwise potentials. This makes eFF uniquely suited to simulate materials over a wide range of temperatures and pressures where electronically excited and ionized states of matter can occur and coexist. </P> <P>The necessary customizations to the LAMMPS core are in place to enable the correct handling of explicit electron properties during minimization and dynamics. </P> <P>See the doc page for the pair_style eff/cut command to get started. </P> <P>There are example scripts for using this package in examples/USER/eff. </P> <P>There are auxiliary tools for using this package in tools/eff. </P> <P>The person who created this package is Andres Jaramillo-Botero at CalTech (ajaramil at wag.caltech.edu). Contact him directly if you have questions. </P> <HR> <H4>USER-FEP package </H4> <P>This package provides methods for performing free energy perturbation simulations with soft-core pair potentials in LAMMPS. </P> <P>See these doc pages and their related commands to get started: </P> <UL><LI><A HREF = "fix_adapt_fep.html">fix adapt/fep</A> <LI><A HREF = "compute_fep.html">compute fep</A> <LI><A HREF = "pair_lj_soft.html">soft pair styles</A> </UL> <P>The person who created this package is Agilio Padua at Université Blaise Pascal Clermont-Ferrand (agilio.padua at univ-bpclermont.fr) Contact him directly if you have questions. </P> <HR> <H4>USER-LB package </H4> <P>This package contains a LAMMPS implementation of a background Lattice-Boltzmann fluid, which can be used to model MD particles influenced by hydrodynamic forces. </P> <P>See this doc page and its related commands to get started: </P> <P><A HREF = "fix_lb_fluid.html">fix lb/fluid</A> </P> <P>The people who created this package are Frances Mackay (fmackay at uwo.ca) and Colin (cdennist at uwo.ca) Denniston, University of Western Ontario. Contact them directly if you have questions. </P> <HR> <H4>USER-MISC package </H4> <P>The files in this package are a potpourri of (mostly) unrelated features contributed to LAMMPS by users. Each feature is a single pair of files (*.cpp and *.h). </P> <P>More information about each feature can be found by reading its doc page in the LAMMPS doc directory. The doc page which lists all LAMMPS input script commands is as follows: </P> <P><A HREF = "Section_commands.html#cmd_5">Section_commands</A> </P> <P>User-contributed features are listed at the bottom of the fix, compute, pair, etc sections. </P> <P>The list of features and author of each is given in the src/USER-MISC/README file. </P> <P>You should contact the author directly if you have specific questions about the feature or its coding. </P> <HR> <H4>USER-MOLFILE package </H4> <P>This package contains a dump molfile command which uses molfile plugins that are bundled with the <A HREF = "http://www.ks.uiuc.edu/Research/vmd">VMD</A> molecular visualization and analysis program, to enable LAMMPS to dump its information in formats compatible with various molecular simulation tools. </P> <P>The package only provides the interface code, not the plugins. 
These can be obtained from a VMD installation which has to match the platform that you are using to compile LAMMPS for. By adding plugins to VMD, support for new file formats can be added to LAMMPS (or VMD or other programs that use them) without having to recompile the application itself. </P> <P>See this doc page to get started: </P> <P><A HREF = "dump_molfile.html">dump molfile</A> </P> <P>The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. </P> <HR> <H4>USER-OMP package </H4> <P>This package provides OpenMP multi-threading support and other optimizations of various LAMMPS pair styles, dihedral styles, and fix styles. </P> <P>See this section of the manual to get started: </P> <P><A HREF = "Section_accelerate.html#acc_5">Section_accelerate</A> </P> <P>The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. </P> <HR> <H4>USER-PHONON package </H4> <P>This package contains a fix phonon command that calculates dynamical matrices, which can then be used to compute phonon dispersion relations, directly from molecular dynamics simulations. </P> <P>See this doc page to get started: </P> <P><A HREF = "fix_phonon.html">fix phonon</A> </P> <P>The person who created this package is Ling-Ti Kong (konglt at sjtu.edu.cn) at Shanghai Jiao Tong University. Contact him directly if you have questions. </P> <HR> <H4>USER-QMMM package </H4> <P>This package provides a fix qmmm command which allows LAMMPS to be used in a QM/MM simulation, currently only in combination with the pw.x code from the <A HREF = "http://www.quantum-espresso.org">Quantum ESPRESSO</A> package. </P> <P>The current implementation only supports an ONIOM style mechanical coupling to the Quantum ESPRESSO plane wave DFT package. Electrostatic coupling is in preparation and the interface has been written in a manner that coupling to other QM codes should be possible without changes to LAMMPS itself. </P> <P>See this doc page to get started: </P> <P><A HREF = "fix_qmmm.html">fix qmmm</A> </P> <P>as well as the lib/qmmm/README file. </P> <P>The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. </P> <HR> <H4>USER-REAXC package </H4> <P>This package contains an implementation for LAMMPS of the ReaxFF force field. ReaxFF uses distance-dependent bond-order functions to represent the contributions of chemical bonding to the potential energy. It was originally developed by Adri van Duin and the Goddard group at CalTech. </P> <P>The USER-REAXC version of ReaxFF (pair_style reax/c), implemented in C, should give identical or very similar results to pair_style reax, which is a ReaxFF implementation on top of a Fortran library, a version of which was originally authored by Adri van Duin. </P> <P>The reax/c version should be somewhat faster and more scalable, particularly with respect to the charge equilibration calculation. It should also be easier to build and use since there are no complicating issues with Fortran memory allocation or linking to a Fortran library. </P> <P>For technical details about this implementation of ReaxFF, see this paper: </P> <P>Parallel and Scalable Reactive Molecular Dynamics: Numerical Methods and Algorithmic Techniques, H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama, Parallel Computing, in press (2011).
</P> <P>See the doc page for the pair_style reax/c command for details of how to use it in LAMMPS. </P> <P>The person who created this package is Hasan Metin Aktulga (hmaktulga at lbl.gov), while at Purdue University. Contact him directly, or Aidan Thompson at Sandia (athomps at sandia.gov), if you have questions. </P> <HR> <H4>USER-SPH package </H4> <P>This package implements smoothed particle hydrodynamics (SPH) in LAMMPS. Currently, the package has the following features: </P> <P>* Tait, ideal gas, and Lennard-Jones equations of state, full support for complete (i.e. internal-energy dependent) equations of state * plain or Monaghan's XSPH integration of the equations of motion * density continuity or density summation to propagate the density field * commands to set internal energy and density of particles from the input script * output commands to access internal energy and density for dumping and thermo output </P> <P>See the file doc/USER/sph/SPH_LAMMPS_userguide.pdf to get started. </P> <P>There are example scripts for using this package in examples/USER/sph. </P> <P>The person who created this package is Georg Ganzenmuller at the Fraunhofer-Institute for High-Speed Dynamics, Ernst Mach Institute in Germany (georg.ganzenmueller at emi.fhg.de). Contact him directly if you have questions. </P> </HTML> diff --git a/doc/Section_packages.txt b/doc/Section_packages.txt index 700c88b95..7ede067fd 100644 --- a/doc/Section_packages.txt +++ b/doc/Section_packages.txt @@ -1,549 +1,554 @@ "Previous Section"_Section_commands.html - "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc - "Next Section"_Section_accelerate.html :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line 4. Packages :h3 This section gives a quick overview of the add-on packages that extend LAMMPS functionality. 4.1 "Standard packages"_#pkg_1 4.2 "User packages"_#pkg_2 :all(b) LAMMPS includes many optional packages, which are groups of files that enable a specific set of features. For example, force fields for molecular systems or granular systems are in packages. You can see the list of all packages by typing "make package" from within the src directory of the LAMMPS distribution. See "Section_start 3"_Section_start.html#start_3 of the manual for details on how to include/exclude specific packages as part of the LAMMPS build process, and for more details about the differences between standard packages and user packages in LAMMPS. Below, the packages currently available in LAMMPS are listed. For standard packages, just a one-line description is given. For user packages, more details are provided.
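As an illustration, a package is typically installed or removed with "make yes-name" or "make no-name" from the src directory, followed by a re-compile. The package and machine names below are only placeholders, so substitute the ones appropriate for your build:

cd src
make yes-kspace        # install the KSPACE package sources
make yes-user-omp      # install the USER-OMP package sources
make package-status    # show which packages are currently installed
make linux             # re-compile for your machine target :pre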
:line :line 4.1 Standard packages :h4,link(pkg_1) The current list of standard packages is as follows: Package, Description, Author(s), Doc page, Example, Library ASPHERE, aspherical particles, -, "Section_howto"_Section_howto.html#howto_14, ellipse, - BODY, body-style particles, -, "body"_body.html, body, - CLASS2, class 2 force fields, -, "pair_style lj/class2"_pair_class2.html, -, - COLLOID, colloidal particles, -, "atom_style colloid"_atom_style.html, colloid, - DIPOLE, point dipole particles, -, "pair_style dipole/cut"_pair_dipole.html, dipole, - FLD, Fast Lubrication Dynamics, Kumar & Bybee & Higdon (1), "pair_style lubricateU"_pair_lubricateU.html, -, - -GPU, GPU-enabled potentials, Mike Brown (ORNL), "Section accelerate"_Section_accelerate.html#acc_6, gpu, lib/gpu +GPU, GPU-enabled styles, Mike Brown (ORNL), "Section accelerate"_Section_accelerate.html#acc_6, gpu, lib/gpu GRANULAR, granular systems, -, "Section_howto"_Section_howto.html#howto_6, pour, - KIM, openKIM potentials, Smirichinski & Elliot & Tadmor (3), "pair_style kim"_pair_kim.html, kim, KIM +KOKKOS, Kokkos-enabled styles, Trott & Edwards (4), "Section_accelerate"_Section_accelerate.html#acc_8, kokkos, lib/kokkos KSPACE, long-range Coulombic solvers, -, "kspace_style"_kspace_style.html, peptide, - MANYBODY, many-body potentials, -, "pair_style tersoff"_pair_tersoff.html, shear, - MEAM, modified EAM potential, Greg Wagner (Sandia), "pair_style meam"_pair_meam.html, meam, lib/meam MC, Monte Carlo options, -, "fix gcmc"_fix_gcmc.html, -, - MOLECULE, molecular system force fields, -, "Section_howto"_Section_howto.html#howto_3, peptide, - -OPT, optimized pair potentials, Fischer & Richie & Natoli (2), "Section accelerate"_Section_accelerate.html#acc_4, -, - +OPT, optimized pair styles, Fischer & Richie & Natoli (2), "Section accelerate"_Section_accelerate.html#acc_4, -, - PERI, Peridynamics models, Mike Parks (Sandia), "pair_style peri"_pair_peri.html, peri, - POEMS, coupled rigid body motion, Rudra Mukherjee (JPL), "fix poems"_fix_poems.html, rigid, lib/poems REAX, ReaxFF potential, Aidan Thompson (Sandia), "pair_style reax"_pair_reax.html, reax, lib/reax REPLICA, multi-replica methods, -, "Section_howto"_Section_howto.html#howto_5, tad, - RIGID, rigid bodies, -, "fix rigid"_fix_rigid.html, rigid, - SHOCK, shock loading methods, -, "fix msst"_fix_msst.html, -, - SRD, stochastic rotation dynamics, -, "fix srd"_fix_srd.html, srd, - VORONOI, Voronoi tesselations, Daniel Schwen (LANL), "compute voronoi/atom"_compute_voronoi_atom.html, -, Voro++ XTC, dumps in XTC format, -, "dump"_dump.html, -, - :tb(ea=c) The "Authors" column lists a name(s) if a specific person is responible for creating and maintaining the package. (1) The FLD package was created by Amit Kumar and Michael Bybee from Jonathan Higdon's group at UIUC. (2) The OPT package was created by James Fischer (High Performance Technologies), David Richie, and Vincent Natoli (Stone Ridge Technolgy). (3) The KIM package was created by Valeriu Smirichinski, Ryan Elliott, and Ellad Tadmor (U Minn). +(4) The KOKKOS package was created primarily by Christian Trott +(Sandia). It uses the Kokkos library which was developed by Carter +Edwards, Christian, and collaborators at Sandia. + The "Doc page" column links to either a portion of the "Section_howto"_Section_howto.html of the manual, or an input script command implemented as part of the package. 
The "Example" column is a sub-directory in the examples directory of the distribution which has an input script that uses the package. E.g. "peptide" refers to the examples/peptide directory. The "Library" column lists an external library which must be built first and which LAMMPS links to when it is built. If it is listed as lib/package, then the code for the library is under the lib directory of the LAMMPS distribution. See the lib/package/README file for info on how to build the library. If it is not listed as lib/package, then it is a third-party library not included in the LAMMPS distribution. See the src/package/README or src/package/Makefile.lammps file for info on where to download the library. "Section start"_Section_start.html#start_3_3 of the manual also gives details on how to build LAMMPS with both kinds of auxiliary libraries. :line :line 4.2 User packages :h4,link(pkg_2) The current list of user-contributed packages is as follows: Package, Description, Author(s), Doc page, Example, Pic/movie, Library USER-ATC, atom-to-continuum coupling, Jones & Templeton & Zimmerman (2), "fix atc"_fix_atc.html, USER/atc, "atc"_atc, lib/atc USER-AWPMD, wave-packet MD, Ilya Valuev (JIHT), "pair_style awpmd/cut"_pair_awpmd.html, USER/awpmd, -, lib/awpmd USER-CG-CMM, coarse-graining model, Axel Kohlmeyer (Temple U), "pair_style lj/sdk"_pair_sdk.html, USER/cg-cmm, "cg"_cg, - USER-COLVARS, collective variables, Fiorin & Henin & Kohlmeyer (3), "fix colvars"_fix_colvars.html, USER/colvars, "colvars"_colvars, lib/colvars USER-CUDA, NVIDIA GPU styles, Christian Trott (U Tech Ilmenau), "Section accelerate"_Section_accelerate.html#acc_7, USER/cuda, -, lib/cuda USER-EFF, electron force field, Andres Jaramillo-Botero (Caltech), "pair_style eff/cut"_pair_eff.html, USER/eff, "eff"_eff, - USER-FEP, free energy perturbation, Agilio Padua (U Blaise Pascal Clermont-Ferrand), "fix adapt/fep"_fix_adapt.html, USER/fep, -, - USER-LB, Lattice Boltzmann fluid, Colin Denniston (U Western Ontario), "fix lb/fluid"_fix_lb_fluid.html, USER/lb, -, - USER-MISC, single-file contributions, USER-MISC/README, USER-MISC/README, -, -, - USER-MOLFILE, "VMD"_VMD molfile plug-ins, Axel Kohlmeyer (Temple U), "dump molfile"_dump_molfile.html, -, -, VMD-MOLFILE USER-OMP, OpenMP threaded styles, Axel Kohlmeyer (Temple U), "Section accelerate"_Section_accelerate.html#acc_5, -, -, - USER-PHONON, phonon dynamical matrix, Ling-Ti Kong (Shanghai Jiao Tong U), "fix phonon"_fix_phonon.html, USER/phonon, -, - USER-QMMM, QM/MM coupling, Axel Kohlmeyer (Temple U), "fix qmmm"_fix_qmmm.html, lib/qmmm/example1, -, lib/qmmm USER-REAXC, C version of ReaxFF, Metin Aktulga (LBNL), "pair_style reaxc"_pair_reax_c.html, reax, -, - USER-SPH, smoothed particle hydrodynamics, Georg Ganzenmuller (EMI), "userguide.pdf"_USER/sph/SPH_LAMMPS_userguide.pdf, USER/sph, "sph"_sph, - :tb(ea=c) :link(atc,http://lammps.sandia.gov/pictures.html#atc) :link(cg,http://lammps.sandia.gov/pictures.html#cg) :link(eff,http://lammps.sandia.gov/movies.html#eff) :link(sph,http://lammps.sandia.gov/movies.html#sph) :link(VMD,http://www.ks.uiuc.edu/Research/vmd) The "Authors" column lists a name(s) if a specific person is responible for creating and maintaining the package. If the Library is not listed as lib/package, then it is a third-party library not included in the LAMMPS distribution. See the src/package/Makefile.lammps file for info on where to download the library from. (2) The ATC package was created by Reese Jones, Jeremy Templeton, and Jon Zimmerman (Sandia). 
(3) The COLVARS package was created by Axel Kohlmeyer (Temple U) using the colvars module library written by Giacomo Fiorin (Temple U) and Jerome Henin (LISM, Marseille, France). The "Doc page" column links to either a portion of the "Section_howto"_Section_howto.html of the manual, or an input script command implemented as part of the package, or to additional documentation provided witht he package. The "Example" column is a sub-directory in the examples directory of the distribution which has an input script that uses the package. E.g. "peptide" refers to the examples/peptide directory. USER/cuda refers to the examples/USER/cuda directory. The "Library" column lists an external library which must be built first and which LAMMPS links to when it is built. If it is listed as lib/package, then the code for the library is under the lib directory of the LAMMPS distribution. See the lib/package/README file for info on how to build the library. If it is not listed as lib/package, then it is a third-party library not included in the LAMMPS distribution. See the src/package/Makefile.lammps file for info on where to download the library. "Section start"_Section_start.html#start_3_3 of the manual also gives details on how to build LAMMPS with both kinds of auxiliary libraries. More details on each package, from the USER-*/README file is given below. :line USER-ATC package :h4 This package implements a "fix atc" command which can be used in a LAMMPS input script. This fix can be employed to either do concurrent coupling of MD with FE-based physics surrogates or on-the-fly post-processing of atomic information to continuum fields. See the doc page for the fix atc command to get started. At the bottom of the doc page are many links to additional documentation contained in the doc/USER/atc directory. There are example scripts for using this package in examples/USER/atc. This package uses an external library in lib/atc which must be compiled before making LAMMPS. See the lib/atc/README file and the LAMMPS manual for information on building LAMMPS with external libraries. The primary people who created this package are Reese Jones (rjones at sandia.gov), Jeremy Templeton (jatempl at sandia.gov) and Jon Zimmerman (jzimmer at sandia.gov) at Sandia. Contact them directly if you have questions. :line USER-AWPMD package :h4 This package contains a LAMMPS implementation of the Antisymmetrized Wave Packet Molecular Dynamics (AWPMD) method. See the doc page for the pair_style awpmd/cut command to get started. There are example scripts for using this package in examples/USER/awpmd. This package uses an external library in lib/awpmd which must be compiled before making LAMMPS. See the lib/awpmd/README file and the LAMMPS manual for information on building LAMMPS with external libraries. The person who created this package is Ilya Valuev at the JIHT in Russia (valuev at physik.hu-berlin.de). Contact him directly if you have questions. :line USER-CG-CMM package :h4 This package implements 3 commands which can be used in a LAMMPS input script: pair_style lj/sdk pair_style lj/sdk/coul/long angle_style sdk :ul These styles allow coarse grained MD simulations with the parametrization of Shinoda, DeVane, Klein, Mol Sim, 33, 27 (2007) (SDK), with extensions to simulate ionic liquids, electrolytes, lipids and charged amino acids. See the doc pages for these commands for details. There are example scripts for using this package in examples/USER/cg-cmm. 
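As a rough sketch of how these commands can appear in an input script (the cutoff, atom types, and coefficients below are placeholders for illustration only, not recommended values; see the doc pages above for the actual arguments):

pair_style lj/sdk/coul/long 15.0
pair_coeff 1 1 lj12_4 0.30 4.10
angle_style sdk
kspace_style pppm/cg 1.0e-4 :pre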
This is the second generation implementation reducing the clutter of the previous version. For many systems with electrostatics, it will be faster to use pair_style hybrid/overlay with lj/sdk and coul/long instead of the combined lj/sdk/coul/long style, since the number of charged atom types is usually small. For any other Coulomb interactions this is now required. To exploit this property, the use of the kspace_style pppm/cg is recommended over regular pppm. For all new styles, input file backward compatibility is provided. The old implementation is still available through appending the /old suffix. These will be discontinued and removed after the new implementation has been fully validated. The current version of this package should be considered beta quality. The CG potentials work correctly for "normal" situations, but have not been tested with all kinds of potential parameters and simulation systems. The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. :line USER-COLVARS package :h4 This package implements the "fix colvars" command which can be used in a LAMMPS input script. This fix allows the use of "collective variables" to implement Adaptive Biasing Force, Metadynamics, Steered MD, Umbrella Sampling and Restraints. This code consists of two parts: A portable collective variable module library written and maintained by Giacomo Fiorin (ICMS, Temple University, Philadelphia, PA, USA) and Jerome Henin (LISM, CNRS, Marseille, France). This code is located in the directory lib/colvars and needs to be compiled first. The colvars fix, an interface layer that exchanges information between LAMMPS and the collective variable module. :ul See the doc page of "fix colvars"_fix_colvars.html for more details. There are example scripts for using this package in examples/USER/colvars. This is a very new interface that does not yet support all features in the module and will see future optimizations and improvements. The colvars module library is also available in NAMD and has been thoroughly used and tested there. Bugs and problems are likely due to the interface layer code. Thus the current version of this package should be considered beta quality. The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. :line USER-CUDA package :h4 This package provides acceleration of various LAMMPS pair styles, fix styles, compute styles, and long-range Coulombics via PPPM for NVIDIA GPUs. See this section of the manual to get started: "Section_accelerate"_Section_accelerate.html#acc_7 There are example scripts for using this package in examples/USER/cuda. This package uses an external library in lib/cuda which must be compiled before making LAMMPS. See the lib/cuda/README file and the LAMMPS manual for information on building LAMMPS with external libraries. The person who created this package is Christian Trott at the University of Technology Ilmenau, Germany (christian.trott at tu-ilmenau.de). Contact him directly if you have questions. :line USER-EFF package :h4 This package contains a LAMMPS implementation of the electron Force Field (eFF) currently under development at Caltech, as described in A. Jaramillo-Botero, J. Su, Q. An, and W.A. Goddard III, JCC, 2010. The eFF potential was first introduced by Su and Goddard, in 2007.
eFF can be viewed as an approximation to QM wave packet dynamics and Fermionic molecular dynamics, combining the ability of electronic structure methods to describe atomic structure, bonding, and chemistry in materials, and of plasma methods to describe nonequilibrium dynamics of large systems with a large number of highly excited electrons. We classify it as a mixed QM-classical approach rather than a conventional force field method, which introduces QM-based terms (a spin-dependent repulsion term to account for the Pauli exclusion principle and the electron wavefunction kinetic energy associated with the Heisenberg principle) that reduce, along with classical electrostatic terms between nuclei and electrons, to the sum of a set of effective pairwise potentials. This makes eFF uniquely suited to simulate materials over a wide range of temperatures and pressures where electronically excited and ionized states of matter can occur and coexist. The necessary customizations to the LAMMPS core are in place to enable the correct handling of explicit electron properties during minimization and dynamics. See the doc page for the pair_style eff/cut command to get started. There are example scripts for using this package in examples/USER/eff. There are auxiliary tools for using this package in tools/eff. The person who created this package is Andres Jaramillo-Botero at CalTech (ajaramil at wag.caltech.edu). Contact him directly if you have questions. :line USER-FEP package :h4 This package provides methods for performing free energy perturbation simulations with soft-core pair potentials in LAMMPS. See these doc pages and their related commands to get started: "fix adapt/fep"_fix_adapt_fep.html "compute fep"_compute_fep.html "soft pair styles"_pair_lj_soft.html :ul The person who created this package is Agilio Padua at Université Blaise Pascal Clermont-Ferrand (agilio.padua at univ-bpclermont.fr) Contact him directly if you have questions. :line USER-LB package :h4 This package contains a LAMMPS implementation of a background Lattice-Boltzmann fluid, which can be used to model MD particles influenced by hydrodynamic forces. See this doc page and its related commands to get started: "fix lb/fluid"_fix_lb_fluid.html The people who created this package are Frances Mackay (fmackay at uwo.ca) and Colin (cdennist at uwo.ca) Denniston, University of Western Ontario. Contact them directly if you have questions. :line USER-MISC package :h4 The files in this package are a potpourri of (mostly) unrelated features contributed to LAMMPS by users. Each feature is a single pair of files (*.cpp and *.h). More information about each feature can be found by reading its doc page in the LAMMPS doc directory. The doc page which lists all LAMMPS input script commands is as follows: "Section_commands"_Section_commands.html#cmd_5 User-contributed features are listed at the bottom of the fix, compute, pair, etc sections. The list of features and author of each is given in the src/USER-MISC/README file. You should contact the author directly if you have specific questions about the feature or its coding. :line USER-MOLFILE package :h4 This package contains a dump molfile command which uses molfile plugins that are bundled with the "VMD"_http://www.ks.uiuc.edu/Research/vmd molecular visualization and analysis program, to enable LAMMPS to dump its information in formats compatible with various molecular simulation tools. The package only provides the interface code, not the plugins. 
These can be obtained from a VMD installation, which has to match the platform you are compiling LAMMPS for. By adding plugins to VMD, support for new file formats can be added to LAMMPS (or VMD or other programs that use them) without having to recompile the application itself. See this doc page to get started: "dump molfile"_dump_molfile.html The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. :line USER-OMP package :h4 This package provides OpenMP multi-threading support and other optimizations of various LAMMPS pair styles, dihedral styles, and fix styles. See this section of the manual to get started: "Section_accelerate"_Section_accelerate.html#acc_5 The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. :line USER-PHONON package :h4 This package contains a fix phonon command that calculates dynamical matrices, which can then be used to compute phonon dispersion relations, directly from molecular dynamics simulations. See this doc page to get started: "fix phonon"_fix_phonon.html The person who created this package is Ling-Ti Kong (konglt at sjtu.edu.cn) at Shanghai Jiao Tong University. Contact him directly if you have questions. :line USER-QMMM package :h4 This package provides a fix qmmm command which allows LAMMPS to be used in a QM/MM simulation, currently only in combination with the pw.x code from the "Quantum ESPRESSO"_espresso package. :link(espresso,http://www.quantum-espresso.org) The current implementation only supports an ONIOM style mechanical coupling to the Quantum ESPRESSO plane wave DFT package. Electrostatic coupling is in preparation and the interface has been written in a manner that coupling to other QM codes should be possible without changes to LAMMPS itself. See this doc page to get started: "fix qmmm"_fix_qmmm.html as well as the lib/qmmm/README file. The person who created this package is Axel Kohlmeyer at Temple U (akohlmey at gmail.com). Contact him directly if you have questions. :line USER-REAXC package :h4 This package contains an implementation for LAMMPS of the ReaxFF force field. ReaxFF uses distance-dependent bond-order functions to represent the contributions of chemical bonding to the potential energy. It was originally developed by Adri van Duin and the Goddard group at CalTech. The USER-REAXC version of ReaxFF (pair_style reax/c), implemented in C, should give identical or very similar results to pair_style reax, which is a ReaxFF implementation on top of a Fortran library, a version of which was originally authored by Adri van Duin. The reax/c version should be somewhat faster and more scalable, particularly with respect to the charge equilibration calculation. It should also be easier to build and use since there are no complicating issues with Fortran memory allocation or linking to a Fortran library. For technical details about this implementation of ReaxFF, see this paper: Parallel and Scalable Reactive Molecular Dynamics: Numerical Methods and Algorithmic Techniques, H. M. Aktulga, J. C. Fogarty, S. A. Pandit, A. Y. Grama, Parallel Computing, in press (2011). See the doc page for the pair_style reax/c command for details of how to use it in LAMMPS. The person who created this package is Hasan Metin Aktulga (hmaktulga at lbl.gov), while at Purdue University. Contact him directly, or Aidan Thompson at Sandia (athomps at sandia.gov), if you have questions.
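To give a flavor of how the package is used, a minimal reax/c setup might look like the following sketch; the force field file name and the element-to-type mapping are placeholders that must be replaced to match your own system:

atom_style charge
pair_style reax/c NULL
pair_coeff * * ffield.reax.cho C H O
fix 1 all qeq/reax 1 0.0 10.0 1.0e-6 reax/c :pre

The qeq/reax fix performs the charge equilibration that ReaxFF requires; see its doc page for the meaning of the numerical arguments.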
:line USER-SPH package :h4 This package implements smoothed particle hydrodynamics (SPH) in LAMMPS. Currently, the package has the following features: * Tait, ideal gas, Lennard-Jones equation of states, full support for complete (i.e. internal-energy dependent) equations of state * plain or Monaghans XSPH integration of the equations of motion * density continuity or density summation to propagate the density field * commands to set internal energy and density of particles from the input script * output commands to access internal energy and density for dumping and thermo output See the file doc/USER/sph/SPH_LAMMPS_userguide.pdf to get started. There are example scripts for using this package in examples/USER/sph. The person who created this package is Georg Ganzenmuller at the Fraunhofer-Institute for High-Speed Dynamics, Ernst Mach Institute in Germany (georg.ganzenmueller at emi.fhg.de). Contact him directly if you have questions. diff --git a/doc/Section_start.html b/doc/Section_start.html index f8c34710b..12abe89a3 100644 --- a/doc/Section_start.html +++ b/doc/Section_start.html @@ -1,1473 +1,1666 @@ <HTML> <CENTER><A HREF = "Section_intro.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> - <A HREF = "Section_commands.html">Next Section</A> </CENTER> <HR> <H3>2. Getting Started </H3> <P>This section describes how to build and run LAMMPS, for both new and experienced users. </P> 2.1 <A HREF = "#start_1">What's in the LAMMPS distribution</A><BR> 2.2 <A HREF = "#start_2">Making LAMMPS</A><BR> 2.3 <A HREF = "#start_3">Making LAMMPS with optional packages</A><BR> 2.4 <A HREF = "#start_4">Building LAMMPS via the Make.py script</A><BR> 2.5 <A HREF = "#start_5">Building LAMMPS as a library</A><BR> 2.6 <A HREF = "#start_6">Running LAMMPS</A><BR> 2.7 <A HREF = "#start_7">Command-line options</A><BR> 2.8 <A HREF = "#start_8">Screen output</A><BR> 2.9 <A HREF = "#start_9">Tips for users of previous versions</A> <BR> <HR> <HR> <H4><A NAME = "start_1"></A>2.1 What's in the LAMMPS distribution </H4> <P>When you download LAMMPS you will need to unzip and untar the downloaded file with the following commands, after placing the file in an appropriate directory. </P> <PRE>gunzip lammps*.tar.gz tar xvf lammps*.tar </PRE> <P>This will create a LAMMPS directory containing two files and several sub-directories: </P> <DIV ALIGN=center><TABLE BORDER=1 > <TR><TD >README</TD><TD > text file</TD></TR> <TR><TD >LICENSE</TD><TD > the GNU General Public License (GPL)</TD></TR> <TR><TD >bench</TD><TD > benchmark problems</TD></TR> <TR><TD >doc</TD><TD > documentation</TD></TR> <TR><TD >examples</TD><TD > simple test problems</TD></TR> <TR><TD >potentials</TD><TD > embedded atom method (EAM) potential files</TD></TR> <TR><TD >src</TD><TD > source files</TD></TR> <TR><TD >tools</TD><TD > pre- and post-processing tools </TD></TR></TABLE></DIV> <P>If you download one of the Windows executables from the download page, then you get a single file: </P> <PRE>lmp_windows.exe </PRE> <P>Skip to the <A HREF = "#start_6">Running LAMMPS</A> sections for info on how to launch these executables on a Windows box. </P> <P>The Windows executables for serial or parallel only include certain packages and bug-fixes/upgrades listed on <A HREF = "http://lammps.sandia.gov/bug.html">this page</A> up to a certain date, as stated on the download page. 
If you want something with more packages or that is more current, you'll have to download the source tarball and build it yourself from source code using Microsoft Visual Studio, as described in the next section. </P> <HR> <H4><A NAME = "start_2"></A>2.2 Making LAMMPS </H4> <P>This section has the following sub-sections: </P> <UL><LI><A HREF = "#start_2_1">Read this first</A> <LI><A HREF = "#start_2_2">Steps to build a LAMMPS executable</A> <LI><A HREF = "#start_2_3">Common errors that can occur when making LAMMPS</A> <LI><A HREF = "#start_2_4">Additional build tips</A> <LI><A HREF = "#start_2_5">Building for a Mac</A> <LI><A HREF = "#start_2_6">Building for Windows</A> </UL> <HR> <A NAME = "start_2_1"></A><B><I>Read this first:</I></B> <P>Building LAMMPS can be non-trivial. You may need to edit a makefile, there are compiler options to consider, additional libraries can be used (MPI, FFT, JPEG, PNG), LAMMPS packages may be included or excluded, some of these packages use auxiliary libraries which need to be pre-built, etc. </P> <P>Please read this section carefully. If you are not comfortable with makefiles, or building codes on a Unix platform, or running an MPI job on your machine, please find a local expert to help you. Many compiling, linking, and run problems that users have are often not LAMMPS issues - they are peculiar to the user's system, compilers, libraries, etc. Such questions are better answered by a local expert. </P> <P>If you have a build problem that you are convinced is a LAMMPS issue (e.g. the compiler complains about a line of LAMMPS source code), then please post a question to the <A HREF = "http://lammps.sandia.gov/mail.html">LAMMPS mail list</A>. </P> <P>If you succeed in building LAMMPS on a new kind of machine, for which there isn't a similar Makefile in the src/MAKE directory, send it to the developers and we can include it in the LAMMPS distribution. </P> <HR> <A NAME = "start_2_2"></A><B><I>Steps to build a LAMMPS executable:</I></B> <P><B>Step 0</B> </P> <P>The src directory contains the C++ source and header files for LAMMPS. It also contains a top-level Makefile and a MAKE sub-directory with low-level Makefile.* files for many machines. From within the src directory, type "make" or "gmake". You should see a list of available choices. If one of those is the machine and options you want, you can type a command like: </P> <PRE>make linux or gmake mac </PRE> <P>Note that on a multi-processor or multi-core platform you can launch a parallel make, by using the "-j" switch with the make command, which will build LAMMPS more quickly. </P> <P>If you get no errors and an executable like lmp_linux or lmp_mac is produced, you're done; it's your lucky day. </P> <P>Note that by default only a few of LAMMPS optional packages are installed. To build LAMMPS with optional packages, see <A HREF = "#start_3">this section</A> below. </P> <P><B>Step 1</B> </P> <P>If Step 0 did not work, you will need to create a low-level Makefile for your machine, like Makefile.foo. You should make a copy of an existing src/MAKE/Makefile.* as a starting point. The only portions of the file you need to edit are the first line, the "compiler/linker settings" section, and the "LAMMPS-specific settings" section. </P> <P><B>Step 2</B> </P> <P>Change the first line of src/MAKE/Makefile.foo to list the word "foo" after the "#", and whatever other options it will set. This is the line you will see if you just type "make".
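For example, the first line of a hypothetical src/MAKE/Makefile.foo might read:
</P>
<PRE># foo = my Linux workstation, g++, MPICH2, FFTW3
</PRE>
<P>The machine name and the comment describing it are placeholders; use whatever describes your own platform.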
</P> <P><B>Step 3</B> </P> <P>The "compiler/linker settings" section lists compiler and linker settings for your C++ compiler, including optimization flags. You can use g++, the open-source GNU compiler, which is available on all Unix systems. You can also use mpicc which will typically be available if MPI is installed on your system, though you should check which actual compiler it wraps. Vendor compilers often produce faster code. On boxes with Intel CPUs, we suggest using the commercial Intel icc compiler, which can be downloaded from <A HREF = "http://www.intel.com/software/products/noncom">Intel's compiler site</A>. </P> <P>If building a C++ code on your machine requires additional libraries, then you should list them as part of the LIB variable. </P> <P>The DEPFLAGS setting is what triggers the C++ compiler to create a dependency list for a source file. This speeds re-compilation when source (*.cpp) or header (*.h) files are edited. Some compilers do not support dependency file creation, or may use a different switch than -D. GNU g++ works with -D. If your compiler can't create dependency files, then you'll need to create a Makefile.foo patterned after Makefile.storm, which uses different rules that do not involve dependency files. Note that when you build LAMMPS for the first time on a new platform, a long list of *.d files will be printed out rapidly. This is not an error; it is the Makefile doing its normal creation of dependencies. </P> <P><B>Step 4</B> </P> <P>The "system-specific settings" section has several parts. Note that if you change any -D setting in this section, you should do a full re-compile, after typing "make clean" (which will describe different clean options). </P> <P>The LMP_INC variable is used to include options that turn on ifdefs within the LAMMPS code. The options that are currently recognized are: </P> <UL><LI>-DLAMMPS_GZIP <LI>-DLAMMPS_JPEG <LI>-DLAMMPS_PNG <LI>-DLAMMPS_FFMPEG <LI>-DLAMMPS_MEMALIGN <LI>-DLAMMPS_XDR <LI>-DLAMMPS_SMALLBIG <LI>-DLAMMPS_BIGBIG <LI>-DLAMMPS_SMALLSMALL <LI>-DLAMMPS_LONGLONG_TO_LONG <LI>-DPACK_ARRAY <LI>-DPACK_POINTER <LI>-DPACK_MEMCPY </UL> <P>The read_data and dump commands will read/write gzipped files if you compile with -DLAMMPS_GZIP. It requires that your machine supports the "popen" function in the standard runtime library and that a gzip executable can be found by LAMMPS during a run. </P> <P>If you use -DLAMMPS_JPEG, the <A HREF = "dump_image.html">dump image</A> command will be able to write out JPEG image files. For JPEG files, you must also link LAMMPS with a JPEG library, as described below. If you use -DLAMMPS_PNG, the <A HREF = "dump_image.html">dump image</A> command will be able to write out PNG image files. For PNG files, you must also link LAMMPS with a PNG library, as described below. If neither of those two defines is used, LAMMPS will only be able to write out uncompressed PPM image files. </P> <P>If you use -DLAMMPS_FFMPEG, the <A HREF = "dump_image.html">dump movie</A> command will be available to support on-the-fly generation of rendered movies without the need to store intermediate image files. It requires that your machine supports the "popen" function in the standard runtime library and that an FFmpeg executable can be found by LAMMPS during the run. </P> <P>Using -DLAMMPS_MEMALIGN=<bytes> enables the use of the posix_memalign() call instead of malloc() when large chunks of memory are allocated by LAMMPS.
This can help to make more efficient use of vector instructions of modern CPUs, since dynamically allocated memory has to be aligned on larger than default byte boundaries (e.g. 16 bytes instead of 8 bytes on x86 type platforms) for optimal performance. </P> <P>If you use -DLAMMPS_XDR, the build will include XDR compatibility files for doing particle dumps in XTC format. This is only necessary if your platform does not have its own XDR files available. See the Restrictions section of the <A HREF = "dump.html">dump</A> command for details. </P> <P>Use at most one of the -DLAMMPS_SMALLBIG, -DLAMMPS_BIGBIG, -DLAMMPS_SMALLSMALL settings. The default is -DLAMMPS_SMALLBIG. These settings refer to use of 4-byte (small) vs 8-byte (big) integers within LAMMPS, as specified in src/lmptype.h. The only reason to use the BIGBIG setting is to enable simulation of huge molecular systems (which store bond topology info) with more than 2 billion atoms, or to track the image flags of moving atoms that wrap around a periodic box more than 512 times. The only reason to use the SMALLSMALL setting is if your machine does not support 64-bit integers. See the <A HREF = "#start_2_4">Additional build tips</A> section below for more details. </P> <P>The -DLAMMPS_LONGLONG_TO_LONG setting may be needed if your system or MPI version does not recognize "long long" data types. In this case a "long" data type is likely already 64-bits, in which case this setting will convert to that data type. </P> <P>Using one of the -DPACK_ARRAY, -DPACK_POINTER, and -DPACK_MEMCPY options can make for faster parallel FFTs (in the PPPM solver) on some platforms. The -DPACK_ARRAY setting is the default. See the <A HREF = "kspace_style.html">kspace_style</A> command for info about PPPM. See Step 6 below for info about building LAMMPS with an FFT library. </P> <P><B>Step 5</B> </P> <P>The 3 MPI variables are used to specify an MPI library to build LAMMPS with. </P> <P>If you want LAMMPS to run in parallel, you must have an MPI library installed on your platform. If you use an MPI-wrapped compiler, such as "mpicc" to build LAMMPS, you should be able to leave these 3 variables blank; the MPI wrapper knows where to find the needed files. If not, and MPI is installed on your system in the usual place (under /usr/local), you also may not need to specify these 3 variables. On some large parallel machines which use "modules" for their compile/link environments, you may simply need to include the correct module in your build environment. Or the parallel machine may have a vendor-provided MPI which the compiler has no trouble finding. </P> <P>Failing this, with these 3 variables you can specify where the mpi.h file (MPI_INC) and the MPI library file (MPI_PATH) are found and the name of the library file (MPI_LIB). </P> <P>If you are installing MPI yourself, we recommend Argonne's MPICH2 or OpenMPI. MPICH can be downloaded from the <A HREF = "http://www.mcs.anl.gov/research/projects/mpich2/">Argonne MPI site</A>. OpenMPI can be downloaded from the <A HREF = "http://www.open-mpi.org">OpenMPI site</A>. Other MPI packages should also work. If you are running on a big parallel platform, your system people or the vendor should have already installed a version of MPI, which is likely to be faster than a self-installed MPICH or OpenMPI, so find out how to build and link with it. If you use MPICH or OpenMPI, you will have to configure and build it for your platform.
The MPI configure script should have compiler options to enable you to use the same compiler you are using for the LAMMPS build, which can avoid problems that can arise when linking LAMMPS to the MPI library. </P> <P>If you just want to run LAMMPS on a single processor, you can use the dummy MPI library provided in src/STUBS, since you don't need a true MPI library installed on your system. See the src/MAKE/Makefile.serial file for how to specify the 3 MPI variables in this case. You will also need to build the STUBS library for your platform before making LAMMPS itself. To build from the src directory, type "make stubs", or from the STUBS dir, type "make". This should create a libmpi_stubs.a file suitable for linking to LAMMPS. If the build fails, you will need to edit the STUBS/Makefile for your platform. </P> <P>The file STUBS/mpi.c provides a CPU timer function called MPI_Wtime() that calls gettimeofday(). If your system doesn't support gettimeofday(), you'll need to insert code to call another timer. Note that the ANSI-standard function clock() rolls over after an hour or so, and is therefore insufficient for timing long LAMMPS simulations. </P> <P><B>Step 6</B> </P> <P>The 3 FFT variables allow you to specify an FFT library which LAMMPS uses (for performing 1d FFTs) when running the particle-particle particle-mesh (PPPM) option for long-range Coulombics via the <A HREF = "kspace_style.html">kspace_style</A> command. </P> <P>LAMMPS supports various open-source or vendor-supplied FFT libraries for this purpose. If you leave these 3 variables blank, LAMMPS will use the open-source <A HREF = "http://kissfft.sf.net">KISS FFT library</A>, which is included in the LAMMPS distribution. This library is portable to all platforms and for typical LAMMPS simulations is almost as fast as FFTW or vendor optimized libraries. If you are not including the KSPACE package in your build, you can also leave the 3 variables blank. </P> <P>Otherwise, select which kinds of FFTs to use as part of the FFT_INC setting by a switch of the form -DFFT_XXX. Recommended values for XXX are: MKL, SCSL, FFTW2, and FFTW3. Legacy options are: INTEL, SGI, ACML, and T3E. For backward compatibility, using -DFFT_FFTW will use the FFTW2 library. Using -DFFT_NONE will use the KISS library described above. </P> <P>You may also need to set the FFT_INC, FFT_PATH, and FFT_LIB variables, so the compiler and linker can find the needed FFT header and library files. Note that on some large parallel machines which use "modules" for their compile/link environments, you may simply need to include the correct module in your build environment. Or the parallel machine may have a vendor-provided FFT library which the compiler has no trouble finding. </P> <P>FFTW is a fast, portable library that should also work on any platform. You can download it from <A HREF = "http://www.fftw.org">www.fftw.org</A>. Both the legacy version 2.1.X and the newer 3.X versions are supported as -DFFT_FFTW2 or -DFFT_FFTW3. Building FFTW for your box should be as simple as ./configure; make. Note that on some platforms FFTW2 has been pre-installed, and uses renamed files indicating the precision it was compiled with, e.g. sfftw.h, or dfftw.h instead of fftw.h. In this case, you can specify an additional define variable for FFT_INC called -DFFTW_SIZE, which will select the correct include file. In this case, for FFT_LIB you must also manually specify the correct library, namely -lsfftw or -ldfftw.
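</P>
<P>As a hypothetical illustration, the 3 FFT variables for a double-precision FFTW3 installation under /usr/local might be set as follows; adjust the paths to wherever FFTW is installed on your machine:
</P>
<PRE>FFT_INC =  -DFFT_FFTW3 -I/usr/local/include
FFT_PATH = -L/usr/local/lib
FFT_LIB =  -lfftw3
</PRE>
<P>If you compile with the -DFFT_SINGLE setting described below, the single-precision FFTW3 library (-lfftw3f) is typically needed instead.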
</P> <P>The FFT_INC variable also allows for a -DFFT_SINGLE setting that will use single-precision FFTs with PPPM, which can speed up long-range calculations, particularly in parallel or on GPUs. Fourier transform and related PPPM operations are somewhat insensitive to floating point truncation errors and thus do not always need to be performed in double precision. Using the -DFFT_SINGLE setting trades off a little accuracy for reduced memory use and parallel communication costs for transposing 3d FFT data. Note that single precision FFTs have only been tested with the FFTW3, FFTW2, MKL, and KISS FFT options. </P> <P><B>Step 7</B> </P> <P>The 3 JPG variables allow you to specify a JPEG and/or PNG library which LAMMPS uses when writing out JPEG or PNG files via the <A HREF = "dump_image.html">dump image</A> command. These can be left blank if you do not use the -DLAMMPS_JPEG or -DLAMMPS_PNG switches discussed above in Step 4, since in that case JPEG/PNG output will be disabled. </P> <P>A standard JPEG library usually goes by the name libjpeg.a or libjpeg.so and has an associated header file jpeglib.h. Whichever JPEG library you have on your platform, you'll need to set the appropriate JPG_INC, JPG_PATH, and JPG_LIB variables, so that the compiler and linker can find it. </P> <P>A standard PNG library usually goes by the name libpng.a or libpng.so and has an associated header file png.h. Whichever PNG library you have on your platform, you'll need to set the appropriate JPG_INC, JPG_PATH, and JPG_LIB variables, so that the compiler and linker can find it.
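</P>
<P>For example, with libjpeg and libpng installed under /usr/local, a hypothetical set of values might be:
</P>
<PRE>JPG_INC =  -I/usr/local/include
JPG_PATH = -L/usr/local/lib
JPG_LIB =  -ljpeg -lpng
</PRE>
<P>Only list the libraries that match the -D settings you enabled in Step 4.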
</P> <PRE>make makelist make -f Makefile.list linux gmake -f Makefile.list mac </PRE> <P>The first "make" command will create a current Makefile.list with all the file names in your src dir. The 2nd "make" command (make or gmake) will use it to build LAMMPS. Note that you should include/exclude any desired optional packages before using the "make makelist" command. </P> <P>(2) If you get an error that says something like 'identifier "atoll" is undefined', then your machine does not support "long long" integers. Try using the -DLAMMPS_LONGLONG_TO_LONG setting described above in Step 4. </P> <HR> <A NAME = "start_2_4"></A><B><I>Additional build tips:</I></B> <P>(1) Building LAMMPS for multiple platforms. </P> <P>You can make LAMMPS for multiple platforms from the same src directory. Each target creates its own object sub-directory called Obj_target where it stores the system-specific *.o files. </P> <P>(2) Cleaning up. </P> <P>Typing "make clean-all" or "make clean-machine" will delete *.o object files created when LAMMPS is built, for either all builds or for a particular machine. </P> <P>(3) Changing the LAMMPS size limits via -DLAMMPS_SMALLBIG or -DLAMMPS_BIGBIG or -DLAMMPS_SMALLSMALL </P> <P>As explained above, any of these 3 settings can be specified on the LMP_INC line in your low-level src/MAKE/Makefile.foo. </P> <P>The default is -DLAMMPS_SMALLBIG which allows for systems with up to 2^63 atoms and 2^63 timesteps (about 9e18). The atom limit is for atomic systems which do not store bond topology info and thus do not require atom IDs. If you use atom IDs for atomic systems (which is the default) or if you use a molecular model, which stores bond topology info and thus requires atom IDs, the limit is 2^31 atoms (about 2 billion). This is because the IDs are stored in 32-bit integers. </P> <P>Likewise, with this setting, the 3 image flags for each atom (see the <A HREF = "dump.html">dump</A> doc page for a discussion) are stored in a 32-bit integer, which means the atoms can only wrap around a periodic box (in each dimension) at most 512 times. If atoms move through the periodic box more than this many times, the image flags will "roll over", e.g. from 511 to -512, which can cause diagnostics like the mean-squared displacement, as calculated by the <A HREF = "compute_msd.html">compute msd</A> command, to be faulty. </P> <P>To allow for larger atomic systems with atom IDs or larger molecular systems or larger image flags, compile with -DLAMMPS_BIGBIG. This stores atom IDs and image flags in 64-bit integers. This enables atomic or molecular systems with atom IDS of up to 2^63 atoms (about 9e18). And image flags will not "roll over" until they reach 2^20 = 1048576. </P> <P>If your system does not support 8-byte integers, you will need to compile with the -DLAMMPS_SMALLSMALL setting. This will restrict the total number of atoms (for atomic or molecular systems) and timesteps to 2^31 (about 2 billion). Image flags will roll over at 2^9 = 512. </P> <P>Note that in src/lmptype.h there are definitions of all these data types as well as the MPI data types associated with them. The MPI types need to be consistent with the associated C data types, or else LAMMPS will generate a run-time error. As far as we know, the settings defined in src/lmptype.h are portable and work on every current system. </P> <P>In all cases, the size of problem that can be run on a per-processor basis is limited by 4-byte integer storage to 2^31 atoms per processor (about 2 billion). 
This should not normally be a limitation since such a problem would have a huge per-processor memory footprint due to neighbor lists and would run very slowly in terms of CPU secs/timestep. </P> <HR> <A NAME = "start_2_5"></A><B><I>Building for a Mac:</I></B> <P>OS X is BSD Unix, so it should just work. See the src/MAKE/Makefile.mac file. </P> <HR> <A NAME = "start_2_6"></A><B><I>Building for Windows:</I></B> <P>The LAMMPS download page has an option to download both a serial and parallel pre-built Windows executable. See the <A HREF = "#start_6">Running LAMMPS</A> section for instructions on running these executables on a Windows box. </P> <P>The pre-built executables hosted on the <A HREF = "http://lammps.sandia.gov/download.html">LAMMPS download page</A> are built with a subset of the available packages; see the download page for the list. These are single executable files. No examples or documentation is included. You will need to download the full source code package to obtain those. </P> <P>As an alternative, you can download "daily builds" (and some older versions) of the installer packages from <A HREF = "http://rpm.lammps.org/windows.html">rpm.lammps.org/windows.html</A>. These executables are built with most optional packages and the download includes documentation, some tools and most examples. </P> <P>If you want a Windows version with specific packages included and excluded, you can build it yourself. </P> <P>One way to do this is to install and use cygwin to build LAMMPS with a standard unix style make program, just as you would on a Linux box; see src/MAKE/Makefile.cygwin. </P> <P>The other way to do this is using Visual Studio and project files. See the src/WINDOWS directory and its README.txt file for instructions on both a basic build and a customized build with packages you select. </P> <HR> <H4><A NAME = "start_3"></A>2.3 Making LAMMPS with optional packages </H4> <P>This section has the following sub-sections: </P> <UL><LI><A HREF = "#start_3_1">Package basics</A> <LI><A HREF = "#start_3_2">Including/excluding packages</A> <LI><A HREF = "#start_3_3">Packages that require extra libraries</A> -<LI><A HREF = "#start_3_4">Additional Makefile settings for extra libraries</A> +<LI><A HREF = "#start_3_4">Packages that use make variable settings</A> </UL> <HR> <A NAME = "start_3_1"></A><B><I>Package basics:</I></B> <P>The source code for LAMMPS is structured as a set of core files which are always included, plus optional packages. Packages are groups of files that enable a specific set of features. For example, force fields for molecular systems or granular systems are in packages. You can see the list of all packages by typing "make package" from within the src directory of the LAMMPS distribution. </P> <P>If you use a command in a LAMMPS input script that is specific to a particular package, you must have built LAMMPS with that package, else you will get an error that the style is invalid or the command is unknown. Every command's doc page specifies if it is part of a package. You can also type </P> <PRE>lmp_machine -h </PRE> <P>to run your executable with the optional <A HREF = "#start_7">-h command-line switch</A> for "help", which will list the styles and commands known to your executable. </P> <P>There are two kinds of packages in LAMMPS, standard and user packages. More information about the contents of standard and user packages is given in <A HREF = "Section_packages.html">Section_packages</A> of the manual.
The difference between standard and user packages is as follows: </P> <P>Standard packages are supported by the LAMMPS developers and are written in a syntax and style consistent with the rest of LAMMPS. This means we will answer questions about them, debug and fix them if necessary, and keep them compatible with future changes to LAMMPS. </P> <P>User packages have been contributed by users, and always begin with the user prefix. If they are a single command (single file), they are typically in the user-misc package. Otherwise, they are a set of files grouped together which add a specific functionality to the code. </P> <P>User packages don't necessarily meet the requirements of the standard packages. If you have problems using a feature provided in a user package, you will likely need to contact the contributor directly to get help. Information on how to submit additions you make to LAMMPS as a user-contributed package is given in <A HREF = "Section_modify.html#mod_15">this section</A> of the documentation. </P> <P>Some packages (both standard and user) require additional libraries. See more details below. </P> <HR> <A NAME = "start_3_2"></A><B><I>Including/excluding packages:</I></B> <P>To use or not use a package you must include or exclude it before building LAMMPS. From the src directory, this is typically as simple as: </P> <PRE>make yes-colloid make g++ </PRE> <P>or </P> <PRE>make no-manybody make g++ </PRE> <P>IMPORTANT NOTE: You should NOT include/exclude packages and build LAMMPS in a single make command by using multiple targets, e.g. make yes-colloid g++. This is because the make procedure creates a list of source files that will be out-of-date for the build if the package configuration changes during the same command. </P> <P>Some packages have individual files that depend on other packages being included. LAMMPS checks for this and does the right thing. I.e. individual files are only included if their dependencies are already included. Likewise, if a package is excluded, other files dependent on that package are also excluded. </P> <P>The reason to exclude packages is if you will never run certain kinds of simulations. For some packages, this will keep you from having to build auxiliary libraries (see below), and will also produce a smaller executable which may run a bit faster. </P> <P>When you download a LAMMPS tarball, these packages are pre-installed in the src directory: KSPACE, MANYBODY, MOLECULE. When you download LAMMPS source files from the SVN or Git repositories, no packages are pre-installed. </P> <P>Packages are included or excluded by typing "make yes-name" or "make no-name", where "name" is the name of the package in lower-case, e.g. name = kspace for the KSPACE package or name = user-atc for the USER-ATC package. You can also type "make yes-standard", "make no-standard", "make yes-user", "make no-user", "make yes-all" or "make no-all" to include/exclude various sets of packages. Type "make package" to see all of the package-related make options. </P> <P>IMPORTANT NOTE: Inclusion/exclusion of a package works by simply moving files back and forth between the main src directory and sub-directories with the package name (e.g. src/KSPACE, src/USER-ATC), so that the files are seen or not seen when LAMMPS is built. After you have included or excluded a package, you must re-build LAMMPS. </P> <P>Additional package-related make options exist to help manage LAMMPS files that exist in both the src directory and in package sub-directories.
You do not normally need to use these commands unless you are editing LAMMPS files or have downloaded a patch from the LAMMPS WWW site. </P> <P>Typing "make package-update" will overwrite src files with files from the package sub-directories if the package has been included. It should be used after a patch is installed, since patches only update the files in the package sub-directory, but not the src files. Typing "make package-overwrite" will overwrite files in the package sub-directories with src files. </P> <P>Typing "make package-status" will show which packages are currently included. Of those that are included, it will list files that are different in the src directory and package sub-directory. Typing "make package-diff" lists all differences between these files. Again, type "make package" to see all of the package-related make options. </P> <HR> <A NAME = "start_3_3"></A><B><I>Packages that require extra libraries:</I></B> <P>A few of the standard and user packages require additional auxiliary libraries. They must be compiled first, before LAMMPS is built. If you get a LAMMPS build error about a missing library, this is likely the reason. See the <A HREF = "Section_packages.html">Section_packages</A> doc page for a list of packages that have auxiliary libraries. </P> <P>Code for some of these auxiliary libraries is included in the LAMMPS distribution under the lib directory. Examples are the USER-ATC and -MEAM packages. Some auxiliary libraries are not included with LAMMPS; +MEAM packages. Some auxiliary libraries are NOT included with LAMMPS; to use the associated package you must download and install the auxiliary library yourself. Examples are the KIM and VORONOI and USER-MOLFILE packages. </P> <P>For libraries with provided source code, each lib directory has a README file (e.g. lib/reax/README) with instructions on how to build that library. Typically this is done by typing something like: </P> <PRE>make -f Makefile.g++ </PRE> -<P>If one of the provided Makefiles is not -appropriate for your system you will need to edit or add one. -Note that all the Makefiles have a setting for EXTRAMAKE at -the top that names a Makefile.lammps.* file. +<P>If one of the provided Makefiles is not appropriate for your system +you will need to edit or add one. Note that all the Makefiles have a +setting for EXTRAMAKE at the top that specifies a Makefile.lammps.* +file. </P> -<P>If successful, this will produce 2 files in the lib directory: +<P>If the library build is successful, it will produce 2 files in the lib +directory: </P> <PRE>libpackage.a Makefile.lammps </PRE> -<P>The Makefile.lammps file is a copy of the EXTRAMAKE file specified -in the Makefile you used. -</P> -<P>You MUST insure that the settings in Makefile.lammps are appropriate -for your system. If they are not, the LAMMPS build will fail. -</P> -<P>As explained in the lib/package/README files, they are used to specify -additional system libraries and their locations so that LAMMPS can -build with the auxiliary library. For example, if the MEAM or REAX -packages are used, the auxiliary libraries consist of F90 code, build -with a F90 complier. To link that library with LAMMPS (a C++ code) -via whatever C++ compiler LAMMPS is built with, typically requires -additional Fortran-to-C libraries be included in the link. Another -example are the BLAS and LAPACK libraries needed to use the USER-ATC -or USER-AWPMD packages. 
+<P>The Makefile.lammps file will be a copy of the EXTRAMAKE file setting +specified in the library Makefile.* you used. +</P> +<P>Note that you must insure that the settings in Makefile.lammps are +appropriate for your system. If they are not, the LAMMPS build will +fail. +</P> +<P>As explained in the lib/package/README files, the settings in +Makefile.lammps are used to specify additional system libraries and +their locations so that LAMMPS can build with the auxiliary library. +For example, if the MEAM or REAX packages are used, the auxiliary +libraries consist of F90 code, built with a Fortran compiler. To link +that library with LAMMPS (a C++ code) via whatever C++ compiler LAMMPS +is built with, typically requires additional Fortran-to-C libraries be +included in the link. Another example are the BLAS and LAPACK +libraries needed to use the USER-ATC or USER-AWPMD packages. </P> <P>For libraries without provided source code, see the src/package/Makefile.lammps file for information on where to find the library and how to build it. E.g. the file src/KIM/Makefile.lammps or src/VORONOI/Makefile.lammps or src/USER-MOLFILE/Makefile.lammps. These files serve the same purpose as the lib/package/Makefile.lammps files described above. The files have settings needed when LAMMPS is -built to link with the corresponding auxiliary library. Again, you -MUST insure that the settings in src/package/Makefile.lammps are -appropriate for your system and where you installed the auxiliary -library. If they are not, the LAMMPS build will fail. +built to link with the corresponding auxiliary library. +</P> +<P>Again, you must insure that the settings in +src/package/Makefile.lammps are appropriate for your system and where +you installed the auxiliary library. If they are not, the LAMMPS +build will fail. +</P> +<HR> + +<A NAME = "start_3_4"></A><B><I>Packages that use make variable settings</I></B> + +<P>One package, the KOKKOS package, allows its build options to be +specified by setting variables via the "make" command, rather than by +first building an auxiliary library and editing a Makefile.lammps +file, as discussed in the previous sub-section for other packages. +This is for convenience since it is common to want to experiment with +different Kokkos library options. Using variables enables a direct +re-build of LAMMPS and its Kokkos dependencies, so that a benchmark +test with different Kokkos options can be quickly performed. +</P> +<P>The syntax for setting make variables is as follows. You must +use a GNU-compatible make command for this to work. Try "gmake" +if your system's standard make complains. +</P> +<PRE>make yes-kokkos +make g++ VAR1=value VAR2=value ... +</PRE> +<P>The first line installs the KOKKOS package, which only needs to be +done once. The second line builds LAMMPS with src/MAKE/Makefile.g++ +and optionally sets one or more variables that affect the build. Each +variable is specified in upper-case; its value follows an equal sign +with no spaces. The second line can be repeated with different +variable settings, though a "clean" must be done before the rebuild. +Type "make clean" to see options for this operation. +</P> +<P>These are the variables that can be specified. Each takes a value of +<I>yes</I> or <I>no</I>. The default value is listed, which is set in the +lib/kokkos/Makefile.lammps file. See <A HREF = "Section_accelerate.html#acc_8">this +section</A> for a discussion of what is +meant by "host" and "device" in the Kokkos context.
+</P> +<UL><LI>OMP, default = <I>yes</I> +<LI>CUDA, default = <I>no</I> +<LI>HWLOC, default = <I>no</I> +<LI>AVX, default = <I>no</I> +<LI>MIC, default = <I>no</I> +<LI>LIBRT, default = <I>no</I> +<LI>DEBUG, default = <I>no</I> +</UL> +<P>OMP sets the parallelization method used for Kokkos code (within +LAMMPS) that runs on the host. OMP=yes means that OpenMP will be +used. OMP=no means that pthreads will be used. +</P> +<P>CUDA sets the parallelization method used for Kokkos code (within +LAMMPS) that runs on the device. CUDA=yes means an NVIDIA GPU running +CUDA will be used. CUDA=no means that the OMP=yes or OMP=no setting +will be used for the device as well as the host. +</P> +<P>If CUDA=yes, then the lo-level Makefile in the src/MAKE directory must +use "nvcc" as its compiler, via its CC setting. For best performance +its CCFLAGS setting should use -O3 and have an -arch setting that +matches the compute capability of your NVIDIA hardware and software +installation, e.g. -arch=sm_20. Generally Fermi generation GPUs are +sm_20, while Kepler generation GPUs are sm_30 or sm_35 and Maxwell +cards are sm_50. A complete list can be found on +<A HREF = "http://en.wikipedia.org/wiki/CUDA#Supported_GPUs">wikipedia</A>. You can +also use the deviceQuery tool that comes with the CUDA samples. Note +the minimal required compute capability is 2.0, but this will give +significantly reduced performance compared to Kepler generation GPUs +with compute capability 3.x. For the LINK setting, "nvcc" should not +be used; instead use g++ or another compiler suitable for linking C++ +applications. Often you will want to use your MPI compiler wrapper +for this setting (i.e. mpicxx). Finally, the lo-level Makefile must +also have a "Compilation rule" for creating *.o files from *.cu files. +See src/Makefile.cuda for an example of a lo-level Makefile with all +of these settings. +</P> +<P>HWLOC binds threads to hardware cores, so they do not migrate during a +simulation. HWLOC=yes should always be used if running with OMP=no +for pthreads. It is not necessary for OMP=yes for OpenMP, because +OpenMP provides alternative methods via environment variables for +binding threads to hardware cores. More info on binding threads to +cores is given in <A HREF = "Section_accelerate.html#acc_8">this section</A>. +</P> +<P>AVX enables Intel advanced vector extensions when compiling for an +Intel-compatible chip. AVX=yes should only be set if your host +hardware supports AVX. If it does not support it, this will cause a +run-time crash. +</P> +<P>MIC enables compiler switches needed when compiling for an Intel Phi +processor. +</P> +<P>LIBRT enables use of a more accurate timer mechanism on most Unix +platforms. This library is not available on all platforms. +</P> +<P>DEBUG is only useful when developing a Kokkos-enabled style within +LAMMPS. DEBUG=yes enables printing of run-time debugging information +that can be useful. It also enables runtime bounds checking on Kokkos +data structures. </P> <HR> <H4><A NAME = "start_4"></A>2.4 Building LAMMPS via the Make.py script </H4> <P>The src directory includes a Make.py script, written in Python, which can be used to automate various steps of the build process. </P> <P>You can run the script from the src directory by typing either: </P> <PRE>Make.py python Make.py </PRE> <P>which will give you info about the tool. For the former to work, you may need to edit the 1st line of the script to point to your local Python.
And you may need to insure the script is executable: </P> <PRE>chmod +x Make.py </PRE> <P>The following options are supported as switches: </P> <UL><LI>-i file1 file2 ... <LI>-p package1 package2 ... <LI>-u package1 package2 ... <LI>-e package1 arg1 arg2 package2 ... <LI>-o dir <LI>-b machine <LI>-s suffix1 suffix2 ... <LI>-l dir <LI>-j N <LI>-h switch1 switch2 ... </UL> <P>Help on any switch can be listed by using -h, e.g. </P> <PRE>Make.py -h -i -p </PRE> <P>At a hi-level, these are the kinds of package management and build tasks that can be performed easily, using the Make.py tool: </P> <UL><LI>install/uninstall packages and build the associated external libs (use -p and -u and -e) <LI>install packages needed for one or more input scripts (use -i and -p) <LI>build LAMMPS, either in the src dir or new dir (use -b) <LI>create a new dir with only the source code needed for one or more input scripts (use -i and -o) </UL> <P>The last bullet can be useful when you wish to build a stripped-down version of LAMMPS to run a specific script(s). Or when you wish to move the minimal amount of files to another platform for a remote LAMMPS build. </P> <P>Note that using Make.py is not a substitute for insuring you have a valid src/MAKE/Makefile.foo for your system, or that external library Makefiles in any lib/* directories you use are also valid for your system. But once you have done that, you can use Make.py to quickly include/exclude the packages and external libraries needed by your input scripts. </P> <HR> <H4><A NAME = "start_5"></A>2.5 Building LAMMPS as a library </H4> <P>LAMMPS can be built as either a static or shared library, which can then be called from another application or a scripting language. See <A HREF = "Section_howto.html#howto_10">this section</A> for more info on coupling LAMMPS to other codes. See <A HREF = "Section_python.html">this section</A> for more info on wrapping and running LAMMPS from Python. </P> <H5><B>Static library:</B> </H5> <P>To build LAMMPS as a static library (*.a file on Linux), type </P> <PRE>make makelib make -f Makefile.lib foo </PRE> <P>where foo is the machine name. This kind of library is typically used to statically link a driver application to LAMMPS, so that you can insure all dependencies are satisfied at compile time. Note that inclusion or exclusion of any desired optional packages should be done before typing "make makelib". The first "make" command will create a current Makefile.lib with all the file names in your src dir. The second "make" command will use it to build LAMMPS as a static library, using the ARCHIVE and ARFLAGS settings in src/MAKE/Makefile.foo. The build will create the file liblammps_foo.a which another application can link to. </P> <H5><B>Shared library:</B> </H5> <P>To build LAMMPS as a shared library (*.so file on Linux), which can be dynamically loaded, e.g. from Python, type </P> <PRE>make makeshlib make -f Makefile.shlib foo </PRE> <P>where foo is the machine name. This kind of library is required when wrapping LAMMPS with Python; see <A HREF = "Section_python.html">Section_python</A> for details. Again, note that inclusion or exclusion of any desired optional packages should be done before typing "make makelib". The first "make" command will create a current Makefile.shlib with all the file names in your src dir. The second "make" command will use it to build LAMMPS as a shared library, using the SHFLAGS and SHLIBFLAGS settings in src/MAKE/Makefile.foo. 
The build will create the file liblammps_foo.so which another application can link to dynamically. It will also create a soft link liblammps.so, which the Python wrapper uses by default. </P> <P>Note that for a shared library to be usable by a calling program, all the auxiliary libraries it depends on must also exist as shared libraries. This will be the case for libraries included with LAMMPS, such as the dummy MPI library in src/STUBS or any package libraries in lib/packages, since they are always built as shared libraries with the -fPIC switch. However, if a library like MPI or FFTW does not exist as a shared library, the second make command will generate an error. This means you will need to install a shared library version of the package. The build instructions for the library should tell you how to do this. </P> <P>As an example, here is how to build and install the <A HREF = "http://www-unix.mcs.anl.gov/mpi">MPICH library</A>, a popular open-source version of MPI, distributed by Argonne National Labs, as a shared library in the default /usr/local/lib location: </P> <PRE>./configure --enable-shared make make install </PRE> <P>You may need to use "sudo make install" in place of the last line if you do not have write privileges for /usr/local/lib. The end result should be the file /usr/local/lib/libmpich.so. </P> <H5><B>Additional requirement for using a shared library:</B> </H5> <P>The operating system finds shared libraries to load at run-time using the environment variable LD_LIBRARY_PATH. So you may wish to copy the file src/liblammps.so or src/liblammps_g++.so (for example) to a place the system can find it by default, such as /usr/local/lib, or you may wish to add the LAMMPS src directory to LD_LIBRARY_PATH, so that the current version of the shared library is always available to programs that use it. </P> <P>For the csh or tcsh shells, you would add something like this to your ~/.cshrc file: </P> <PRE>setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src </PRE> <H5><B>Calling the LAMMPS library:</B> </H5> <P>Either flavor of library (static or shared) allows one or more LAMMPS objects to be instantiated from the calling program. </P> <P>When used from a C++ program, all of LAMMPS is wrapped in a LAMMPS_NS namespace; you can safely use any of its classes and methods from within the calling code, as needed. </P> <P>When used from a C or Fortran program or a scripting language like Python, the library has a simple function-style interface, provided in src/library.cpp and src/library.h. </P> <P>See the sample codes in examples/COUPLE/simple for examples of C++ and C and Fortran codes that invoke LAMMPS thru its library interface. There are other examples as well in the COUPLE directory which are discussed in <A HREF = "Section_howto.html#howto_10">Section_howto 10</A> of the manual. See <A HREF = "Section_python.html">Section_python</A> of the manual for a description of the Python wrapper provided with LAMMPS that operates through the LAMMPS library interface. </P> <P>The files src/library.cpp and library.h define the C-style API for using LAMMPS as a library. See <A HREF = "Section_howto.html#howto_19">Section_howto 19</A> of the manual for a description of the interface and how to extend it for your needs. </P> <HR> <H4><A NAME = "start_6"></A>2.6 Running LAMMPS </H4> <P>By default, LAMMPS runs by reading commands from standard input. Thus if you run the LAMMPS executable by itself, e.g.
</P> <PRE>lmp_linux </PRE> <P>it will simply wait, expecting commands from the keyboard. Typically you should put commands in an input script and use I/O redirection, e.g. </P> <PRE>lmp_linux < in.file </PRE> <P>For parallel environments this should also work. If it does not, use the '-in' command-line switch, e.g. </P> <PRE>lmp_linux -in in.file </PRE> <P><A HREF = "Section_commands.html">This section</A> describes how input scripts are structured and what commands they contain. </P> <P>You can test LAMMPS on any of the sample inputs provided in the examples or bench directory. Input scripts are named in.* and sample outputs are named log.*.name.P where name is a machine and P is the number of processors it was run on. </P> <P>Here is how you might run a standard Lennard-Jones benchmark on a Linux box, using mpirun to launch a parallel job: </P> <PRE>cd src make linux cp lmp_linux ../bench cd ../bench mpirun -np 4 lmp_linux < in.lj </PRE> <P>See <A HREF = "http://lammps.sandia.gov/bench.html">this page</A> for timings for this and the other benchmarks on various platforms. Note that some of the example scripts require LAMMPS to be built with one or more of its optional packages. </P> <HR> <P>On a Windows box, you can skip making LAMMPS and simply download an executable, as described above, though the pre-packaged executables include only certain packages. </P> <P>To run a LAMMPS executable on a Windows machine, first decide whether you want to download the non-MPI (serial) or the MPI (parallel) version of the executable. Download and save the version you have chosen. </P> <P>For the non-MPI version, follow these steps: </P> <UL><LI>Get a command prompt by going to Start->Run... , then typing "cmd". <LI>Move to the directory where you have saved lmp_win_no-mpi.exe (e.g. by typing: cd "Documents"). <LI>At the command prompt, type "lmp_win_no-mpi -in in.lj", replacing in.lj with the name of your LAMMPS input script. </UL> <P>For the MPI version, which allows you to run LAMMPS under Windows on multiple processors, follow these steps: </P> <UL><LI>Download and install <A HREF = "http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads">MPICH2</A> for Windows. <LI>You'll need to use the mpiexec.exe and smpd.exe files from the MPICH2 package. Put them in the same directory (or path) as the LAMMPS Windows executable. <LI>Get a command prompt by going to Start->Run... , then typing "cmd". <LI>Move to the directory where you have saved lmp_win_mpi.exe (e.g. by typing: cd "Documents"). <LI>Then type something like this: "mpiexec -localonly 4 lmp_win_mpi -in in.lj", replacing in.lj with the name of your LAMMPS input script. <LI>Note that you may need to provide smpd with a passphrase (it doesn't matter what you type). <LI>In this mode, output may not immediately show up on the screen, so if your input script takes a long time to execute, you may need to be patient before the output shows up. <LI>Alternatively, you can still use this executable to run on a single processor by typing something like: "lmp_win_mpi -in in.lj". </UL> <HR> <P>The screen output from LAMMPS is described in the next section. As it runs, LAMMPS also writes a log.lammps file with the same information. </P> <P>Note that this sequence of commands copies the LAMMPS executable (lmp_linux) to the directory with the input files.
This may not be necessary, but some versions of MPI reset the working directory to where the executable is, rather than leave it as the directory where you launch mpirun from (if you launch lmp_linux on its own and not under mpirun). If that happens, LAMMPS will look for additional input files and write its output files to the executable directory, rather than your working directory, which is probably not what you want. </P> <P>If LAMMPS encounters errors in the input script or while running a simulation it will print an ERROR message and stop or a WARNING message and continue. See <A HREF = "Section_errors.html">Section_errors</A> for a discussion of the various kinds of errors LAMMPS can or can't detect, a list of all ERROR and WARNING messages, and what to do about them. </P> <P>LAMMPS can run a problem on any number of processors, including a single processor. In theory you should get identical answers on any number of processors and on any machine. In practice, numerical round-off can cause slight differences and eventual divergence of molecular dynamics phase space trajectories. </P> <P>LAMMPS can run as large a problem as will fit in the physical memory of one or more processors. If you run out of memory, you must run on more processors or setup a smaller problem. </P> <HR> <H4><A NAME = "start_7"></A>2.7 Command-line options </H4> <P>At run time, LAMMPS recognizes several optional command-line switches which may be used in any order. Either the full word or a one-or-two letter abbreviation can be used: </P> <UL><LI>-c or -cuda <LI>-e or -echo <LI>-i or -in <LI>-h or -help +<LI>-k or -kokkos <LI>-l or -log <LI>-nc or -nocite <LI>-p or -partition <LI>-pl or -plog <LI>-ps or -pscreen <LI>-r or -restart <LI>-ro or -reorder <LI>-sc or -screen <LI>-sf or -suffix <LI>-v or -var </UL> <P>For example, lmp_ibm might be launched as follows: </P> <PRE>mpirun -np 16 lmp_ibm -v f tmp.out -l my.log -sc none < in.alloy mpirun -np 16 lmp_ibm -var f tmp.out -log my.log -screen none < in.alloy </PRE> <P>Here are the details on the options: </P> <PRE>-cuda on/off </PRE> <P>Explicitly enable or disable CUDA support, as provided by the USER-CUDA package. If LAMMPS is built with this package, as described above in <A HREF = "#start_3">Section 2.3</A>, then by default LAMMPS will run in CUDA mode. If this switch is set to "off", then it will not, even if it was built with the USER-CUDA package, which means you can run standard LAMMPS or with the GPU package for testing or benchmarking purposes. The only reason to set the switch to "on", is to check if LAMMPS was built with the USER-CUDA package, since an error will be generated if it was not. </P> <PRE>-echo style </PRE> <P>Set the style of command echoing. The style can be <I>none</I> or <I>screen</I> or <I>log</I> or <I>both</I>. Depending on the style, each command read from the input script will be echoed to the screen and/or logfile. This can be useful to figure out which line of your script is causing an input error. The default value is <I>log</I>. The echo style can also be set by using the <A HREF = "echo.html">echo</A> command in the input script itself. </P> <PRE>-in file </PRE> <P>Specify a file to use as an input script. This is an optional switch when running LAMMPS in one-partition mode. If it is not specified, LAMMPS reads its script from standard input, typically from a script via I/O redirection; e.g. lmp_linux < in.run. 
I/O redirection should also work in parallel, but if it does not (in the unlikely case that an MPI implementation does not support it), then use the -in flag. Note that this is a required switch when running LAMMPS in multi-partition mode, since multiple processors cannot all read from stdin. </P> <PRE>-help </PRE> <P>Print a brief help summary and a list of options compiled into this executable for each LAMMPS style (atom_style, fix, compute, pair_style, bond_style, etc). This can tell you if the command you want to use was included via the appropriate package at compile time. LAMMPS will print the info and immediately exit if this switch is used. </P> +<PRE>-kokkos on/off keyword/value ... +</PRE> +<P>Explicitly enable or disable Kokkos support, as provided by the KOKKOS +package. If LAMMPS is built with this package, as described above in +<A HREF = "#start_3">Section 2.3</A>, then by default LAMMPS will run in Kokkos +mode. If this switch is set to "off", then it will not, even if it +was built with the KOKKOS package, which means you can run standard +LAMMPS styles or use styles enhanced by other acceleration packages, +such as the GPU or USER-CUDA or USER-OMP packages, for testing or +benchmarking purposes. The only reason to set the switch to "on" is +to check if LAMMPS was built with the KOKKOS package, since an error +will be generated if it was not. +</P> +<P>Additional optional keyword/value pairs can be specified which +determine how Kokkos will use the underlying hardware on your +platform. These settings apply to each MPI task you launch via the +"mpirun" or "mpiexec" command. You may choose to run one or more MPI +tasks per physical node. Note that if you are running on a desktop +machine, you typically have one physical node. On a cluster or +supercomputer there may be dozens or 1000s of physical nodes. +</P> +<P>Either the full word or an abbreviation can be used for the keywords. +Note that the keywords do not use a leading minus sign. I.e. the +keyword is "t", not "-t". Also note that each of the keywords has a +default setting. More explanation as to when to use these options and +what settings to use on different platforms is given in <A HREF = "Section_accelerate.html#acc_8">this +section</A>. +</P> +<UL><LI>d or device +<LI>g or gpus +<LI>t or threads +<LI>n or numa +</UL> +<PRE>device Nd +</PRE> +<P>This option is only relevant if you built LAMMPS with CUDA=yes, you +have more than one GPU per node, and if you are running with only one +MPI task per node. The Nd setting is the ID of the GPU on the node to +run on. By default Nd = 0. If you have multiple GPUs per node, they +have consecutive IDs numbered as 0,1,2,etc. This setting allows you +to launch multiple independent jobs on the node, each with a single +MPI task per node, and assign each job to run on a different GPU. +</P> +<PRE>gpus Ng Ns +</PRE> +<P>This option is only relevant if you built LAMMPS with CUDA=yes, you +have more than one GPU per node, and you are running with multiple MPI +tasks per node (up to one per GPU). The Ng setting is how many GPUs +you will use. The Ns setting is optional. If set, it is the ID of a +GPU to skip when assigning MPI tasks to GPUs. This may be useful if +your desktop system reserves one GPU to drive the screen and the rest +are intended for computational work like running LAMMPS. By default +Ng = 1 and Ns is not set.
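+</P> +<P>As an illustration, a job that uses 2 GPUs per compute node, with one +MPI task per GPU, might be launched with a command line like this one +(the executable name, task count, and keyword values are hypothetical; +adjust them for your machine): +</P> +<PRE>mpirun -np 2 lmp_g++ -k on g 2 -sf kk -in in.lj +</PRE>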
+<P>Depending on which flavor of MPI you are running, LAMMPS will look for +one of these 3 environment variables +</P> +<PRE>SLURM_LOCALID (various MPI variants compiled with SLURM support) +MV2_COMM_WORLD_LOCAL_RANK (Mvapich) +OMPI_COMM_WORLD_LOCAL_RANK (OpenMPI) +</PRE> +<P>which are initialized by the "srun", "mpirun" or "mpiexec" commands. +The environment variable setting for each MPI rank is used to assign a +unique GPU ID to the MPI task. +</P> +<PRE>threads Nt +</PRE> +<P>This option assigns Nt threads to each MPI task for +performing work when Kokkos is executing in OpenMP or pthreads mode. +The default is Nt = 1, which essentially runs in MPI-only mode. If +there are Np MPI tasks per physical node, you generally want Np*Nt = +the number of physical cores per node, to use your available hardware +optimally. This also sets the number of threads used by the host when +LAMMPS is compiled with CUDA=yes. +</P> +<PRE>numa Nm +</PRE> +<P>This option is only relevant when using pthreads with hwloc support. +In this case Nm defines the number of NUMA regions (typically sockets) +on a node which will be utilized by a single MPI rank. By default +Nm = 1. If this option is used the total number of worker-threads per +MPI rank is threads*numa. Currently it is almost always better to +assign at least one MPI rank per NUMA region, and leave numa set to +its default value of 1. This is because letting a single process span +multiple NUMA regions induces a significant amount of cross-NUMA data +traffic which is slow. +</P> <PRE>-log file </PRE> <P>Specify a log file for LAMMPS to write status information to. In one-partition mode, if the switch is not used, LAMMPS writes to the file log.lammps. If this switch is used, LAMMPS writes to the specified file. In multi-partition mode, if the switch is not used, a log.lammps file is created with hi-level status information. Each partition also writes to a log.lammps.N file where N is the partition ID. If the switch is specified in multi-partition mode, the hi-level logfile is named "file" and each partition also logs information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no log files are created. Using a <A HREF = "log.html">log</A> command in the input script will override this setting. Option -plog will override the name of the partition log files file.N. </P> <PRE>-nocite </PRE> <P>Disable writing the log.cite file which is normally written to list references for specific cite-able features used during a LAMMPS run. See the <A HREF = "http://lammps.sandia.gov/cite.html">citation page</A> for more details. </P> <PRE>-partition 8x2 4 5 ... </PRE> <P>Invoke LAMMPS in multi-partition mode. When LAMMPS is run on P processors and this switch is not used, LAMMPS runs in one partition, i.e. all P processors run a single simulation. If this switch is used, the P processors are split into separate partitions and each partition runs its own simulation. The arguments to the switch specify the number of processors in each partition. Arguments of the form MxN mean M partitions, each with N processors. Arguments of the form N mean a single partition with N processors. The sum of processors in all partitions must equal P. Thus the command "-partition 8x2 4 5" has 10 partitions and runs on a total of 25 processors.
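</P> <P>For example, the 10-partition case just described might be launched like this (an illustrative command line; substitute your own executable name and input script): </P> <PRE>mpirun -np 25 lmp_linux -partition 8x2 4 5 -in in.file </PRE>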
<P>Running with multiple partitions can be useful for running <A HREF = "Section_howto.html#howto_5">multi-replica simulations</A>, where each replica runs on one or a few processors. Note that with MPI installed on a machine (e.g. your desktop), you can run on more (virtual) processors than you have physical processors. </P> <P>To run multiple independent simulations from one input script, using multiple partitions, see <A HREF = "Section_howto.html#howto_4">Section_howto 4</A> of the manual. World- and universe-style <A HREF = "variable.html">variables</A> are useful in this context. </P> <PRE>-plog file </PRE> <P>Specify the base name for the partition log files, so partition N writes log information to file.N. If file is none, then no partition log files are created. This overrides the filename specified in the -log command-line option. This option is useful when working with large numbers of partitions, allowing the partition log files to be suppressed (-plog none) or placed in a sub-directory (-plog replica_files/log.lammps). If this option is not used the log file for partition N is log.lammps.N or whatever is specified by the -log command-line option. </P> <PRE>-pscreen file </PRE> <P>Specify the base name for the partition screen file, so partition N writes screen information to file.N. If file is none, then no partition screen files are created. This overrides the filename specified in the -screen command-line option. This option is useful when working with large numbers of partitions, allowing the partition screen files to be suppressed (-pscreen none) or placed in a sub-directory (-pscreen replica_files/screen). If this option is not used the screen file for partition N is screen.N or whatever is specified by the -screen command-line option. </P> <PRE>-restart restartfile datafile keyword value ... </PRE> <P>Convert the restart file into a data file and immediately exit. This is the same operation as if the following 2-line input script were run: </P> <PRE>read_restart restartfile write_data datafile keyword value ... </PRE> <P>Note that the specified restartfile and datafile can have wild-card characters ("*" or "%") as described by the <A HREF = "read_restart.html">read_restart</A> and <A HREF = "write_data.html">write_data</A> commands. But a filename such as file.* will need to be enclosed in quotes to avoid shell expansion of the "*" character. </P> <P>Also note that following datafile, the same optional keyword/value pairs can be listed as used by the <A HREF = "write_data.html">write_data</A> command. </P> <PRE>-reorder nth N -reorder custom filename </PRE> <P>Reorder the processors in the MPI communicator used to instantiate LAMMPS, in one of several ways. The original MPI communicator ranks all P processors from 0 to P-1. The mapping of these ranks to physical processors is done by MPI before LAMMPS begins. It may be useful in some cases to alter the rank order. E.g. to insure that cores within each node are ranked in a desired order. Or when using the <A HREF = "run_style.html">run_style verlet/split</A> command with 2 partitions to insure that a specific Kspace processor (in the 2nd partition) is matched up with a specific set of processors in the 1st partition. See the <A HREF = "Section_accelerate.html">Section_accelerate</A> doc pages for more details. </P> <P>If the keyword <I>nth</I> is used with a setting <I>N</I>, then it means every Nth processor will be moved to the end of the ranking.
This is useful when using the <A HREF = "run_style.html">run_style verlet/split</A> command with 2 partitions via the -partition command-line switch. The first set of processors will be in the first partition, the 2nd set in the 2nd partition. The -reorder command-line switch can alter this so that the 1st N procs in the 1st partition and one proc in the 2nd partition will be ordered consecutively, e.g. as the cores on one physical node. This can boost performance. For example, if you use "-reorder nth 4" and "-partition 9 3" and you are running on 12 processors, the processors will be reordered from </P> <PRE>0 1 2 3 4 5 6 7 8 9 10 11 </PRE> <P>to </P> <PRE>0 1 2 4 5 6 8 9 10 3 7 11 </PRE> <P>so that the processors in each partition will be </P> <PRE>0 1 2 4 5 6 8 9 10 3 7 11 </PRE> <P>See the "processors" command for how to insure processors from each partition could then be grouped optimally for quad-core nodes. </P> <P>If the keyword is <I>custom</I>, then a file that specifies a permutation of the processor ranks is also specified. The format of the reorder file is as follows. Any number of initial blank or comment lines (starting with a "#" character) can be present. These should be followed by P lines of the form: </P> <PRE>I J </PRE> <P>where P is the number of processors LAMMPS was launched with. Note that if running in multi-partition mode (see the -partition switch above) P is the total number of processors in all partitions. The I and J values describe a permutation of the P processors. Every I and J should be values from 0 to P-1 inclusive. In the set of P I values, every proc ID should appear exactly once. Ditto for the set of P J values. A single I,J pairing means that the physical processor with rank I in the original MPI communicator will have rank J in the reordered communicator. </P> <P>Note that rank ordering can also be specified by many MPI implementations, either by environment variables that specify how to order physical processors, or by config files that specify what physical processors to assign to each MPI rank. The -reorder switch simply gives you a portable way to do this without relying on MPI itself. See the <A HREF = "processors">processors out</A> command for how to output info on the final assignment of physical processors to the LAMMPS simulation domain. </P> <PRE>-screen file </PRE> <P>Specify a file for LAMMPS to write its screen information to. In one-partition mode, if the switch is not used, LAMMPS writes to the screen. If this switch is used, LAMMPS writes to the specified file instead and you will see no screen output. In multi-partition mode, if the switch is not used, hi-level status information is written to the screen. Each partition also writes to a screen.N file where N is the partition ID. If the switch is specified in multi-partition mode, the hi-level screen dump is named "file" and each partition also writes screen information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no screen output is performed. Option -pscreen will override the name of the partition screen files file.N. </P> -<PRE>-suffix style +<PRE>-suffix style args </PRE> <P>Use variants of various styles if they exist. The specified style can -be <I>opt</I>, <I>omp</I>, <I>gpu</I>, or <I>cuda</I>. These refer to optional packages that -LAMMPS can be built with, as described above in <A HREF = "#start_3">Section -2.3</A>. 
The "opt" style corrsponds to the OPT package, the -"omp" style to the USER-OMP package, the "gpu" style to the GPU -package, and the "cuda" style to the USER-CUDA package. +be <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I>. These refer to optional +packages that LAMMPS can be built with, as described above in <A HREF = "#start_3">Section +2.3</A>. The "cuda" style corresponds to the USER-CUDA package, +the "gpu" style to the GPU package, the "kk" style to the KOKKOS +pacakge, the "opt" style to the OPT package, and the "omp" style to +the USER-OMP package. </P> <P>As an example, all of the packages provide a <A HREF = "pair_lj.html">pair_style -lj/cut</A> variant, with style names lj/cut/opt, lj/cut/omp, -lj/cut/gpu, or lj/cut/cuda. A variant styles can be specified -explicitly in your input script, e.g. pair_style lj/cut/gpu. If the --suffix switch is used, you do not need to modify your input script. -The specified suffix (opt,omp,gpu,cuda) is automatically appended -whenever your input script command creates a new -<A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>, +lj/cut</A> variant, with style names lj/cut/cuda, +lj/cut/gpu, lj/cut/kk, lj/cut/omp, or lj/cut/opt. A variant styles +can be specified explicitly in your input script, e.g. pair_style +lj/cut/gpu. If the -suffix switch is used, you do not need to modify +your input script. The specified suffix (cuda,gpu,kk,omp,opt) is +automatically appended whenever your input script command creates a +new <A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or <A HREF = "run_style.html">run</A> style. If the variant version does not exist, the standard version is created. </P> <P>For the GPU package, using this command-line switch also invokes the default GPU settings, as if the command "package gpu force/neigh 0 0 1" were used at the top of your input script. These settings can be changed by using the <A HREF = "package.html">package gpu</A> command in your script if desired. </P> +<P>For the KOKKOS package, using this command-line switch also invokes +the default KOKKOS settings, as if the command "package kokkos neigh +full comm/exchange host comm/forward host " were used at the top of +your input script. These settings can be changed by using the +<A HREF = "package.html">package kokkos</A> command in your script if desired. +</P> <P>For the OMP package, using this command-line switch also invokes the default OMP settings, as if the command "package omp *" were used at the top of your input script. These settings can be changed by using the <A HREF = "package.html">package omp</A> command in your script if desired. </P> -<P>The <A HREF = "suffix.html">suffix</A> command can also set a suffix and it can also -turn off/on any suffix setting made via the command line. +<P>The <A HREF = "suffix.html">suffix</A> command can also be used set a suffix and it +can also turn off or back on any suffix setting made via the command +line. </P> <PRE>-var name value1 value2 ... </PRE> <P>Specify a variable that will be defined for substitution purposes when the input script is read. "Name" is the variable name which can be a single character (referenced as $x in the input script) or a full string (referenced as ${abc}). An <A HREF = "variable.html">index-style variable</A> will be created and populated with the subsequent values, e.g. a set of filenames. 
Using this command-line option is equivalent to putting the line "variable name index value1 value2 ..." at the beginning of the input script. Defining an index variable as a command-line argument overrides any setting for the same index variable in the input script, since index variables cannot be re-defined. See the <A HREF = "variable.html">variable</A> command for more info on defining index and other kinds of variables and <A HREF = "Section_commands.html#cmd_2">this section</A> for more info on using variables in input scripts. </P> <P>NOTE: Currently, the command-line parser looks for arguments that start with "-" to indicate new switches. Thus you cannot specify multiple variable values if any of them start with a "-", e.g. a negative numeric value. It is OK if the first value1 starts with a "-", since it is automatically skipped. </P> <HR> <H4><A NAME = "start_8"></A>2.8 LAMMPS screen output </H4> <P>As LAMMPS reads an input script, it prints information to both the screen and a log file about significant actions it takes to set up a simulation. When the simulation is ready to begin, LAMMPS performs various initializations and prints the amount of memory (in MBytes per processor) that the simulation requires. It also prints details of the initial thermodynamic state of the system. During the run itself, thermodynamic information is printed periodically, every few timesteps. When the run concludes, LAMMPS prints the final thermodynamic state and a total run time for the simulation. It then appends statistics about the CPU time and storage requirements for the simulation. An example set of statistics is shown here: </P> <PRE>Loop time of 49.002 on 2 procs for 2004 atoms </PRE> <PRE>Pair time (%) = 35.0495 (71.5267) Bond time (%) = 0.092046 (0.187841) Kspce time (%) = 6.42073 (13.103) Neigh time (%) = 2.73485 (5.5811) Comm time (%) = 1.50291 (3.06703) Outpt time (%) = 0.013799 (0.0281601) Other time (%) = 2.13669 (4.36041) </PRE> <PRE>Nlocal: 1002 ave, 1015 max, 989 min Histogram: 1 0 0 0 0 0 0 0 0 1 Nghost: 8720 ave, 8724 max, 8716 min Histogram: 1 0 0 0 0 0 0 0 0 1 Neighs: 354141 ave, 361422 max, 346860 min Histogram: 1 0 0 0 0 0 0 0 0 1 </PRE> <PRE>Total # of neighbors = 708282 Ave neighs/atom = 353.434 Ave special neighs/atom = 2.34032 Number of reneighborings = 42 Dangerous reneighborings = 2 </PRE> <P>The first section gives the breakdown of the CPU run time (in seconds) into major categories. The second section lists the number of owned atoms (Nlocal), ghost atoms (Nghost), and pair-wise neighbors stored per processor. The max and min values give the spread of these values across processors with a 10-bin histogram showing the distribution. The total number of histogram counts is equal to the number of processors. </P> <P>The last section gives aggregate statistics for pair-wise neighbors and special neighbors that LAMMPS keeps track of (see the <A HREF = "special_bonds.html">special_bonds</A> command). The number of times neighbor lists were rebuilt during the run is given as well as the number of potentially "dangerous" rebuilds. If atom movement triggered neighbor list rebuilding (see the <A HREF = "neigh_modify.html">neigh_modify</A> command), then dangerous reneighborings are those that were triggered on the first timestep atom movement was checked for. If this count is non-zero you may wish to reduce the delay factor to insure no force interactions are missed by atoms moving beyond the neighbor skin distance before a rebuild takes place.
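For example, a more conservative neighbor-list setting that reduces the delay factor to zero (an illustrative choice; see the <A HREF = "neigh_modify.html">neigh_modify</A> doc page for the keywords and their defaults) is </P> <PRE>neigh_modify delay 0 every 1 check yes </PRE> <P>which checks for a needed rebuild every timestep and rebuilds as soon as the check triggers, at the cost of some extra overhead.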
</P> <P>If an energy minimization was performed via the <A HREF = "minimize.html">minimize</A> command, additional information is printed, e.g. </P> <PRE>Minimization stats: E initial, next-to-last, final = -0.895962 -2.94193 -2.94342 Gradient 2-norm init/final= 1920.78 20.9992 Gradient inf-norm init/final= 304.283 9.61216 Iterations = 36 Force evaluations = 177 </PRE> <P>The first line lists the initial and final energy, as well as the energy on the next-to-last iteration. The next 2 lines give a measure of the gradient of the energy (force on all atoms). The 2-norm is the "length" of this force vector; the inf-norm is the largest component. The last 2 lines are statistics on how many iterations and force-evaluations the minimizer required. Multiple force evaluations are typically done at each iteration to perform a 1d line minimization in the search direction. </P> <P>If a <A HREF = "kspace_style.html">kspace_style</A> long-range Coulombics solve was performed during the run (PPPM, Ewald), then additional information is printed, e.g. </P> <PRE>FFT time (% of Kspce) = 0.200313 (8.34477) FFT Gflps 3d 1d-only = 2.31074 9.19989 </PRE> <P>The first line gives the time spent doing 3d FFTs (4 per timestep) and the fraction it represents of the total KSpace time (listed above). Each 3d FFT requires computation (3 sets of 1d FFTs) and communication (transposes). The total flops performed is 5Nlog_2(N), where N is the number of points in the 3d grid. The FFTs are timed with and without the communication and a Gflop rate is computed. The 3d rate is with communication; the 1d rate is without (just the 1d FFTs). Thus you can estimate what fraction of your FFT time was spent in communication, roughly 75% in the example above. </P> <HR> <H4><A NAME = "start_9"></A>2.9 Tips for users of previous LAMMPS versions </H4> <P>The current C++ began with a complete rewrite of LAMMPS 2001, which was written in F90. Features of earlier versions of LAMMPS are listed in <A HREF = "Section_history.html">Section_history</A>. The F90 and F77 versions (2001 and 99) are also freely distributed as open-source codes; check the <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> for distribution information if you prefer those versions. The 99 and 2001 versions are no longer under active development; they do not have all the features of C++ LAMMPS. </P> <P>If you are a previous user of LAMMPS 2001, these are the most significant changes you will notice in C++ LAMMPS: </P> <P>(1) The names and arguments of many input script commands have changed. All commands are now a single word (e.g. read_data instead of read data). </P> <P>(2) All the functionality of LAMMPS 2001 is included in C++ LAMMPS, but you may need to specify the relevant commands in different ways. </P> <P>(3) The format of the data file can be streamlined for some problems. See the <A HREF = "read_data.html">read_data</A> command for details. The data file section "Nonbond Coeff" has been renamed to "Pair Coeff" in C++ LAMMPS. </P> <P>(4) Binary restart files written by LAMMPS 2001 cannot be read by C++ LAMMPS with a <A HREF = "read_restart.html">read_restart</A> command. This is because they were output by F90 which writes in a different binary format than C or C++ writes or reads. Use the <I>restart2data</I> tool provided with LAMMPS 2001 to convert the 2001 restart file to a text data file. Then edit the data file as necessary before using the C++ LAMMPS <A HREF = "read_data.html">read_data</A> command to read it in. 
</P> <P>(5) There are numerous small numerical changes in C++ LAMMPS that mean you will not get identical answers when comparing to a 2001 run. However, your initial thermodynamic energy and MD trajectory should be close if you have setup the problem for both codes the same. </P> </HTML> diff --git a/doc/Section_start.txt b/doc/Section_start.txt index c126503b1..2e8d78680 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -1,1465 +1,1658 @@ "Previous Section"_Section_intro.html - "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc - "Next Section"_Section_commands.html :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line 2. Getting Started :h3 This section describes how to build and run LAMMPS, for both new and experienced users. 2.1 "What's in the LAMMPS distribution"_#start_1 2.2 "Making LAMMPS"_#start_2 2.3 "Making LAMMPS with optional packages"_#start_3 2.4 "Building LAMMPS via the Make.py script"_#start_4 2.5 "Building LAMMPS as a library"_#start_5 2.6 "Running LAMMPS"_#start_6 2.7 "Command-line options"_#start_7 2.8 "Screen output"_#start_8 2.9 "Tips for users of previous versions"_#start_9 :all(b) :line :line 2.1 What's in the LAMMPS distribution :h4,link(start_1) When you download LAMMPS you will need to unzip and untar the downloaded file with the following commands, after placing the file in an appropriate directory. gunzip lammps*.tar.gz tar xvf lammps*.tar :pre This will create a LAMMPS directory containing two files and several sub-directories: README: text file LICENSE: the GNU General Public License (GPL) bench: benchmark problems doc: documentation examples: simple test problems potentials: embedded atom method (EAM) potential files src: source files tools: pre- and post-processing tools :tb(s=:) If you download one of the Windows executables from the download page, then you get a single file: lmp_windows.exe :pre Skip to the "Running LAMMPS"_#start_6 sections for info on how to launch these executables on a Windows box. The Windows executables for serial or parallel only include certain packages and bug-fixes/upgrades listed on "this page"_http://lammps.sandia.gov/bug.html up to a certain date, as stated on the download page. If you want something with more packages or that is more current, you'll have to download the source tarball and build it yourself from source code using Microsoft Visual Studio, as described in the next section. :line 2.2 Making LAMMPS :h4,link(start_2) This section has the following sub-sections: "Read this first"_#start_2_1 "Steps to build a LAMMPS executable"_#start_2_2 "Common errors that can occur when making LAMMPS"_#start_2_3 "Additional build tips"_#start_2_4 "Building for a Mac"_#start_2_5 "Building for Windows"_#start_2_6 :ul :line [{Read this first:}] :link(start_2_1) Building LAMMPS can be non-trivial. You may need to edit a makefile, there are compiler options to consider, additional libraries can be used (MPI, FFT, JPEG, PNG), LAMMPS packages may be included or excluded, some of these packages use auxiliary libraries which need to be pre-built, etc. Please read this section carefully. If you are not comfortable with makefiles, or building codes on a Unix platform, or running an MPI job on your machine, please find a local expert to help you. Many compiling, linking, and run problems that users have are often not LAMMPS issues - they are peculiar to the user's system, compilers, libraries, etc. Such questions are better answered by a local expert. 
If you have a build problem that you are convinced is a LAMMPS issue (e.g. the compiler complains about a line of LAMMPS source code), then please post a question to the "LAMMPS mail list"_http://lammps.sandia.gov/mail.html. If you succeed in building LAMMPS on a new kind of machine, for which there isn't a similar Makefile for in the src/MAKE directory, send it to the developers and we can include it in the LAMMPS distribution. :line [{Steps to build a LAMMPS executable:}] :link(start_2_2) [Step 0] The src directory contains the C++ source and header files for LAMMPS. It also contains a top-level Makefile and a MAKE sub-directory with low-level Makefile.* files for many machines. From within the src directory, type "make" or "gmake". You should see a list of available choices. If one of those is the machine and options you want, you can type a command like: make linux or gmake mac :pre Note that on a multi-processor or multi-core platform you can launch a parallel make, by using the "-j" switch with the make command, which will build LAMMPS more quickly. If you get no errors and an executable like lmp_linux or lmp_mac is produced, you're done; it's your lucky day. Note that by default only a few of LAMMPS optional packages are installed. To build LAMMPS with optional packages, see "this section"_#start_3 below. [Step 1] If Step 0 did not work, you will need to create a low-level Makefile for your machine, like Makefile.foo. You should make a copy of an existing src/MAKE/Makefile.* as a starting point. The only portions of the file you need to edit are the first line, the "compiler/linker settings" section, and the "LAMMPS-specific settings" section. [Step 2] Change the first line of src/MAKE/Makefile.foo to list the word "foo" after the "#", and whatever other options it will set. This is the line you will see if you just type "make". [Step 3] The "compiler/linker settings" section lists compiler and linker settings for your C++ compiler, including optimization flags. You can use g++, the open-source GNU compiler, which is available on all Unix systems. You can also use mpicc which will typically be available if MPI is installed on your system, though you should check which actual compiler it wraps. Vendor compilers often produce faster code. On boxes with Intel CPUs, we suggest using the commercial Intel icc compiler, which can be downloaded from "Intel's compiler site"_intel. :link(intel,http://www.intel.com/software/products/noncom) If building a C++ code on your machine requires additional libraries, then you should list them as part of the LIB variable. The DEPFLAGS setting is what triggers the C++ compiler to create a dependency list for a source file. This speeds re-compilation when source (*.cpp) or header (*.h) files are edited. Some compilers do not support dependency file creation, or may use a different switch than -D. GNU g++ works with -D. If your compiler can't create dependency files, then you'll need to create a Makefile.foo patterned after Makefile.storm, which uses different rules that do not involve dependency files. Note that when you build LAMMPS for the first time on a new platform, a long list of *.d files will be printed out rapidly. This is not an error; it is the Makefile doing its normal creation of dependencies. [Step 4] The "system-specific settings" section has several parts. Note that if you change any -D setting in this section, you should do a full re-compile, after typing "make clean" (which will describe different clean options). 
The LMP_INC variable is used to include options that turn on ifdefs within the LAMMPS code. The options that are currently recognized are: -DLAMMPS_GZIP -DLAMMPS_JPEG -DLAMMPS_PNG -DLAMMPS_FFMPEG -DLAMMPS_MEMALIGN -DLAMMPS_XDR -DLAMMPS_SMALLBIG -DLAMMPS_BIGBIG -DLAMMPS_SMALLSMALL -DLAMMPS_LONGLONG_TO_LONG -DPACK_ARRAY -DPACK_POINTER -DPACK_MEMCPY :ul The read_data and dump commands will read/write gzipped files if you compile with -DLAMMPS_GZIP. It requires that your machine supports the "popen" function in the standard runtime library and that a gzip executable can be found by LAMMPS during a run. If you use -DLAMMPS_JPEG, the "dump image"_dump_image.html command will be able to write out JPEG image files. For JPEG files, you must also link LAMMPS with a JPEG library, as described below. If you use -DLAMMPS_PNG, the "dump image"_dump.html command will be able to write out PNG image files. For PNG files, you must also link LAMMPS with a PNG library, as described below. If neither of those two defines are used, LAMMPS will only be able to write out uncompressed PPM image files. If you use -DLAMMPS_FFMPEG, the "dump movie"_dump_image.html command will be available to support on-the-fly generation of rendered movies without the need to store intermediate image files. It requires that your machine supports the "popen" function in the standard runtime library and that an FFmpeg executable can be found by LAMMPS during the run. Using -DLAMMPS_MEMALIGN=<bytes> enables the use of the posix_memalign() call instead of malloc() when large chunks of memory are allocated by LAMMPS. This can help to make more efficient use of vector instructions of modern CPUs, since dynamically allocated memory has to be aligned on larger than default byte boundaries (e.g. 16 bytes instead of 8 bytes on x86 type platforms) for optimal performance. If you use -DLAMMPS_XDR, the build will include XDR compatibility files for doing particle dumps in XTC format. This is only necessary if your platform does not have its own XDR files available. See the Restrictions section of the "dump"_dump.html command for details. Use at most one of the -DLAMMPS_SMALLBIG, -DLAMMPS_BIGBIG, -DLAMMPS_SMALLSMALL settings. The default is -DLAMMPS_SMALLBIG. These settings refer to use of 4-byte (small) vs 8-byte (big) integers within LAMMPS, as specified in src/lmptype.h. The only reason to use the BIGBIG setting is to enable simulation of huge molecular systems (which store bond topology info) with more than 2 billion atoms, or to track the image flags of moving atoms that wrap around a periodic box more than 512 times. The only reason to use the SMALLSMALL setting is if your machine does not support 64-bit integers. See the "Additional build tips"_#start_2_4 section below for more details. The -DLAMMPS_LONGLONG_TO_LONG setting may be needed if your system or MPI version does not recognize "long long" data types. In this case a "long" data type is likely already 64-bits, in which case this setting will convert to that data type. Using one of the -DPACK_ARRAY, -DPACK_POINTER, and -DPACK_MEMCPY options can make for faster parallel FFTs (in the PPPM solver) on some platforms. The -DPACK_ARRAY setting is the default. See the "kspace_style"_kspace_style.html command for info about PPPM. See Step 6 below for info about building LAMMPS with an FFT library. [Step 5] The 3 MPI variables are used to specify an MPI library to build LAMMPS with. If you want LAMMPS to run in parallel, you must have an MPI library installed on your platform.
If you use an MPI-wrapped compiler, such as "mpicc" to build LAMMPS, you should be able to leave these 3 variables blank; the MPI wrapper knows where to find the needed files. If not, and MPI is installed on your system in the usual place (under /usr/local), you also may not need to specify these 3 variables. On some large parallel machines which use "modules" for their compile/link environements, you may simply need to include the correct module in your build environment. Or the parallel machine may have a vendor-provided MPI which the compiler has no trouble finding. Failing this, with these 3 variables you can specify where the mpi.h file (MPI_INC) and the MPI library file (MPI_PATH) are found and the name of the library file (MPI_LIB). If you are installing MPI yourself, we recommend Argonne's MPICH2 or OpenMPI. MPICH can be downloaded from the "Argonne MPI site"_http://www.mcs.anl.gov/research/projects/mpich2/. OpenMPI can be downloaded from the "OpenMPI site"_http://www.open-mpi.org. Other MPI packages should also work. If you are running on a big parallel platform, your system people or the vendor should have already installed a version of MPI, which is likely to be faster than a self-installed MPICH or OpenMPI, so find out how to build and link with it. If you use MPICH or OpenMPI, you will have to configure and build it for your platform. The MPI configure script should have compiler options to enable you to use the same compiler you are using for the LAMMPS build, which can avoid problems that can arise when linking LAMMPS to the MPI library. If you just want to run LAMMPS on a single processor, you can use the dummy MPI library provided in src/STUBS, since you don't need a true MPI library installed on your system. See the src/MAKE/Makefile.serial file for how to specify the 3 MPI variables in this case. You will also need to build the STUBS library for your platform before making LAMMPS itself. To build from the src directory, type "make stubs", or from the STUBS dir, type "make". This should create a libmpi_stubs.a file suitable for linking to LAMMPS. If the build fails, you will need to edit the STUBS/Makefile for your platform. The file STUBS/mpi.c provides a CPU timer function called MPI_Wtime() that calls gettimeofday() . If your system doesn't support gettimeofday() , you'll need to insert code to call another timer. Note that the ANSI-standard function clock() rolls over after an hour or so, and is therefore insufficient for timing long LAMMPS simulations. [Step 6] The 3 FFT variables allow you to specify an FFT library which LAMMPS uses (for performing 1d FFTs) when running the particle-particle particle-mesh (PPPM) option for long-range Coulombics via the "kspace_style"_kspace_style.html command. LAMMPS supports various open-source or vendor-supplied FFT libraries for this purpose. If you leave these 3 variables blank, LAMMPS will use the open-source "KISS FFT library"_http://kissfft.sf.net, which is included in the LAMMPS distribution. This library is portable to all platforms and for typical LAMMPS simulations is almost as fast as FFTW or vendor optimized libraries. If you are not including the KSPACE package in your build, you can also leave the 3 variables blank. Otherwise, select which kinds of FFTs to use as part of the FFT_INC setting by a switch of the form -DFFT_XXX. Recommended values for XXX are: MKL, SCSL, FFTW2, and FFTW3. Legacy options are: INTEL, SGI, ACML, and T3E. For backward compatability, using -DFFT_FFTW will use the FFTW2 library. 
Using -DFFT_NONE will use the KISS library described above. You may also need to set the FFT_INC, FFT_PATH, and FFT_LIB variables, so the compiler and linker can find the needed FFT header and library files. Note that on some large parallel machines which use "modules" for their compile/link environments, you may simply need to include the correct module in your build environment. Or the parallel machine may have a vendor-provided FFT library which the compiler has no trouble finding. FFTW is a fast, portable library that should also work on any platform. You can download it from "www.fftw.org"_http://www.fftw.org. Both the legacy version 2.1.X and the newer 3.X versions are supported as -DFFT_FFTW2 or -DFFT_FFTW3. Building FFTW for your box should be as simple as ./configure; make. Note that on some platforms FFTW2 has been pre-installed, and uses renamed files indicating the precision it was compiled with, e.g. sfftw.h, or dfftw.h instead of fftw.h. In this case, you can specify an additional define variable for FFT_INC called -DFFTW_SIZE, which will select the correct include file. In this case, for FFT_LIB you must also manually specify the correct library, namely -lsfftw or -ldfftw. The FFT_INC variable also allows for a -DFFT_SINGLE setting that will use single-precision FFTs with PPPM, which can speed up long-range calculations, particularly in parallel or on GPUs. Fourier transform and related PPPM operations are somewhat insensitive to floating point truncation errors and thus do not always need to be performed in double precision. Using the -DFFT_SINGLE setting trades off a little accuracy for reduced memory use and parallel communication costs for transposing 3d FFT data. Note that single precision FFTs have only been tested with the FFTW3, FFTW2, MKL, and KISS FFT options. [Step 7] The 3 JPG variables allow you to specify a JPEG and/or PNG library which LAMMPS uses when writing out JPEG or PNG files via the "dump image"_dump_image.html command. These can be left blank if you do not use the -DLAMMPS_JPEG or -DLAMMPS_PNG switches discussed above in Step 4, since in that case JPEG/PNG output will be disabled. A standard JPEG library usually goes by the name libjpeg.a or libjpeg.so and has an associated header file jpeglib.h. Whichever JPEG library you have on your platform, you'll need to set the appropriate JPG_INC, JPG_PATH, and JPG_LIB variables, so that the compiler and linker can find it. A standard PNG library usually goes by the name libpng.a or libpng.so and has an associated header file png.h. Whichever PNG library you have on your platform, you'll need to set the appropriate JPG_INC, JPG_PATH, and JPG_LIB variables, so that the compiler and linker can find it. As before, if these header and library files are in the usual place on your machine, you may not need to set these variables. [Step 8] Note that by default only a few of LAMMPS optional packages are installed. To build LAMMPS with optional packages, see "this section"_#start_3 below, before proceeding to Step 9. [Step 9] That's it. Once you have a correct Makefile.foo, you have installed the optional LAMMPS packages you want to include in your build, and you have pre-built any other needed libraries (e.g. MPI, FFT, package libraries), all you need to do from the src directory is type something like this: make foo or gmake foo :pre You should get the executable lmp_foo when the build is complete.
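Putting Steps 4 through 7 together, here is a sketch of what the system-specific portion of a Makefile.foo might contain for a Linux box with MPICH, FFTW3, and JPEG support (the paths and library names below are only examples; use the locations where these packages are actually installed on your machine, or leave the MPI variables blank if you build with an MPI wrapper compiler like mpicc):

LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG
MPI_INC = -I/usr/local/include
MPI_PATH = -L/usr/local/lib
MPI_LIB = -lmpich
FFT_INC = -DFFT_FFTW3 -I/usr/local/include
FFT_PATH = -L/usr/local/lib
FFT_LIB = -lfftw3
JPG_INC = -I/usr/include
JPG_PATH = -L/usr/lib
JPG_LIB = -ljpeg :pre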
:line [{Errors that can occur when making LAMMPS:}] :link(start_2_3) IMPORTANT NOTE: If an error occurs when building LAMMPS, the compiler or linker will state very explicitly what the problem is. The error message should give you a hint as to which of the steps above has failed, and what you need to do in order to fix it. Building a code with a Makefile is a very logical process. The compiler and linker need to find the appropriate files and those files need to be compatible with LAMMPS source files. When a make fails, there is usually a very simple reason, which you or a local expert will need to fix. Here are two non-obvious errors that can occur: (1) If the make command breaks immediately with errors that indicate it can't find files with a "*" in their names, this can be because your machine's native make doesn't support wildcard expansion in a makefile. Try gmake instead of make. If that doesn't work, try using a -f switch with your make command to use a pre-generated Makefile.list which explicitly lists all the needed files, e.g. make makelist make -f Makefile.list linux gmake -f Makefile.list mac :pre The first "make" command will create a current Makefile.list with all the file names in your src dir. The 2nd "make" command (make or gmake) will use it to build LAMMPS. Note that you should include/exclude any desired optional packages before using the "make makelist" command. (2) If you get an error that says something like 'identifier "atoll" is undefined', then your machine does not support "long long" integers. Try using the -DLAMMPS_LONGLONG_TO_LONG setting described above in Step 4. :line [{Additional build tips:}] :link(start_2_4) (1) Building LAMMPS for multiple platforms. You can make LAMMPS for multiple platforms from the same src directory. Each target creates its own object sub-directory called Obj_target where it stores the system-specific *.o files. (2) Cleaning up. Typing "make clean-all" or "make clean-machine" will delete *.o object files created when LAMMPS is built, for either all builds or for a particular machine. (3) Changing the LAMMPS size limits via -DLAMMPS_SMALLBIG or -DLAMMPS_BIGBIG or -DLAMMPS_SMALLSMALL As explained above, any of these 3 settings can be specified on the LMP_INC line in your low-level src/MAKE/Makefile.foo. The default is -DLAMMPS_SMALLBIG which allows for systems with up to 2^63 atoms and 2^63 timesteps (about 9e18). The atom limit is for atomic systems which do not store bond topology info and thus do not require atom IDs. If you use atom IDs for atomic systems (which is the default) or if you use a molecular model, which stores bond topology info and thus requires atom IDs, the limit is 2^31 atoms (about 2 billion). This is because the IDs are stored in 32-bit integers. Likewise, with this setting, the 3 image flags for each atom (see the "dump"_dump.html doc page for a discussion) are stored in a 32-bit integer, which means the atoms can only wrap around a periodic box (in each dimension) at most 512 times. If atoms move through the periodic box more than this many times, the image flags will "roll over", e.g. from 511 to -512, which can cause diagnostics like the mean-squared displacement, as calculated by the "compute msd"_compute_msd.html command, to be faulty. To allow for larger atomic systems with atom IDs or larger molecular systems or larger image flags, compile with -DLAMMPS_BIGBIG. This stores atom IDs and image flags in 64-bit integers. This enables atomic or molecular systems with atom IDS of up to 2^63 atoms (about 9e18). 
And image flags will not "roll over" until they reach 2^20 = 1048576. If your system does not support 8-byte integers, you will need to compile with the -DLAMMPS_SMALLSMALL setting. This will restrict the total number of atoms (for atomic or molecular systems) and timesteps to 2^31 (about 2 billion). Image flags will roll over at 2^9 = 512. Note that in src/lmptype.h there are definitions of all these data types as well as the MPI data types associated with them. The MPI types need to be consistent with the associated C data types, or else LAMMPS will generate a run-time error. As far as we know, the settings defined in src/lmptype.h are portable and work on every current system. In all cases, the size of problem that can be run on a per-processor basis is limited by 4-byte integer storage to 2^31 atoms per processor (about 2 billion). This should not normally be a limitation since such a problem would have a huge per-processor memory footprint due to neighbor lists and would run very slowly in terms of CPU secs/timestep. :line [{Building for a Mac:}] :link(start_2_5) OS X is BSD Unix, so it should just work. See the src/MAKE/Makefile.mac file. :line [{Building for Windows:}] :link(start_2_6) The LAMMPS download page has an option to download both a serial and parallel pre-built Windows executable. See the "Running LAMMPS"_#start_6 section for instructions on running these executables on a Windows box. The pre-built executables hosted on the "LAMMPS download page"_http://lammps.sandia.gov/download.html are built with a subset of the available packages; see the download page for the list. These are single executable files. No examples or documentation is included. You will need to download the full source code package to obtain those. As an alternative, you can download "daily builds" (and some older versions) of the installer packages from "rpm.lammps.org/windows.html"_http://rpm.lammps.org/windows.html. These executables are built with most optional packages and the download includes documentation, some tools and most examples. If you want a Windows version with specific packages included and excluded, you can build it yourself. One way to do this is to install and use cygwin to build LAMMPS with a standard unix style make program, just as you would on a Linux box; see src/MAKE/Makefile.cygwin. The other way to do this is using Visual Studio and project files. See the src/WINDOWS directory and its README.txt file for instructions on both a basic build and a customized build with packages you select.
Every command's doc page specifies if it is part of a package. You can also type lmp_machine -h :pre to run your executable with the optional "-h command-line switch"_#start_7 for "help", which will list the styles and commands known to your executable. There are two kinds of packages in LAMMPS, standard and user packages. More information about the contents of standard and user packages is given in "Section_packages"_Section_packages.html of the manual. The difference between standard and user packages is as follows: Standard packages are supported by the LAMMPS developers and are written in a syntax and style consistent with the rest of LAMMPS. This means we will answer questions about them, debug and fix them if necessary, and keep them compatible with future changes to LAMMPS. User packages have been contributed by users, and always begin with the user prefix. If they are a single command (single file), they are typically in the user-misc package. Otherwise, they are a set of files grouped together which add a specific functionality to the code. User packages don't necessarily meet the requirements of the standard packages. If you have problems using a feature provided in a user package, you will likely need to contact the contributor directly to get help. Information on how to submit additions you make to LAMMPS as a user-contributed package is given in "this section"_Section_modify.html#mod_15 of the documentation. Some packages (both standard and user) require additional libraries. See more details below. :line [{Including/excluding packages:}] :link(start_3_2) To use or not use a package you must include or exclude it before building LAMMPS. From the src directory, this is typically as simple as: make yes-colloid make g++ :pre or make no-manybody make g++ :pre IMPORTANT NOTE: You should NOT include/exclude packages and build LAMMPS in a single make command by using multiple targets, e.g. make yes-colloid g++. This is because the make procedure creates a list of source files that will be out-of-date for the build if the package configuration changes during the same command. Some packages have individual files that depend on other packages being included. LAMMPS checks for this and does the right thing. I.e. individual files are only included if their dependencies are already included. Likewise, if a package is excluded, other files dependent on that package are also excluded. The reason to exclude packages is if you will never run certain kinds of simulations. For some packages, this will keep you from having to build auxiliary libraries (see below), and will also produce a smaller executable which may run a bit faster. When you download a LAMMPS tarball, these packages are pre-installed in the src directory: KSPACE, MANYBODY, MOLECULE. When you download LAMMPS source files from the SVN or Git repositories, no packages are pre-installed. Packages are included or excluded by typing "make yes-name" or "make no-name", where "name" is the name of the package in lower-case, e.g. name = kspace for the KSPACE package or name = user-atc for the USER-ATC package. You can also type "make yes-standard", "make no-standard", "make yes-user", "make no-user", "make yes-all" or "make no-all" to include/exclude various sets of packages. Type "make package" to see all of the package-related make options. IMPORTANT NOTE: Inclusion/exclusion of a package works by simply moving files back and forth between the main src directory and sub-directories with the package name (e.g.
src/KSPACE, src/USER-ATC), so that the files are seen or not seen when LAMMPS is built. After you have included or excluded a package, you must re-build LAMMPS. Additional package-related make options exist to help manage LAMMPS files that exist in both the src directory and in package sub-directories. You do not normally need to use these commands unless you are editing LAMMPS files or have downloaded a patch from the LAMMPS WWW site. Typing "make package-update" will overwrite src files with files from the package sub-directories if the package has been included. It should be used after a patch is installed, since patches only update the files in the package sub-directory, but not the src files. Typing "make package-overwrite" will overwrite files in the package sub-directories with src files. Typing "make package-status" will show which packages are currently included. Of those that are included, it will list files that are different in the src directory and package sub-directory. Typing "make package-diff" lists all differences between these files. Again, type "make package" to see all of the package-related make options. :line [{Packages that require extra libraries:}] :link(start_3_3) A few of the standard and user packages require additional auxiliary libraries. They must be compiled first, before LAMMPS is built. If you get a LAMMPS build error about a missing library, this is likely the reason. See the "Section_packages"_Section_packages.html doc page for a list of packages that have auxiliary libraries. Code for some of these auxiliary libraries is included in the LAMMPS distribution under the lib directory. Examples are the USER-ATC and -MEAM packages. Some auxiliary libraries are not included with LAMMPS; +MEAM packages. Some auxiliary libraries are NOT included with LAMMPS; to use the associated package you must download and install the auxiliary library yourself. Examples are the KIM and VORONOI and USER-MOLFILE packages. For libraries with provided source code, each lib directory has a README file (e.g. lib/reax/README) with instructions on how to build that library. Typically this is done by typing something like: make -f Makefile.g++ :pre -If one of the provided Makefiles is not -appropriate for your system you will need to edit or add one. -Note that all the Makefiles have a setting for EXTRAMAKE at -the top that names a Makefile.lammps.* file. +If one of the provided Makefiles is not appropriate for your system +you will need to edit or add one. Note that all the Makefiles have a +setting for EXTRAMAKE at the top that specifies a Makefile.lammps.* +file. -If successful, this will produce 2 files in the lib directory: +If the library build is successful, it will produce 2 files in the lib +directory: libpackage.a Makefile.lammps :pre -The Makefile.lammps file is a copy of the EXTRAMAKE file specified -in the Makefile you used. +The Makefile.lammps file will be a copy of the EXTRAMAKE file setting +specified in the library Makefile.* you used. -You MUST insure that the settings in Makefile.lammps are appropriate -for your system. If they are not, the LAMMPS build will fail. +Note that you must insure that the settings in Makefile.lammps are +appropriate for your system. If they are not, the LAMMPS build will +fail. -As explained in the lib/package/README files, they are used to specify -additional system libraries and their locations so that LAMMPS can -build with the auxiliary library. 
For example, if the MEAM or REAX -packages are used, the auxiliary libraries consist of F90 code, build -with a F90 complier. To link that library with LAMMPS (a C++ code) -via whatever C++ compiler LAMMPS is built with, typically requires -additional Fortran-to-C libraries be included in the link. Another -example are the BLAS and LAPACK libraries needed to use the USER-ATC -or USER-AWPMD packages. +As explained in the lib/package/README files, the settings in +Makefile.lammps are used to specify additional system libraries and +their locations so that LAMMPS can build with the auxiliary library. +For example, if the MEAM or REAX packages are used, the auxiliary +libraries consist of F90 code, built with a Fortran compiler. To link +that library with LAMMPS (a C++ code) via whatever C++ compiler LAMMPS +is built with, typically requires additional Fortran-to-C libraries be +included in the link. Other examples are the BLAS and LAPACK +libraries needed to use the USER-ATC or USER-AWPMD packages. For libraries without provided source code, see the src/package/Makefile.lammps file for information on where to find the library and how to build it. E.g. the file src/KIM/Makefile.lammps or src/VORONOI/Makefile.lammps or src/USER-MOLFILE/Makefile.lammps. These files serve the same purpose as the lib/package/Makefile.lammps files described above. The files have settings needed when LAMMPS is -built to link with the corresponding auxiliary library. Again, you -MUST insure that the settings in src/package/Makefile.lammps are -appropriate for your system and where you installed the auxiliary -library. If they are not, the LAMMPS build will fail. +built to link with the corresponding auxiliary library. + +Again, you must insure that the settings in +src/package/Makefile.lammps are appropriate for your system and where +you installed the auxiliary library. If they are not, the LAMMPS +build will fail. + +:line + +[{Packages that use make variable settings}] :link(start_3_4) + +One package, the KOKKOS package, allows its build options to be +specified by setting variables via the "make" command, rather than by +first building an auxiliary library and editing a Makefile.lammps +file, as discussed in the previous sub-section for other packages. +This is for convenience since it is common to want to experiment with +different Kokkos library options. Using variables enables a direct +re-build of LAMMPS and its Kokkos dependencies, so that a benchmark +test with different Kokkos options can be quickly performed. + +The syntax for setting make variables is as follows. You must +use a GNU-compatible make command for this to work. Try "gmake" +if your system's standard make complains. + +make yes-kokkos +make g++ VAR1=value VAR2=value ... :pre + +The first line installs the KOKKOS package, which only needs to be +done once. The second line builds LAMMPS with src/MAKE/Makefile.g++ +and optionally sets one or more variables that affect the build. Each +variable is specified in upper-case; its value follows an equal sign +with no spaces. The second line can be repeated with different +variable settings, though a "clean" must be done before the rebuild. +Type "make clean" to see options for this operation. + +These are the variables that can be specified. Each takes a value of +{yes} or {no}. The default value is listed, which is set in the +lib/kokkos/Makefile.lammps file. See "this +section"_Section_accelerate.html#acc_8 for a discussion of what is +meant by "host" and "device" in the Kokkos context.
+ +OMP, default = {yes} +CUDA, default = {no} +HWLOC, default = {no} +AVX, default = {no} +MIC, default = {no} +LIBRT, default = {no} +DEBUG, default = {no} :ul + +OMP sets the parallelization method used for Kokkos code (within +LAMMPS) that runs on the host. OMP=yes means that OpenMP will be +used. OMP=no means that pthreads will be used. + +CUDA sets the parallelization method used for Kokkos code (within +LAMMPS) that runs on the device. CUDA=yes means an NVIDIA GPU running +CUDA will be used. CUDA=no means that the OMP=yes or OMP=no setting +will be used for the device as well as the host. + +If CUDA=yes, then the lo-level Makefile in the src/MAKE directory must +use "nvcc" as its compiler, via its CC setting. For best performance +its CCFLAGS setting should use -O3 and have an -arch setting that +matches the compute capability of your NVIDIA hardware and software +installation, e.g. -arch=sm_20. Generally Fermi generation GPUs are +sm_20, while Kepler generation GPUs are sm_30 or sm_35 and Maxwell +cards are sm_50. A complete list can be found on +"wikipedia"_http://en.wikipedia.org/wiki/CUDA#Supported_GPUs. You can +also use the deviceQuery tool that comes with the CUDA samples. Note +the minimum required compute capability is 2.0, but this will give +significantly reduced performance compared to Kepler generation GPUs +with compute capability 3.x. For the LINK setting, "nvcc" should not +be used; instead use g++ or another compiler suitable for linking C++ +applications. Often you will want to use your MPI compiler wrapper +for this setting (i.e. mpicxx). Finally, the lo-level Makefile must +also have a "Compilation rule" for creating *.o files from *.cu files. +See src/Makefile.cuda for an example of a lo-level Makefile with all +of these settings. + +HWLOC binds threads to hardware cores, so they do not migrate during a +simulation. HWLOC=yes should always be used if running with OMP=no +for pthreads. It is not necessary for OMP=yes for OpenMP, because +OpenMP provides alternative methods via environment variables for +binding threads to hardware cores. More info on binding threads to +cores is given in "this section"_Section_accelerate.html#acc_8. + +AVX enables Intel advanced vector extensions when compiling for an +Intel-compatible chip. AVX=yes should only be set if your host +hardware supports AVX. If it does not support it, this will cause a +run-time crash. + +MIC enables compiler switches needed when compiling for an Intel Phi +processor. + +LIBRT enables use of a more accurate timer mechanism on most Unix +platforms. This library is not available on all platforms. + +DEBUG is only useful when developing a Kokkos-enabled style within +LAMMPS. DEBUG=yes enables printing of run-time debugging information +that can be useful. It also enables runtime bounds checking on Kokkos +data structures. :line 2.4 Building LAMMPS via the Make.py script :h4,link(start_4) The src directory includes a Make.py script, written in Python, which can be used to automate various steps of the build process. You can run the script from the src directory by typing either: Make.py python Make.py :pre which will give you info about the tool. For the former to work, you may need to edit the 1st line of the script to point to your local Python. And you may need to insure the script is executable: chmod +x Make.py :pre The following options are supported as switches: -i file1 file2 ... -p package1 package2 ... -u package1 package2 ... -e package1 arg1 arg2 package2 ...
-o dir -b machine -s suffix1 suffix2 ... -l dir -j N -h switch1 switch2 ... :ul Help on any switch can be listed by using -h, e.g. Make.py -h -i -p :pre At a hi-level, these are the kinds of package management and build tasks that can be performed easily, using the Make.py tool: install/uninstall packages and build the associated external libs (use -p and -u and -e) install packages needed for one or more input scripts (use -i and -p) build LAMMPS, either in the src dir or new dir (use -b) create a new dir with only the source code needed for one or more input scripts (use -i and -o) :ul The last bullet can be useful when you wish to build a stripped-down version of LAMMPS to run a specific script(s). Or when you wish to move the minimal amount of files to another platform for a remote LAMMPS build. Note that using Make.py is not a substitute for insuring you have a valid src/MAKE/Makefile.foo for your system, or that external library Makefiles in any lib/* directories you use are also valid for your system. But once you have done that, you can use Make.py to quickly include/exclude the packages and external libraries needed by your input scripts. :line 2.5 Building LAMMPS as a library :h4,link(start_5) LAMMPS can be built as either a static or shared library, which can then be called from another application or a scripting language. See "this section"_Section_howto.html#howto_10 for more info on coupling LAMMPS to other codes. See "this section"_Section_python.html for more info on wrapping and running LAMMPS from Python. [Static library:] :h5 To build LAMMPS as a static library (*.a file on Linux), type make makelib make -f Makefile.lib foo :pre where foo is the machine name. This kind of library is typically used to statically link a driver application to LAMMPS, so that you can insure all dependencies are satisfied at compile time. Note that inclusion or exclusion of any desired optional packages should be done before typing "make makelib". The first "make" command will create a current Makefile.lib with all the file names in your src dir. The second "make" command will use it to build LAMMPS as a static library, using the ARCHIVE and ARFLAGS settings in src/MAKE/Makefile.foo. The build will create the file liblammps_foo.a which another application can link to. [Shared library:] :h5 To build LAMMPS as a shared library (*.so file on Linux), which can be dynamically loaded, e.g. from Python, type make makeshlib make -f Makefile.shlib foo :pre where foo is the machine name. This kind of library is required when wrapping LAMMPS with Python; see "Section_python"_Section_python.html for details. Again, note that inclusion or exclusion of any desired optional packages should be done before typing "make makelib". The first "make" command will create a current Makefile.shlib with all the file names in your src dir. The second "make" command will use it to build LAMMPS as a shared library, using the SHFLAGS and SHLIBFLAGS settings in src/MAKE/Makefile.foo. The build will create the file liblammps_foo.so which another application can link to dyamically. It will also create a soft link liblammps.so, which the Python wrapper uses by default. Note that for a shared library to be usable by a calling program, all the auxiliary libraries it depends on must also exist as shared libraries. This will be the case for libraries included with LAMMPS, such as the dummy MPI library in src/STUBS or any package libraries in lib/packges, since they are always built as shared libraries with the -fPIC switch. 
However, if a library like MPI or FFTW does not exist as a shared library, the second make command will generate an error. This means you will need to install a shared library version of the package. The build instructions for the library should tell you how to do this. As an example, here is how to build and install the "MPICH library"_mpich, a popular open-source version of MPI, distributed by Argonne National Labs, as a shared library in the default /usr/local/lib location: :link(mpich,http://www-unix.mcs.anl.gov/mpi) ./configure --enable-shared make make install :pre You may need to use "sudo make install" in place of the last line if you do not have write privileges for /usr/local/lib. The end result should be the file /usr/local/lib/libmpich.so. [Additional requirement for using a shared library:] :h5 The operating system finds shared libraries to load at run-time using the environment variable LD_LIBRARY_PATH. So you may wish to copy the file src/liblammps.so or src/liblammps_g++.so (for example) to a place the system can find it by default, such as /usr/local/lib, or you may wish to add the LAMMPS src directory to LD_LIBRARY_PATH, so that the current version of the shared library is always available to programs that use it. For the csh or tcsh shells, you would add something like this to your ~/.cshrc file: setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:/home/sjplimp/lammps/src :pre [Calling the LAMMPS library:] :h5 Either flavor of library (static or shared) allows one or more LAMMPS objects to be instantiated from the calling program. When used from a C++ program, all of LAMMPS is wrapped in a LAMMPS_NS namespace; you can safely use any of its classes and methods from within the calling code, as needed. When used from a C or Fortran program or a scripting language like Python, the library has a simple function-style interface, provided in src/library.cpp and src/library.h. See the sample codes in examples/COUPLE/simple for examples of C++ and C and Fortran codes that invoke LAMMPS thru its library interface. There are other examples as well in the COUPLE directory which are discussed in "Section_howto 10"_Section_howto.html#howto_10 of the manual. See "Section_python"_Section_python.html of the manual for a description of the Python wrapper provided with LAMMPS that operates through the LAMMPS library interface. The files src/library.cpp and library.h define the C-style API for using LAMMPS as a library. See "Section_howto 19"_Section_howto.html#howto_19 of the manual for a description of the interface and how to extend it for your needs. :line 2.6 Running LAMMPS :h4,link(start_6) By default, LAMMPS runs by reading commands from standard input. Thus if you run the LAMMPS executable by itself, e.g. lmp_linux :pre it will simply wait, expecting commands from the keyboard. Typically you should put commands in an input script and use I/O redirection, e.g. lmp_linux < in.file :pre For parallel environments this should also work. If it does not, use the '-in' command-line switch, e.g. lmp_linux -in in.file :pre "This section"_Section_commands.html describes how input scripts are structured and what commands they contain. You can test LAMMPS on any of the sample inputs provided in the examples or bench directory. Input scripts are named in.* and sample outputs are named log.*.name.P where name is a machine and P is the number of processors it was run on.
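As a quick serial test before timing anything, you can run one of the small problems in the examples directory; this sketch assumes an executable named lmp_linux built in src and uses the melt example from the distribution:

cd examples/melt
../../src/lmp_linux -in in.melt :pre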
Here is how you might run a standard Lennard-Jones benchmark on a Linux box, using mpirun to launch a parallel job: cd src make linux cp lmp_linux ../bench cd ../bench mpirun -np 4 lmp_linux -in in.lj :pre See "this page"_bench for timings for this and the other benchmarks on various platforms. Note that some of the example scripts require LAMMPS to be built with one or more of its optional packages. :link(bench,http://lammps.sandia.gov/bench.html) :line On a Windows box, you can skip making LAMMPS and simply download an executable, as described above, though the pre-packaged executables include only certain packages. To run a LAMMPS executable on a Windows machine, first decide whether you want to download the non-MPI (serial) or the MPI (parallel) version of the executable. Download and save the version you have chosen. For the non-MPI version, follow these steps: Get a command prompt by going to Start->Run... , then typing "cmd". :ulb,l Move to the directory where you have saved lmp_win_no-mpi.exe (e.g. by typing: cd "Documents"). :l At the command prompt, type "lmp_win_no-mpi -in in.lj", replacing in.lj with the name of your LAMMPS input script. :l,ule For the MPI version, which allows you to run LAMMPS under Windows on multiple processors, follow these steps: Download and install "MPICH2"_http://www.mcs.anl.gov/research/projects/mpich2/downloads/index.php?s=downloads for Windows. :ulb,l You'll need to use the mpiexec.exe and smpd.exe files from the MPICH2 package. Put them in same directory (or path) as the LAMMPS Windows executable. :l Get a command prompt by going to Start->Run... , then typing "cmd". :l Move to the directory where you have saved lmp_win_mpi.exe (e.g. by typing: cd "Documents"). :l Then type something like this: "mpiexec -localonly 4 lmp_win_mpi -in in.lj", replacing in.lj with the name of your LAMMPS input script. :l Note that you may need to provide smpd with a passphrase (it doesn't matter what you type). :l In this mode, output may not immediately show up on the screen, so if your input script takes a long time to execute, you may need to be patient before the output shows up. :l Alternatively, you can still use this executable to run on a single processor by typing something like: "lmp_win_mpi -in in.lj". :l,ule :line The screen output from LAMMPS is described in the next section. As it runs, LAMMPS also writes a log.lammps file with the same information. Note that this sequence of commands copies the LAMMPS executable (lmp_linux) to the directory with the input files. This may not be necessary, but some versions of MPI reset the working directory to where the executable is, rather than leave it as the directory where you launch mpirun from (if you launch lmp_linux on its own and not under mpirun). If that happens, LAMMPS will look for additional input files and write its output files to the executable directory, rather than your working directory, which is probably not what you want. If LAMMPS encounters errors in the input script or while running a simulation it will print an ERROR message and stop or a WARNING message and continue. See "Section_errors"_Section_errors.html for a discussion of the various kinds of errors LAMMPS can or can't detect, a list of all ERROR and WARNING messages, and what to do about them. LAMMPS can run a problem on any number of processors, including a single processor. In theory you should get identical answers on any number of processors and on any machine. 
In practice, numerical round-off can cause slight differences and eventual divergence of molecular dynamics phase space trajectories. LAMMPS can run as large a problem as will fit in the physical memory of one or more processors. If you run out of memory, you must run on more processors or setup a smaller problem. :line 2.7 Command-line options :h4,link(start_7) At run time, LAMMPS recognizes several optional command-line switches which may be used in any order. Either the full word or a one-or-two letter abbreviation can be used: -c or -cuda -e or -echo -i or -in -h or -help +-k or -kokkos -l or -log -nc or -nocite -p or -partition -pl or -plog -ps or -pscreen -r or -restart -ro or -reorder -sc or -screen -sf or -suffix -v or -var :ul For example, lmp_ibm might be launched as follows: mpirun -np 16 lmp_ibm -v f tmp.out -l my.log -sc none -in in.alloy mpirun -np 16 lmp_ibm -var f tmp.out -log my.log -screen none -in in.alloy :pre Here are the details on the options: -cuda on/off :pre Explicitly enable or disable CUDA support, as provided by the USER-CUDA package. If LAMMPS is built with this package, as described above in "Section 2.3"_#start_3, then by default LAMMPS will run in CUDA mode. If this switch is set to "off", then it will not, even if it was built with the USER-CUDA package, which means you can run standard LAMMPS or with the GPU package for testing or benchmarking purposes. The only reason to set the switch to "on", is to check if LAMMPS was built with the USER-CUDA package, since an error will be generated if it was not. -echo style :pre Set the style of command echoing. The style can be {none} or {screen} or {log} or {both}. Depending on the style, each command read from the input script will be echoed to the screen and/or logfile. This can be useful to figure out which line of your script is causing an input error. The default value is {log}. The echo style can also be set by using the "echo"_echo.html command in the input script itself. -in file :pre Specify a file to use as an input script. This is an optional switch when running LAMMPS in one-partition mode. If it is not specified, LAMMPS reads its script from standard input, typically from a script via I/O redirection; e.g. lmp_linux < in.run. I/O redirection should also work in parallel, but if it does not (in the unlikely case that an MPI implementation does not support it), then use the -in flag. Note that this is a required switch when running LAMMPS in multi-partition mode, since multiple processors cannot all read from stdin. -help :pre Print a brief help summary and a list of options compiled into this executable for each LAMMPS style (atom_style, fix, compute, pair_style, bond_style, etc). This can tell you if the command you want to use was included via the appropriate package at compile time. LAMMPS will print the info and immediately exit if this switch is used. +-kokkos on/off keyword/value ... :pre + +Explicitly enable or disable Kokkos support, as provided by the KOKKOS +package. If LAMMPS is built with this package, as described above in +"Section 2.3"_#start_3, then by default LAMMPS will run in Kokkos +mode. If this switch is set to "off", then it will not, even if it +was built with the KOKKOS package, which means you can run standard +LAMMPS styles or use styles enhanced by other acceleration packages, +such as the GPU or USER-CUDA or USER-OMP packages, for testing or +benchmarking purposes. 
The only reason to set the switch to "on", is +to check if LAMMPS was built with the KOKKOS package, since an error +will be generated if it was not. + +Additional optional keyword/value pairs can be specified which +determine how Kokkos will use the underlying hardware on your +platform. These settings apply to each MPI task you launch via the +"mpirun" or "mpiexec" command. You may choose to run one or more MPI +tasks per physical node. Note that if you are running on a desktop +machine, you typically have one physical node. On a cluster or +supercomputer there may be dozens or 1000s of physical nodes. + +Either the full word or an abbreviation can be used for the keywords. +Note that the keywords do not use a leading minus sign. I.e. the +keyword is "t", not "-t". Also note that each of the keywords has a +default setting. More explanation as to when to use these options and +what settings to use on different platforms is given in "this +section"_Section_accelerate.html#acc_8. + +d or device +g or gpus +t or threads +n or numa :ul + +device Nd :pre + +This option is only relevant if you built LAMMPS with CUDA=yes, you +have more than one GPU per node, and if you are running with only one +MPI task per node. The Nd setting is the ID of the GPU on the node to +run on. By default Nd = 0. If you have multiple GPUs per node, they +have consecutive IDs numbered as 0,1,2,etc. This setting allows you +to launch multiple independent jobs on the node, each with a single +MPI task per node, and assign each job to run on a different GPU. + +gpus Ng Ns :pre + +This option is only relevant if you built LAMMPS with CUDA=yes, you +have more than one GPU per node, and you are running with multiple MPI +tasks per node (up to one per GPU). The Ng setting is how many GPUs +you will use. The Ns setting is optional. If set, it is the ID of a +GPU to skip when assigning MPI tasks to GPUs. This may be useful if +your desktop system reserves one GPU to drive the screen and the rest +are intended for computational work like running LAMMPS. By default +Ng = 1 and Ns is not set. + +Depending on which flavor of MPI you are running, LAMMPS will look for +one of these 3 environment variables + +SLURM_LOCALID (various MPI variants compiled with SLURM support) +MV2_COMM_WORLD_LOCAL_RANK (Mvapich) +OMPI_COMM_WORLD_LOCAL_RANK (OpenMPI) :pre + +which are initialized by the "srun", "mpirun" or "mpiexec" commands. +The environment variable setting for each MPI rank is used to assign a +unique GPU ID to the MPI task. + +threads Nt :pre + +This option assigns Nt threads to each MPI task for +performing work when Kokkos is executing in OpenMP or pthreads mode. +The default is Nt = 1, which essentially runs in MPI-only mode. If +there are Np MPI tasks per physical node, you generally want Np*Nt = +the number of physical cores per node, to use your available hardware +optimally. This also sets the number of threads used by the host when +LAMMPS is compiled with CUDA=yes. + +numa Nm :pre + +This option is only relevant when using pthreads with hwloc support. +In this case Nm defines the number of NUMA regions (typically sockets) +on a node which will be utilized by a single MPI rank. By default Nm += 1. If this option is used the total number of worker-threads per +MPI rank is threads*numa. Currently it is almost always better to +assign at least one MPI rank per NUMA region, and leave numa set to +its default value of 1.
This is because letting a single process span +multiple NUMA regions induces a significant amount of cross NUMA data +traffic which is slow. + -log file :pre Specify a log file for LAMMPS to write status information to. In one-partition mode, if the switch is not used, LAMMPS writes to the file log.lammps. If this switch is used, LAMMPS writes to the specified file. In multi-partition mode, if the switch is not used, a log.lammps file is created with hi-level status information. Each partition also writes to a log.lammps.N file where N is the partition ID. If the switch is specified in multi-partition mode, the hi-level logfile is named "file" and each partition also logs information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no log files are created. Using a "log"_log.html command in the input script will override this setting. Option -plog will override the name of the partition log files file.N. -nocite :pre Disable writing the log.cite file which is normally written to list references for specific cite-able features used during a LAMMPS run. See the "citation page"_http://lammps.sandia.gov/cite.html for more details. -partition 8x2 4 5 ... :pre Invoke LAMMPS in multi-partition mode. When LAMMPS is run on P processors and this switch is not used, LAMMPS runs in one partition, i.e. all P processors run a single simulation. If this switch is used, the P processors are split into separate partitions and each partition runs its own simulation. The arguments to the switch specify the number of processors in each partition. Arguments of the form MxN mean M partitions, each with N processors. Arguments of the form N mean a single partition with N processors. The sum of processors in all partitions must equal P. Thus the command "-partition 8x2 4 5" has 10 partitions and runs on a total of 25 processors. Running with multiple partitions can be useful for running "multi-replica simulations"_Section_howto.html#howto_5, where each replica runs on one or a few processors. Note that with MPI installed on a machine (e.g. your desktop), you can run on more (virtual) processors than you have physical processors. To run multiple independent simulations from one input script, using multiple partitions, see "Section_howto 4"_Section_howto.html#howto_4 of the manual. World- and universe-style "variables"_variable.html are useful in this context. -plog file :pre Specify the base name for the partition log files, so partition N writes log information to file.N. If file is none, then no partition log files are created. This overrides the filename specified in the -log command-line option. This option is useful when working with large numbers of partitions, allowing the partition log files to be suppressed (-plog none) or placed in a sub-directory (-plog replica_files/log.lammps). If this option is not used the log file for partition N is log.lammps.N or whatever is specified by the -log command-line option. -pscreen file :pre Specify the base name for the partition screen file, so partition N writes screen information to file.N. If file is none, then no partition screen files are created. This overrides the filename specified in the -screen command-line option. This option is useful when working with large numbers of partitions, allowing the partition screen files to be suppressed (-pscreen none) or placed in a sub-directory (-pscreen replica_files/screen).
If this option is not used the screen file for partition N is screen.N or whatever is specified by the -screen command-line option. -restart restartfile datafile keyword value ... :pre Convert the restart file into a data file and immediately exit. This is the same operation as if the following 2-line input script were run: read_restart restartfile write_data datafile keyword value ... :pre Note that the specified restartfile and datafile can have wild-card characters ("*",%") as described by the "read_restart"_read_restart.html and "write_data"_write_data.html commands. But a filename such as file.* will need to be enclosed in quotes to avoid shell expansion of the "*" character. Also note that following datafile, the same optional keyword/value pairs can be listed as used by the "write_data"_write_data.html command. -reorder nth N -reorder custom filename :pre Reorder the processors in the MPI communicator used to instantiate LAMMPS, in one of several ways. The original MPI communicator ranks all P processors from 0 to P-1. The mapping of these ranks to physical processors is done by MPI before LAMMPS begins. It may be useful in some cases to alter the rank order. E.g. to insure that cores within each node are ranked in a desired order. Or when using the "run_style verlet/split"_run_style.html command with 2 partitions to insure that a specific Kspace processor (in the 2nd partition) is matched up with a specific set of processors in the 1st partition. See the "Section_accelerate"_Section_accelerate.html doc pages for more details. If the keyword {nth} is used with a setting {N}, then it means every Nth processor will be moved to the end of the ranking. This is useful when using the "run_style verlet/split"_run_style.html command with 2 partitions via the -partition command-line switch. The first set of processors will be in the first partition, the 2nd set in the 2nd partition. The -reorder command-line switch can alter this so that the 1st N procs in the 1st partition and one proc in the 2nd partition will be ordered consecutively, e.g. as the cores on one physical node. This can boost performance. For example, if you use "-reorder nth 4" and "-partition 9 3" and you are running on 12 processors, the processors will be reordered from 0 1 2 3 4 5 6 7 8 9 10 11 :pre to 0 1 2 4 5 6 8 9 10 3 7 11 :pre so that the processors in each partition will be 0 1 2 4 5 6 8 9 10 3 7 11 :pre See the "processors" command for how to insure processors from each partition could then be grouped optimally for quad-core nodes. If the keyword is {custom}, then a file that specifies a permutation of the processor ranks is also specified. The format of the reorder file is as follows. Any number of initial blank or comment lines (starting with a "#" character) can be present. These should be followed by P lines of the form: I J :pre where P is the number of processors LAMMPS was launched with. Note that if running in multi-partition mode (see the -partition switch above) P is the total number of processors in all partitions. The I and J values describe a permutation of the P processors. Every I and J should be values from 0 to P-1 inclusive. In the set of P I values, every proc ID should appear exactly once. Ditto for the set of P J values. A single I,J pairing means that the physical processor with rank I in the original MPI communicator will have rank J in the reordered communicator. 
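For illustration, here is what a small custom reorder file might look like for P = 4 processors, where the ranks of processors 1 and 2 are simply swapped (the permutation itself is only an example):

# example reorder file for P = 4
0 0
1 2
2 1
3 3 :pre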
Note that rank ordering can also be specified by many MPI implementations, either by environment variables that specify how to order physical processors, or by config files that specify what physical processors to assign to each MPI rank. The -reorder switch simply gives you a portable way to do this without relying on MPI itself. See the "processors out"_processors command for how to output info on the final assignment of physical processors to the LAMMPS simulation domain. -screen file :pre Specify a file for LAMMPS to write its screen information to. In one-partition mode, if the switch is not used, LAMMPS writes to the screen. If this switch is used, LAMMPS writes to the specified file instead and you will see no screen output. In multi-partition mode, if the switch is not used, hi-level status information is written to the screen. Each partition also writes to a screen.N file where N is the partition ID. If the switch is specified in multi-partition mode, the hi-level screen dump is named "file" and each partition also writes screen information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no screen output is performed. Option -pscreen will override the name of the partition screen files file.N. --suffix style :pre +-suffix style args :pre Use variants of various styles if they exist. The specified style can -be {opt}, {omp}, {gpu}, or {cuda}. These refer to optional packages that -LAMMPS can be built with, as described above in "Section -2.3"_#start_3. The "opt" style corrsponds to the OPT package, the -"omp" style to the USER-OMP package, the "gpu" style to the GPU -package, and the "cuda" style to the USER-CUDA package. +be {cuda}, {gpu}, {kk}, {omp}, or {opt}. These refer to optional +packages that LAMMPS can be built with, as described above in "Section +2.3"_#start_3. The "cuda" style corresponds to the USER-CUDA package, +the "gpu" style to the GPU package, the "kk" style to the KOKKOS +package, the "opt" style to the OPT package, and the "omp" style to +the USER-OMP package. As an example, all of the packages provide a "pair_style -lj/cut"_pair_lj.html variant, with style names lj/cut/opt, lj/cut/omp, -lj/cut/gpu, or lj/cut/cuda. A variant styles can be specified -explicitly in your input script, e.g. pair_style lj/cut/gpu. If the --suffix switch is used, you do not need to modify your input script. -The specified suffix (opt,omp,gpu,cuda) is automatically appended -whenever your input script command creates a new -"atom"_atom_style.html, "pair"_pair_style.html, "fix"_fix.html, +lj/cut"_pair_lj.html variant, with style names lj/cut/cuda, +lj/cut/gpu, lj/cut/kk, lj/cut/omp, or lj/cut/opt. A variant style +can be specified explicitly in your input script, e.g. pair_style +lj/cut/gpu. If the -suffix switch is used, you do not need to modify +your input script. The specified suffix (cuda,gpu,kk,omp,opt) is +automatically appended whenever your input script command creates a +new "atom"_atom_style.html, "pair"_pair_style.html, "fix"_fix.html, "compute"_compute.html, or "run"_run_style.html style. If the variant version does not exist, the standard version is created. For the GPU package, using this command-line switch also invokes the default GPU settings, as if the command "package gpu force/neigh 0 0 1" were used at the top of your input script. These settings can be changed by using the "package gpu"_package.html command in your script if desired.
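For example, assuming an executable named lmp_machine built with the corresponding packages, the following hypothetical command lines run the same input script with the GPU or KOKKOS variants of the styles it uses:

mpirun -np 4 lmp_machine -sf gpu -in in.script
mpirun -np 4 lmp_machine -sf kk -in in.script :pre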
+For the KOKKOS package, using this command-line switch also invokes +the default KOKKOS settings, as if the command "package kokkos neigh +full comm/exchange host comm/forward host " were used at the top of +your input script. These settings can be changed by using the +"package kokkos"_package.html command in your script if desired. + For the OMP package, using this command-line switch also invokes the default OMP settings, as if the command "package omp *" were used at the top of your input script. These settings can be changed by using the "package omp"_package.html command in your script if desired. -The "suffix"_suffix.html command can also set a suffix and it can also -turn off/on any suffix setting made via the command line. +The "suffix"_suffix.html command can also be used to set a suffix and it +can also turn off or back on any suffix setting made via the command +line. -var name value1 value2 ... :pre Specify a variable that will be defined for substitution purposes when the input script is read. "Name" is the variable name which can be a single character (referenced as $x in the input script) or a full string (referenced as $\{abc\}). An "index-style variable"_variable.html will be created and populated with the subsequent values, e.g. a set of filenames. Using this command-line option is equivalent to putting the line "variable name index value1 value2 ..." at the beginning of the input script. Defining an index variable as a command-line argument overrides any setting for the same index variable in the input script, since index variables cannot be re-defined. See the "variable"_variable.html command for more info on defining index and other kinds of variables and "this section"_Section_commands.html#cmd_2 for more info on using variables in input scripts. NOTE: Currently, the command-line parser looks for arguments that start with "-" to indicate new switches. Thus you cannot specify multiple variable values if any of them start with a "-", e.g. a negative numeric value. It is OK if the first value1 starts with a "-", since it is automatically skipped. :line 2.8 LAMMPS screen output :h4,link(start_8) As LAMMPS reads an input script, it prints information to both the screen and a log file about significant actions it takes to set up a simulation. When the simulation is ready to begin, LAMMPS performs various initializations and prints the amount of memory (in MBytes per processor) that the simulation requires. It also prints details of the initial thermodynamic state of the system. During the run itself, thermodynamic information is printed periodically, every few timesteps. When the run concludes, LAMMPS prints the final thermodynamic state and a total run time for the simulation. It then appends statistics about the CPU time and storage requirements for the simulation.
An example set of statistics is shown here: Loop time of 49.002 on 2 procs for 2004 atoms :pre Pair time (%) = 35.0495 (71.5267) Bond time (%) = 0.092046 (0.187841) Kspce time (%) = 6.42073 (13.103) Neigh time (%) = 2.73485 (5.5811) Comm time (%) = 1.50291 (3.06703) Outpt time (%) = 0.013799 (0.0281601) Other time (%) = 2.13669 (4.36041) :pre Nlocal: 1002 ave, 1015 max, 989 min Histogram: 1 0 0 0 0 0 0 0 0 1 Nghost: 8720 ave, 8724 max, 8716 min Histogram: 1 0 0 0 0 0 0 0 0 1 Neighs: 354141 ave, 361422 max, 346860 min Histogram: 1 0 0 0 0 0 0 0 0 1 :pre Total # of neighbors = 708282 Ave neighs/atom = 353.434 Ave special neighs/atom = 2.34032 Number of reneighborings = 42 Dangerous reneighborings = 2 :pre The first section gives the breakdown of the CPU run time (in seconds) into major categories. The second section lists the number of owned atoms (Nlocal), ghost atoms (Nghost), and pair-wise neighbors stored per processor. The max and min values give the spread of these values across processors with a 10-bin histogram showing the distribution. The total number of histogram counts is equal to the number of processors. The last section gives aggregate statistics for pair-wise neighbors and special neighbors that LAMMPS keeps track of (see the "special_bonds"_special_bonds.html command). The number of times neighbor lists were rebuilt during the run is given as well as the number of potentially "dangerous" rebuilds. If atom movement triggered neighbor list rebuilding (see the "neigh_modify"_neigh_modify.html command), then dangerous reneighborings are those that were triggered on the first timestep atom movement was checked for. If this count is non-zero you may wish to reduce the delay factor to insure no force interactions are missed by atoms moving beyond the neighbor skin distance before a rebuild takes place. If an energy minimization was performed via the "minimize"_minimize.html command, additional information is printed, e.g. Minimization stats: E initial, next-to-last, final = -0.895962 -2.94193 -2.94342 Gradient 2-norm init/final= 1920.78 20.9992 Gradient inf-norm init/final= 304.283 9.61216 Iterations = 36 Force evaluations = 177 :pre The first line lists the initial and final energy, as well as the energy on the next-to-last iteration. The next 2 lines give a measure of the gradient of the energy (force on all atoms). The 2-norm is the "length" of this force vector; the inf-norm is the largest component. The last 2 lines are statistics on how many iterations and force-evaluations the minimizer required. Multiple force evaluations are typically done at each iteration to perform a 1d line minimization in the search direction. If a "kspace_style"_kspace_style.html long-range Coulombics solve was performed during the run (PPPM, Ewald), then additional information is printed, e.g. FFT time (% of Kspce) = 0.200313 (8.34477) FFT Gflps 3d 1d-only = 2.31074 9.19989 :pre The first line gives the time spent doing 3d FFTs (4 per timestep) and the fraction it represents of the total KSpace time (listed above). Each 3d FFT requires computation (3 sets of 1d FFTs) and communication (transposes). The total flops performed is 5Nlog_2(N), where N is the number of points in the 3d grid. The FFTs are timed with and without the communication and a Gflop rate is computed. The 3d rate is with communication; the 1d rate is without (just the 1d FFTs). Thus you can estimate what fraction of your FFT time was spent in communication, roughly 75% in the example above. 
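As a worked check of that estimate using the numbers above: the ratio of the 3d rate (with communication) to the 1d rate (computation only) is the fraction of the FFT time spent on computation, and the remainder is communication:

2.31074 / 9.19989 = 0.25  (about 25% computation)
1 - 0.25 = 0.75           (about 75% communication) :pre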
:line 2.9 Tips for users of previous LAMMPS versions :h4,link(start_9) The current C++ version began with a complete rewrite of LAMMPS 2001, which was written in F90. Features of earlier versions of LAMMPS are listed in "Section_history"_Section_history.html. The F90 and F77 versions (2001 and 99) are also freely distributed as open-source codes; check the "LAMMPS WWW Site"_lws for distribution information if you prefer those versions. The 99 and 2001 versions are no longer under active development; they do not have all the features of C++ LAMMPS. If you are a previous user of LAMMPS 2001, these are the most significant changes you will notice in C++ LAMMPS: (1) The names and arguments of many input script commands have changed. All commands are now a single word (e.g. read_data instead of read data). (2) All the functionality of LAMMPS 2001 is included in C++ LAMMPS, but you may need to specify the relevant commands in different ways. (3) The format of the data file can be streamlined for some problems. See the "read_data"_read_data.html command for details. The data file section "Nonbond Coeff" has been renamed to "Pair Coeff" in C++ LAMMPS. (4) Binary restart files written by LAMMPS 2001 cannot be read by C++ LAMMPS with a "read_restart"_read_restart.html command. This is because they were output by F90 which writes in a different binary format than C or C++ writes or reads. Use the {restart2data} tool provided with LAMMPS 2001 to convert the 2001 restart file to a text data file. Then edit the data file as necessary before using the C++ LAMMPS "read_data"_read_data.html command to read it in. (5) There are numerous small numerical changes in C++ LAMMPS that mean you will not get identical answers when comparing to a 2001 run. However, your initial thermodynamic energy and MD trajectory should be close if you have set up the problem the same way for both codes.
diff --git a/doc/atom_style.html b/doc/atom_style.html index d7af8e203..0b606843a 100644 --- a/doc/atom_style.html +++ b/doc/atom_style.html @@ -1,237 +1,267 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>atom_style command </H3> <P><B>Syntax:</B> </P> <PRE>atom_style style args </PRE> <UL><LI>style = <I>angle</I> or <I>atomic</I> or <I>body</I> or <I>bond</I> or <I>charge</I> or <I>dipole</I> or <I>electron</I> or <I>ellipsoid</I> or <I>full</I> or <I>line</I> or <I>meso</I> or <I>molecular</I> or <I>peri</I> or <I>sphere</I> or <I>tri</I> or <I>template</I> or <I>hybrid</I> </UL> <PRE> args = none for any style except <I>body</I> and <I>hybrid</I> <I>body</I> args = bstyle bstyle-args bstyle = style of body particles bstyle-args = additional arguments specific to the bstyle see the <A HREF = "body.html">body</A> doc page for details <I>template</I> args = template-ID template-ID = ID of molecule template specified in a separate <A HREF = "molecule.html">molecule</A> command <I>hybrid</I> args = list of one or more sub-styles, each with their args </PRE> +<P>accelerated styles (with same args): +</P> +<UL><LI>style = <I>angle/cuda</I> or <I>atomic/cuda</I> or <I>atomic/kokkos</I> or <I>charge/cuda</I> or <I>full/cuda</I> +</UL> <P><B>Examples:</B> </P> <PRE>atom_style atomic atom_style bond atom_style full +atom_style full/cuda atom_style body nparticle 2 10 atom_style hybrid charge bond atom_style hybrid charge body nparticle 2 5 atom_style template myMols </PRE> <P><B>Description:</B> </P> <P>Define what style of atoms to use in a simulation. This determines what attributes are associated with the atoms. This command must be used before a simulation is setup via a <A HREF = "read_data.html">read_data</A>, <A HREF = "read_restart.html">read_restart</A>, or <A HREF = "create_box.html">create_box</A> command. </P> <P>Once a style is assigned, it cannot be changed, so use a style general enough to encompass all attributes. E.g. with style <I>bond</I>, angular terms cannot be used or added later to the model. It is OK to use a style more general than needed, though it may be slightly inefficient. </P> <P>The choice of style affects what quantities are stored by each atom, what quantities are communicated between processors to enable forces to be computed, and what quantities are listed in the data file read by the <A HREF = "read_data.html">read_data</A> command. </P> <P>These are the additional attributes of each style and the typical kinds of physical systems they are used to model. All styles store coordinates, velocities, atom IDs and types. See the <A HREF = "read_data.html">read_data</A>, <A HREF = "create_atoms.html">create_atoms</A>, and <A HREF = "set.html">set</A> commands for info on how to set these various quantities. 
</P> <DIV ALIGN=center><TABLE BORDER=1 > <TR><TD ><I>angle</I> </TD><TD > bonds and angles </TD><TD > bead-spring polymers with stiffness </TD></TR> <TR><TD ><I>atomic</I> </TD><TD > only the default values </TD><TD > coarse-grain liquids, solids, metals </TD></TR> <TR><TD ><I>body</I> </TD><TD > mass, inertia moments, quaternion, angular momentum </TD><TD > arbitrary bodies </TD></TR> <TR><TD ><I>bond</I> </TD><TD > bonds </TD><TD > bead-spring polymers </TD></TR> <TR><TD ><I>charge</I> </TD><TD > charge </TD><TD > atomic system with charges </TD></TR> <TR><TD ><I>dipole</I> </TD><TD > charge and dipole moment </TD><TD > system with dipolar particles </TD></TR> <TR><TD ><I>electron</I> </TD><TD > charge and spin and eradius </TD><TD > electronic force field </TD></TR> <TR><TD ><I>ellipsoid</I> </TD><TD > shape, quaternion, angular momentum </TD><TD > aspherical particles </TD></TR> <TR><TD ><I>full</I> </TD><TD > molecular + charge </TD><TD > bio-molecules </TD></TR> <TR><TD ><I>line</I> </TD><TD > end points, angular velocity </TD><TD > rigid bodies </TD></TR> <TR><TD ><I>meso</I> </TD><TD > rho, e, cv </TD><TD > SPH particles </TD></TR> <TR><TD ><I>molecular</I> </TD><TD > bonds, angles, dihedrals, impropers </TD><TD > uncharged molecules </TD></TR> <TR><TD ><I>peri</I> </TD><TD > mass, volume </TD><TD > mesocopic Peridynamic models </TD></TR> <TR><TD ><I>sphere</I> </TD><TD > diameter, mass, angular velocity </TD><TD > granular models </TD></TR> <TR><TD ><I>template</I> </TD><TD > template index, template atom </TD><TD > small molecules with fixed topology </TD></TR> <TR><TD ><I>tri</I> </TD><TD > corner points, angular momentum </TD><TD > rigid bodies </TD></TR> <TR><TD ><I>wavepacket</I> </TD><TD > charge, spin, eradius, etag, cs_re, cs_im </TD><TD > AWPMD </TD></TR></TABLE></DIV> <P>IMPORTANT NOTE: It is possible to add some attributes, such as a molecule ID, to atom styles that do not have them via the <A HREF = "fix_property_atom.html">fix property/atom</A> command. This command also allows new custom attributes consisting of extra integer or floating-point values to be added to atoms. See the <A HREF = "fix_property_atom.html">fix property/atom</A> doc page for examples of cases where this is useful and details on how to initialize, access, and output the custom values. </P> <P>All of the above styles define point particles, except the <I>sphere</I>, <I>ellipsoid</I>, <I>electron</I>, <I>peri</I>, <I>wavepacket</I>, <I>line</I>, <I>tri</I>, and <I>body</I> styles, which define finite-size particles. See <A HREF = "Section_howto.html#howto_14">Section_howto 14</A> for an overview of using finite-size particle models with LAMMPS. </P> <P>All of the point-particle styles assign mass to particles on a per-type basis, using the <A HREF = "mass.html">mass</A> command, The finite-size particle styles assign mass to individual particles on a per-particle basis. </P> <P>For the <I>sphere</I> style, the particles are spheres and each stores a per-particle diameter and mass. If the diameter > 0.0, the particle is a finite-size sphere. If the diameter = 0.0, it is a point particle. </P> <P>For the <I>ellipsoid</I> style, the particles are ellipsoids and each stores a flag which indicates whether it is a finite-size ellipsoid or a point particle. If it is an ellipsoid, it also stores a shape vector with the 3 diamters of the ellipsoid and a quaternion 4-vector with its orientation. 
</P> <P>For the <I>electron</I> style, the particles representing electrons are 3d Gaussians with a specified position and bandwidth or uncertainty in position, which is represented by the eradius = electron size. </P> <P>For the <I>peri</I> style, the particles are spherical and each stores a per-particle mass and volume. </P> <P>The <I>meso</I> style is for smoothed particle hydrodynamics (SPH) particles which store a density (rho), energy (e), and heat capacity (cv). </P> <P>The <I>wavepacket</I> style is similar to <I>electron</I>, but the electrons may consist of several Gaussian wave packets, summed up with coefficients cs= (cs_re,cs_im). Each of the wave packets is treated as a separate particle in LAMMPS, wave packets belonging to the same electron must have identical <I>etag</I> values. </P> <P>For the <I>line</I> style, the particles are idealized line segments and each stores a per-particle mass and length and orientation (i.e. the end points of the line segment). </P> <P>For the <I>tri</I> style, the particles are planar triangles and each stores a per-particle mass and size and orientation (i.e. the corner points of the triangle). </P> <P>The <I>template</I> style allows molecular topolgy (bonds,angles,etc) to be defined via a molecule template using the <A HREF = "molecule.txt">molecule</A> command. The template stores one or more molecules with a single copy of the topology info (bonds,angles,etc) of each. Individual atoms only store a template index and template atom to identify which molecule and which atom-within-the-molecule they represent. Using the <I>template</I> style instead of the <I>bond</I>, <I>angle</I>, <I>molecular</I> styles can save memory for systems comprised of a large number of small molecules, all of a single type (or small number of types). See the paper by Grime and Voth, in <A HREF = "#Grime">(Grime)</A>, for examples of how this can be advantageous for large-scale coarse-grained systems. </P> <P>IMPORTANT NOTE: When using the <I>template</I> style with a <A HREF = "molecule.html">molecule template</A> that contains multiple molecules, you should insure the atom types, bond types, angle_types, etc in all the molecules are consistent. E.g. if one molecule represents H2O and another CO2, then you probably do not want each molecule file to define 2 atom types and a single bond type, because they will conflict with each other when a mixture system of H2O and CO2 molecules is defined, e.g. by the <A HREF = "read_data.html">read_data</A> command. Rather the H2O molecule should define atom types 1 and 2, and bond type 1. And the CO2 molecule should define atom types 3 and 4 (or atom types 3 and 2 if a single oxygen type is desired), and bond type 2. </P> <P>For the <I>body</I> style, the particles are arbitrary bodies with internal attributes defined by the "style" of the bodies, which is specified by the <I>bstyle</I> argument. Body particles can represent complex entities, such as surface meshes of discrete points, collections of sub-particles, deformable objects, etc. </P> <P>The <A HREF = "body.html">body</A> doc page descibes the body styles LAMMPS currently supports, and provides more details as to the kind of body particles they represent. For all styles, each body particle stores moments of inertia and a quaternion 4-vector, so that its orientation and position can be time integrated due to forces and torques. </P> <P>Note that there may be additional arguments required along with the <I>bstyle</I> specification, in the atom_style body command. 
These arguments are described in the <A HREF = "body.html">body</A> doc page. </P> <HR> <P>Typically, simulations require only a single (non-hybrid) atom style. If some atoms in the simulation do not have all the properties defined by a particular style, use the simplest style that defines all the needed properties by any atom. For example, if some atoms in a simulation are charged, but others are not, use the <I>charge</I> style. If some atoms have bonds, but others do not, use the <I>bond</I> style. </P> <P>The only scenario where the <I>hybrid</I> style is needed is if there is no single style which defines all needed properties of all atoms. For example, if you want dipolar particles which will rotate due to torque, you would need to use "atom_style hybrid sphere dipole". When a hybrid style is used, atoms store and communicate the union of all quantities implied by the individual styles. </P> <P>When using the <I>hybrid</I> style, you cannot combine the <I>template</I> style with another molecular style that stores bond,angle,etc info on a per-atom basis. </P> <P>LAMMPS can be extended with new atom styles as well as new body styles; see <A HREF = "Section_modify.html">this section</A>. </P> +<HR> + +<P>Styles with a <I>cuda</I> or <I>kk</I> suffix are functionally the same as the +corresponding style without the suffix. They have been optimized to +run faster, depending on your available hardware, as discussed in +<A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual. The +accelerated styles take the same arguments and should produce the same +results, except for round-off and precision issues. +</P> +<P>Note that other acceleration packages in LAMMPS, specifically the GPU, +USER-OMP, and OPT packages do not use of accelerated atom styles. +</P> +<P>These accelerated styles are part of the USER-CUDA and KOKKOS packages +respectively. They are only enabled if LAMMPS was built with those +packages. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section +for more info. +</P> +<P>You can specify the accelerated styles explicitly in your input script +by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line +switch</A> when you invoke LAMMPS, or you can +use the <A HREF = "suffix.html">suffix</A> command in your input script. +</P> +<P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for +more instructions on how to use the accelerated styles effectively. +</P> <P><B>Restrictions:</B> </P> <P>This command cannot be used after the simulation box is defined by a <A HREF = "read_data.html">read_data</A> or <A HREF = "create_box.html">create_box</A> command. </P> <P>The <I>angle</I>, <I>bond</I>, <I>full</I>, <I>molecular</I>, and <I>template</I> styles are part of the MOLECULAR package. The <I>line</I> and <I>tri</I> styles are part of the ASPHERE pacakge. The <I>body</I> style is part of the BODY package. The <I>dipole</I> style is part of the DIPOLE package. The <I>peri</I> style is part of the PERI package for Peridynamics. The <I>electron</I> style is part of the USER-EFF package for <A HREF = "pair_eff.html">electronic force fields</A>. The <I>meso</I> style is part of the USER-SPH package for smoothed particle hydrodyanmics (SPH). See <A HREF = "USER/sph/SPH_LAMMPS_userguide.pdf">this PDF guide</A> to using SPH in LAMMPS. The <I>wavepacket</I> style is part of the USER-AWPMD package for the <A HREF = "pair_awpmd.html">antisymmetrized wave packet MD method</A>. 
They are only enabled if LAMMPS was built with that package. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. </P> <P><B>Related commands:</B> </P> <P><A HREF = "read_data.html">read_data</A>, <A HREF = "pair_style.html">pair_style</A> </P> <P><B>Default:</B> </P> <P>atom_style atomic </P> <HR> <A NAME = "Grime"></A> <P><B>(Grime)</B> Grime and Voth, to appear in J Chem Theory & Computation (2014). </P> </HTML> diff --git a/doc/atom_style.txt b/doc/atom_style.txt index 1819324fd..8690d3053 100644 --- a/doc/atom_style.txt +++ b/doc/atom_style.txt @@ -1,230 +1,260 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line atom_style command :h3 [Syntax:] atom_style style args :pre style = {angle} or {atomic} or {body} or {bond} or {charge} or {dipole} or \ {electron} or {ellipsoid} or {full} or {line} or {meso} or \ {molecular} or {peri} or {sphere} or {tri} or {template} or {hybrid} :ul args = none for any style except {body} and {hybrid} {body} args = bstyle bstyle-args bstyle = style of body particles bstyle-args = additional arguments specific to the bstyle see the "body"_body.html doc page for details {template} args = template-ID template-ID = ID of molecule template specified in a separate "molecule"_molecule.html command {hybrid} args = list of one or more sub-styles, each with their args :pre +accelerated styles (with same args): + +style = {angle/cuda} or {atomic/cuda} or {atomic/kokkos} or {charge/cuda} or {full/cuda} :ul + [Examples:] atom_style atomic atom_style bond atom_style full +atom_style full/cuda atom_style body nparticle 2 10 atom_style hybrid charge bond atom_style hybrid charge body nparticle 2 5 atom_style template myMols :pre [Description:] Define what style of atoms to use in a simulation. This determines what attributes are associated with the atoms. This command must be used before a simulation is setup via a "read_data"_read_data.html, "read_restart"_read_restart.html, or "create_box"_create_box.html command. Once a style is assigned, it cannot be changed, so use a style general enough to encompass all attributes. E.g. with style {bond}, angular terms cannot be used or added later to the model. It is OK to use a style more general than needed, though it may be slightly inefficient. The choice of style affects what quantities are stored by each atom, what quantities are communicated between processors to enable forces to be computed, and what quantities are listed in the data file read by the "read_data"_read_data.html command. These are the additional attributes of each style and the typical kinds of physical systems they are used to model. All styles store coordinates, velocities, atom IDs and types. See the "read_data"_read_data.html, "create_atoms"_create_atoms.html, and "set"_set.html commands for info on how to set these various quantities. 
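For example, a minimal sketch (the data file name "data.salt" and the
type/charge values are illustrative assumptions, not part of this doc)
of choosing a style and then setting one of these per-atom quantities
with the "set"_set.html command:

atom_style charge
read_data data.salt
set type 1 charge 1.0
set type 2 charge -1.0 :pre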
{angle} | bonds and angles | bead-spring polymers with stiffness | {atomic} | only the default values | coarse-grain liquids, solids, metals | {body} | mass, inertia moments, quaternion, angular momentum | arbitrary bodies | {bond} | bonds | bead-spring polymers | {charge} | charge | atomic system with charges | {dipole} | charge and dipole moment | system with dipolar particles | {electron} | charge and spin and eradius | electronic force field | {ellipsoid} | shape, quaternion, angular momentum | aspherical particles | {full} | molecular + charge | bio-molecules | {line} | end points, angular velocity | rigid bodies | {meso} | rho, e, cv | SPH particles | {molecular} | bonds, angles, dihedrals, impropers | uncharged molecules | {peri} | mass, volume | mesocopic Peridynamic models | {sphere} | diameter, mass, angular velocity | granular models | {template} | template index, template atom | small molecules with fixed topology | {tri} | corner points, angular momentum | rigid bodies | {wavepacket} | charge, spin, eradius, etag, cs_re, cs_im | AWPMD :tb(c=3,s=|) IMPORTANT NOTE: It is possible to add some attributes, such as a molecule ID, to atom styles that do not have them via the "fix property/atom"_fix_property_atom.html command. This command also allows new custom attributes consisting of extra integer or floating-point values to be added to atoms. See the "fix property/atom"_fix_property_atom.html doc page for examples of cases where this is useful and details on how to initialize, access, and output the custom values. All of the above styles define point particles, except the {sphere}, {ellipsoid}, {electron}, {peri}, {wavepacket}, {line}, {tri}, and {body} styles, which define finite-size particles. See "Section_howto 14"_Section_howto.html#howto_14 for an overview of using finite-size particle models with LAMMPS. All of the point-particle styles assign mass to particles on a per-type basis, using the "mass"_mass.html command, The finite-size particle styles assign mass to individual particles on a per-particle basis. For the {sphere} style, the particles are spheres and each stores a per-particle diameter and mass. If the diameter > 0.0, the particle is a finite-size sphere. If the diameter = 0.0, it is a point particle. For the {ellipsoid} style, the particles are ellipsoids and each stores a flag which indicates whether it is a finite-size ellipsoid or a point particle. If it is an ellipsoid, it also stores a shape vector with the 3 diamters of the ellipsoid and a quaternion 4-vector with its orientation. For the {electron} style, the particles representing electrons are 3d Gaussians with a specified position and bandwidth or uncertainty in position, which is represented by the eradius = electron size. For the {peri} style, the particles are spherical and each stores a per-particle mass and volume. The {meso} style is for smoothed particle hydrodynamics (SPH) particles which store a density (rho), energy (e), and heat capacity (cv). The {wavepacket} style is similar to {electron}, but the electrons may consist of several Gaussian wave packets, summed up with coefficients cs= (cs_re,cs_im). Each of the wave packets is treated as a separate particle in LAMMPS, wave packets belonging to the same electron must have identical {etag} values. For the {line} style, the particles are idealized line segments and each stores a per-particle mass and length and orientation (i.e. the end points of the line segment). 
For the {tri} style, the particles are planar triangles and each stores a per-particle mass and size and orientation (i.e. the corner points of the triangle). The {template} style allows molecular topolgy (bonds,angles,etc) to be defined via a molecule template using the "molecule"_molecule.txt command. The template stores one or more molecules with a single copy of the topology info (bonds,angles,etc) of each. Individual atoms only store a template index and template atom to identify which molecule and which atom-within-the-molecule they represent. Using the {template} style instead of the {bond}, {angle}, {molecular} styles can save memory for systems comprised of a large number of small molecules, all of a single type (or small number of types). See the paper by Grime and Voth, in "(Grime)"_#Grime, for examples of how this can be advantageous for large-scale coarse-grained systems. IMPORTANT NOTE: When using the {template} style with a "molecule template"_molecule.html that contains multiple molecules, you should insure the atom types, bond types, angle_types, etc in all the molecules are consistent. E.g. if one molecule represents H2O and another CO2, then you probably do not want each molecule file to define 2 atom types and a single bond type, because they will conflict with each other when a mixture system of H2O and CO2 molecules is defined, e.g. by the "read_data"_read_data.html command. Rather the H2O molecule should define atom types 1 and 2, and bond type 1. And the CO2 molecule should define atom types 3 and 4 (or atom types 3 and 2 if a single oxygen type is desired), and bond type 2. For the {body} style, the particles are arbitrary bodies with internal attributes defined by the "style" of the bodies, which is specified by the {bstyle} argument. Body particles can represent complex entities, such as surface meshes of discrete points, collections of sub-particles, deformable objects, etc. The "body"_body.html doc page descibes the body styles LAMMPS currently supports, and provides more details as to the kind of body particles they represent. For all styles, each body particle stores moments of inertia and a quaternion 4-vector, so that its orientation and position can be time integrated due to forces and torques. Note that there may be additional arguments required along with the {bstyle} specification, in the atom_style body command. These arguments are described in the "body"_body.html doc page. :line Typically, simulations require only a single (non-hybrid) atom style. If some atoms in the simulation do not have all the properties defined by a particular style, use the simplest style that defines all the needed properties by any atom. For example, if some atoms in a simulation are charged, but others are not, use the {charge} style. If some atoms have bonds, but others do not, use the {bond} style. The only scenario where the {hybrid} style is needed is if there is no single style which defines all needed properties of all atoms. For example, if you want dipolar particles which will rotate due to torque, you would need to use "atom_style hybrid sphere dipole". When a hybrid style is used, atoms store and communicate the union of all quantities implied by the individual styles. When using the {hybrid} style, you cannot combine the {template} style with another molecular style that stores bond,angle,etc info on a per-atom basis. LAMMPS can be extended with new atom styles as well as new body styles; see "this section"_Section_modify.html. 
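As an illustration of the {hybrid} case mentioned above (the group
name "dips" and the choice of integrator are assumptions for the
sketch, not prescribed here), dipolar finite-size spheres that rotate
due to torque could be set up as:

atom_style hybrid sphere dipole
fix 1 dips nve/sphere update dipole :pre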
+:line + +Styles with a {cuda} or {kk} suffix are functionally the same as the +corresponding style without the suffix. They have been optimized to +run faster, depending on your available hardware, as discussed in +"Section_accelerate"_Section_accelerate.html of the manual. The +accelerated styles take the same arguments and should produce the same +results, except for round-off and precision issues. + +Note that the other acceleration packages in LAMMPS, specifically the GPU, +USER-OMP, and OPT packages, do not have accelerated atom styles. + +These accelerated styles are part of the USER-CUDA and KOKKOS packages, +respectively. They are only enabled if LAMMPS was built with those +packages. See the "Making LAMMPS"_Section_start.html#start_3 section +for more info. + +You can specify the accelerated styles explicitly in your input script +by including their suffix, or you can use the "-suffix command-line +switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can +use the "suffix"_suffix.html command in your input script. + +See "Section_accelerate"_Section_accelerate.html of the manual for +more instructions on how to use the accelerated styles effectively. + [Restrictions:] This command cannot be used after the simulation box is defined by a "read_data"_read_data.html or "create_box"_create_box.html command. The {angle}, {bond}, {full}, {molecular}, and {template} styles are part of the MOLECULAR package. The {line} and {tri} styles are part of the ASPHERE package. The {body} style is part of the BODY package. The {dipole} style is part of the DIPOLE package. The {peri} style is part of the PERI package for Peridynamics. The {electron} style is part of the USER-EFF package for "electronic force fields"_pair_eff.html. The {meso} style is part of the USER-SPH package for smoothed particle hydrodynamics (SPH). See "this PDF guide"_USER/sph/SPH_LAMMPS_userguide.pdf to using SPH in LAMMPS. The {wavepacket} style is part of the USER-AWPMD package for the "antisymmetrized wave packet MD method"_pair_awpmd.html. They are only enabled if LAMMPS was built with that package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. [Related commands:] "read_data"_read_data.html, "pair_style"_pair_style.html [Default:] atom_style atomic :line :link(Grime) [(Grime)] Grime and Voth, to appear in J Chem Theory & Computation (2014). diff --git a/doc/fix_nve.html b/doc/fix_nve.html index e70474fe0..e027559e8 100644 --- a/doc/fix_nve.html +++ b/doc/fix_nve.html @@ -1,78 +1,81 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>fix nve command </H3> <H3>fix nve/cuda command </H3> +<H3>fix nve/kk command +</H3> <H3>fix nve/omp command </H3> <P><B>Syntax:</B> </P> <PRE>fix ID group-ID nve </PRE> <UL><LI>ID, group-ID are documented in <A HREF = "fix.html">fix</A> command <LI>nve = style name of this fix command </UL> <P><B>Examples:</B> </P> <PRE>fix 1 all nve </PRE> <P><B>Description:</B> </P> <P>Perform constant NVE integration to update position and velocity for atoms in the group each timestep. V is volume; E is energy. This creates a system trajectory consistent with the microcanonical ensemble. </P> <HR> -<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally -the same as the corresponding style without the suffix.
They have -been optimized to run faster, depending on your available hardware, as -discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the -manual. The accelerated styles take the same arguments and should -produce the same results, except for round-off and precision issues. +<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I> suffix are +functionally the same as the corresponding style without the suffix. +They have been optimized to run faster, depending on your available +hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> +of the manual. The accelerated styles take the same arguments and +should produce the same results, except for round-off and precision +issues. </P> -<P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT -packages, respectively. They are only enabled if LAMMPS was built with -those packages. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> -section for more info. +<P>These accelerated styles are part of the USER-CUDA, GPU, KOKKOS, +USER-OMP and OPT packages, respectively. They are only enabled if +LAMMPS was built with those packages. See the <A HREF = "Section_start.html#start_3">Making +LAMMPS</A> section for more info. </P> <P>You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A> when you invoke LAMMPS, or you can use the <A HREF = "suffix.html">suffix</A> command in your input script. </P> <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for more instructions on how to use the accelerated styles effectively. </P> <HR> <P><B>Restart, fix_modify, output, run start/stop, minimize info:</B> </P> <P>No information about this fix is written to <A HREF = "restart.html">binary restart files</A>. None of the <A HREF = "fix_modify.html">fix_modify</A> options are relevant to this fix. No global or per-atom quantities are stored by this fix for access by various <A HREF = "Section_howto.html#howto_15">output commands</A>. No parameter of this fix can be used with the <I>start/stop</I> keywords of the <A HREF = "run.html">run</A> command. This fix is not invoked during <A HREF = "minimize.html">energy minimization</A>. </P> <P><B>Restrictions:</B> none </P> <P><B>Related commands:</B> </P> <P><A HREF = "fix_nh.html">fix nvt</A>, <A HREF = "fix_nh.html">fix npt</A> </P> <P><B>Default:</B> none </P> </HTML> diff --git a/doc/fix_nve.txt b/doc/fix_nve.txt index b43a78c62..46f842d37 100644 --- a/doc/fix_nve.txt +++ b/doc/fix_nve.txt @@ -1,71 +1,73 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line fix nve command :h3 fix nve/cuda command :h3 +fix nve/kk command :h3 fix nve/omp command :h3 [Syntax:] fix ID group-ID nve :pre ID, group-ID are documented in "fix"_fix.html command nve = style name of this fix command :ul [Examples:] fix 1 all nve :pre [Description:] Perform constant NVE integration to update position and velocity for atoms in the group each timestep. V is volume; E is energy. This creates a system trajectory consistent with the microcanonical ensemble. :line -Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally -the same as the corresponding style without the suffix. 
They have -been optimized to run faster, depending on your available hardware, as -discussed in "Section_accelerate"_Section_accelerate.html of the -manual. The accelerated styles take the same arguments and should -produce the same results, except for round-off and precision issues. - -These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT -packages, respectively. They are only enabled if LAMMPS was built with -those packages. See the "Making LAMMPS"_Section_start.html#start_3 -section for more info. +Styles with a {cuda}, {gpu}, {kk}, {omp}, or {opt} suffix are +functionally the same as the corresponding style without the suffix. +They have been optimized to run faster, depending on your available +hardware, as discussed in "Section_accelerate"_Section_accelerate.html +of the manual. The accelerated styles take the same arguments and +should produce the same results, except for round-off and precision +issues. + +These accelerated styles are part of the USER-CUDA, GPU, KOKKOS, +USER-OMP and OPT packages, respectively. They are only enabled if +LAMMPS was built with those packages. See the "Making +LAMMPS"_Section_start.html#start_3 section for more info. You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the "-suffix command-line switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can use the "suffix"_suffix.html command in your input script. See "Section_accelerate"_Section_accelerate.html of the manual for more instructions on how to use the accelerated styles effectively. :line [Restart, fix_modify, output, run start/stop, minimize info:] No information about this fix is written to "binary restart files"_restart.html. None of the "fix_modify"_fix_modify.html options are relevant to this fix. No global or per-atom quantities are stored by this fix for access by various "output commands"_Section_howto.html#howto_15. No parameter of this fix can be used with the {start/stop} keywords of the "run"_run.html command. This fix is not invoked during "energy minimization"_minimize.html. [Restrictions:] none [Related commands:] "fix nvt"_fix_nh.html, "fix npt"_fix_nh.html [Default:] none diff --git a/doc/fix_rigid.html b/doc/fix_rigid.html index 88ff880c0..aecf8bcfe 100644 --- a/doc/fix_rigid.html +++ b/doc/fix_rigid.html @@ -1,802 +1,819 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>fix rigid command </H3> <H3>fix rigid/nve command </H3> <H3>fix rigid/nvt command </H3> <H3>fix rigid/npt command </H3> <H3>fix rigid/nph command </H3> <H3>fix rigid/small command </H3> <H3>fix rigid/nve/small command </H3> <H3>fix rigid/nvt/small command </H3> <H3>fix rigid/npt/small command </H3> <H3>fix rigid/nph/small command </H3> <P><B>Syntax:</B> </P> <PRE>fix ID group-ID style bodystyle args keyword values ... </PRE> <UL><LI>ID, group-ID are documented in <A HREF = "fix.html">fix</A> command <LI>style = <I>rigid</I> or <I>rigid/nve</I> or <I>rigid/nvt</I> or <I>rigid/npt</I> or <I>rigid/nph</I> or <I>rigid/small</I> or <I>rigid/nve/small</I> or <I>rigid/nvt/small</I> or <I>rigid/npt/small</I> or <I>rigid/nph/small</I> <LI>bodystyle = <I>single</I> or <I>molecule</I> or <I>group</I> <PRE> <I>single</I> args = none <I>molecule</I> args = none <I>group</I> args = N groupID1 groupID2 ... N = # of groups groupID1, groupID2, ... 
= list of N group IDs </PRE> <LI>zero or more keyword/value pairs may be appended <LI>keyword = <I>langevin</I> or <I>temp</I> or <I>iso</I> or <I>aniso</I> or <I>x</I> or <I>y</I> or <I>z</I> or <I>couple</I> or <I>tparam</I> or <I>pchain</I> or <I>dilate</I> or <I>force</I> or <I>torque</I> or <I>infile</I> <PRE> <I>langevin</I> values = Tstart Tstop Tperiod seed Tstart,Tstop = desired temperature at start/stop of run (temperature units) Tdamp = temperature damping parameter (time units) seed = random number seed to use for white noise (positive integer) <I>temp</I> values = Tstart Tstop Tdamp Tstart,Tstop = desired temperature at start/stop of run (temperature units) Tdamp = temperature damping parameter (time units) <I>iso</I> or <I>aniso</I> values = Pstart Pstop Pdamp Pstart,Pstop = scalar external pressure at start/end of run (pressure units) Pdamp = pressure damping parameter (time units) <I>x</I> or <I>y</I> or <I>z</I> values = Pstart Pstop Pdamp Pstart,Pstop = external stress tensor component at start/end of run (pressure units) Pdamp = stress damping parameter (time units) <I>couple</I> = <I>none</I> or <I>xyz</I> or <I>xy</I> or <I>yz</I> or <I>xz</I> <I>tparam</I> values = Tchain Titer Torder Tchain = length of Nose/Hoover thermostat chain Titer = number of thermostat iterations performed Torder = 3 or 5 = Yoshida-Suzuki integration parameters <I>pchain</I> values = Pchain Pchain = length of the Nose/Hoover thermostat chain coupled with the barostat <I>dilate</I> value = dilate-group-ID dilate-group-ID = only dilate atoms in this group due to barostat volume changes <I>force</I> values = M xflag yflag zflag M = which rigid body from 1-Nbody (see asterisk form below) xflag,yflag,zflag = off/on if component of center-of-mass force is active <I>torque</I> values = M xflag yflag zflag M = which rigid body from 1-Nbody (see asterisk form below) xflag,yflag,zflag = off/on if component of center-of-mass torque is active <I>infile</I> filename filename = file with per-body values of mass, center-of-mass, moments of inertia <I>mol</I> value = template-ID template-ID = ID of molecule template specified in a separate <A HREF = "molecule.html">molecule</A> command </PRE> </UL> <P><B>Examples:</B> </P> <PRE>fix 1 clump rigid single fix 1 clump rigid/small molecule fix 1 clump rigid single force 1 off off on langevin 1.0 1.0 1.0 428984 fix 1 polychains rigid/nvt molecule temp 1.0 1.0 5.0 fix 1 polychains rigid molecule force 1*5 off off off force 6*10 off off on fix 1 polychains rigid/small molecule langevin 1.0 1.0 1.0 428984 fix 2 fluid rigid group 3 clump1 clump2 clump3 torque * off off off fix 1 rods rigid/npt molecule temp 300.0 300.0 100.0 iso 0.5 0.5 10.0 fix 1 particles rigid/npt molecule temp 1.0 1.0 5.0 x 0.5 0.5 1.0 z 0.5 0.5 1.0 couple xz fix 1 water rigid/nph molecule iso 0.5 0.5 1.0 fix 1 particles rigid/npt/small molecule temp 1.0 1.0 1.0 iso 0.5 0.5 1.0 </PRE> <P><B>Description:</B> </P> <P>Treat one or more sets of atoms as independent rigid bodies. This means that each timestep the total force and torque on each rigid body is computed as the sum of the forces and torques on its constituent particles. The coordinates, velocities, and orientations of the atoms in each body are then updated so that the body moves and rotates as a single entity. </P> <P>Examples of large rigid bodies are a colloidal particle, or portions of a biomolecule such as a protein. 
</P> <P>Examples of small rigid bodies are patchy nanoparticles, such as those modeled in <A HREF = "#Zhang">this paper</A> by Sharon Glotzer's group, clumps of granular particles, lipid molecules consisting of one or more point dipoles connected to other spheroids or ellipsoids, irregular particles built from line segments (2d) or triangles (3d), and coarse-grain models of nano or colloidal particles consisting of a small number of constituent particles. Note that the <A HREF = "fix_shake.html">fix shake</A> command can also be used to rigidify small molecules of 2, 3, or 4 atoms, e.g. water molecules. That fix treats the constituent atoms as point masses. </P> <P>These fixes also update the positions and velocities of the atoms in each rigid body via time integration, in the NVE, NVT, NPT, or NPH ensemble, as described below. </P> <P>There are two main variants of this fix, fix rigid and fix rigid/small. The NVE/NVT/NPT/NPH versions belong to one of the two variants, as their style names indicate. </P> <P>IMPORTANT NOTE: Not all of the bodystyle options and keyword/value options are available for both the <I>rigid</I> and <I>rigid/small</I> variants. See details below. </P> <P>The <I>rigid</I> variant is typically the best choice for a system with a small number of large rigid bodies, each of which can extend across the domain of many processors. It operates by creating a single global list of rigid bodies, which all processors contribute to. MPI_Allreduce operations are performed each timestep to sum the contributions from each processor to the force and torque on all the bodies. This operation will not scale well in parallel if large numbers of rigid bodies are simulated. </P> <P>The <I>rigid/small</I> variant is typically best for a system with a large number of small rigid bodies. Each body is assigned to the atom closest to the geometrical center of the body. The fix operates using local lists of rigid bodies owned by each processor and information is exchanged and summed via local communication between neighboring processors when ghost atom info is accumulated. </P> <P>IMPORTANT NOTE: To use <I>rigid/small</I> the ghost atom cutoff must be large enough to span the distance between the atom that owns the body and every other atom in the body. This distance value is printed out when the rigid bodies are defined. If the <A HREF = "pair_style.html">pair_style</A> cutoff plus neighbor skin does not span this distance, then you should use the <A HREF = "communicate.html">communicate cutoff</A> command with a setting epsilon larger than the distance. </P> <P>Which of the two variants is faster for a particular problem is hard to predict. The best way to decide is to perform a short test run. Both variants should give identical numerical answers for short runs. Long runs should give statistically similar results, but round-off differences may accumulate to produce divergent trajectories. </P> <P>IMPORTANT NOTE: You should not update the atoms in rigid bodies via other time-integration fixes (e.g. <A HREF = "fix_nve.html">fix nve</A>, <A HREF = "fix_nvt.html">fix nvt</A>, <A HREF = "fix_npt.html">fix npt</A>), or you will be integrating their motion more than once each timestep. When performing a hybrid simulation with some atoms in rigid bodies, and some not, a separate time integration fix like <A HREF = "fix_nve.html">fix nve</A> or <A HREF = "fix_nh.html">fix nvt</A> should be used for the non-rigid particles.
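For example, here is a minimal sketch (the group names and the
molecule-ID cutoff are illustrative assumptions, not taken from this
doc) of integrating rigid bodies and non-rigid particles separately:
</P> <PRE>group bodies molecule <= 50
group free subtract all bodies
fix 1 bodies rigid molecule
fix 2 free nve
</PRE> <P>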
</P> <P>IMPORTANT NOTE: These fixes are overkill if you simply want to hold a collection of atoms stationary or have them move with a constant velocity. A simpler way to hold atoms stationary is to not include those atoms in your time integration fix. E.g. use "fix 1 mobile nve" instead of "fix 1 all nve", where "mobile" is the group of atoms that you want to move. You can move atoms with a constant velocity by assigning them an initial velocity (via the <A HREF = "velocity.html">velocity</A> command), setting the force on them to 0.0 (via the <A HREF = "fix_setforce.html">fix setforce</A> command), and integrating them as usual (e.g. via the <A HREF = "fix_nve.html">fix nve</A> command). </P> <P>IMPORTANT NOTE: The aggregate properties of each rigid body are calculated at the start of each simulation run. These include its center of mass, moments of inertia, and net velocity and angular momentum. This means that before or between runs, per-atom properties can be changed, e.g. via the <A HREF = "set.html">set</A> or <A HREF = "velocity.html">velocity</A> command, which will affect the bodies. An exception is if the <I>infile</I> keyword is used, then all the body properties (except net velocity and angular momentum) are only calculated once so that values from the file are valid. </P> <HR> <P>Each rigid body must have two or more atoms. An atom can belong to at most one rigid body. Which atoms are in which bodies can be defined via several options. </P> <P>IMPORTANT NOTE: With fix rigid/small, which requires bodystyle <I>molecule</I>, you can define a system that has no rigid bodies initially. This is useful when you are adding rigid bodies on-the-fly via commands such as <A HREF = "fix_deposit.html">fix deposit</A> or <A HREF = "fix_pour.html">fix pour</A>. </P> <P>For bodystyle <I>single</I> the entire fix group of atoms is treated as one rigid body. This option is only allowed for fix rigid and its sub-styles. </P> <P>For bodystyle <I>molecule</I>, each set of atoms in the fix group with a different molecule ID is treated as a rigid body. This option is allowed for fix rigid and fix rigid/small, and their sub-styles. Note that atoms with a molecule ID = 0 will be treated as a single rigid body. For a system with atomic solvent (typically this is atoms with molecule ID = 0) surrounding rigid bodies, this may not be what you want. Thus you should be careful to use a fix group that only includes atoms you want to be part of rigid bodies. </P> <P>For bodystyle <I>group</I>, each of the listed groups is treated as a separate rigid body. Only atoms that are also in the fix group are included in each rigid body. This option is only allowed for fix rigid and its sub-styles. </P> <P>IMPORTANT NOTE: To compute the initial center-of-mass position and other properties of each rigid body, the image flags for each atom in the body are used to "unwrap" the atom coordinates. Thus you must insure that these image flags are consistent so that the unwrapping creates a valid rigid body (one where the atoms are close together), particularly if the atoms in a single rigid body straddle a periodic boundary. This means the input data file or restart file must define the image flags for each atom consistently or that you have used the <A HREF = "set.html">set</A> command to specify them correctly. If a dimension is non-periodic then the image flag of each atom must be 0 in that dimension, else an error is generated. 
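For example, one illustrative way (assuming a group named "bodies" and
that no body initially straddles a periodic boundary) to make the image
flags consistent via the <A HREF = "set.html">set</A> command before the fix is defined:
</P> <PRE>set group bodies image 0 0 0
</PRE> <P>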
</P> <P>The <I>force</I> and <I>torque</I> keywords discussed next are only allowed for fix rigid and its sub-styles. </P> <P>By default, each rigid body is acted on by other atoms which induce an external force and torque on its center of mass, causing it to translate and rotate. Components of the external center-of-mass force and torque can be turned off by the <I>force</I> and <I>torque</I> keywords. This may be useful if you wish a body to rotate but not translate, or vice versa, or if you wish it to rotate or translate continuously unaffected by interactions with other particles. Note that if you expect a rigid body not to move or rotate by using these keywords, you must insure its initial center-of-mass translational or angular velocity is 0.0. Otherwise the initial translational or angular momentum the body has will persist. </P> <P>An xflag, yflag, or zflag set to <I>off</I> means turn off the component of force or torque in that dimension. A setting of <I>on</I> means turn on the component, which is the default. Which rigid body(s) the settings apply to is determined by the first argument of the <I>force</I> and <I>torque</I> keywords. It can be an integer M from 1 to Nbody, where Nbody is the number of rigid bodies defined. A wild-card asterisk can be used in place of, or in conjunction with, the M argument to set the flags for multiple rigid bodies. This takes the form "*" or "*n" or "n*" or "m*n". If N = the number of rigid bodies, then an asterisk with no numeric values means all bodies from 1 to N. A leading asterisk means all bodies from 1 to n (inclusive). A trailing asterisk means all bodies from n to N (inclusive). A middle asterisk means all bodies from m to n (inclusive). Note that you can use the <I>force</I> or <I>torque</I> keywords as many times as you like. If a particular rigid body has its component flags set multiple times, the settings from the final keyword are used. </P> <P>IMPORTANT NOTE: For computational efficiency, you may wish to turn off pairwise and bond interactions within each rigid body, as they no longer contribute to the motion. The <A HREF = "neigh_modify.html">neigh_modify exclude</A> and <A HREF = "delete_bonds.html">delete_bonds</A> commands are used to do this. If the rigid bodies have strongly overlapping atoms, you may need to turn off these interactions to avoid numerical problems due to large equal/opposite intra-body forces swamping the contribution of small inter-body forces. </P> <P>For computational efficiency, you should typically define one fix rigid or fix rigid/small command which includes all the desired rigid bodies. LAMMPS will allow multiple rigid fixes to be defined, but it is more expensive. </P> <HR> <P>The constituent particles within a rigid body can be point particles (the default in LAMMPS) or finite-size particles, such as spheres or ellipsoids or line segments or triangles. See the <A HREF = "atom_style.html">atom_style sphere and ellipsoid and line and tri</A> commands for more details on these kinds of particles. Finite-size particles contribute differently to the moment of inertia of a rigid body than do point particles. Finite-size particles can also experience torque (e.g. due to <A HREF = "pair_gran.html">frictional granular interactions</A>) and have an orientation. These contributions are accounted for by these fixes. </P> <P>Forces between particles within a body do not contribute to the external force or torque on the body.
Thus for computational efficiency, you may wish to turn off pairwise and bond interactions between particles within each rigid body. The <A HREF = "neigh_modify.html">neigh_modify exclude</A> and <A HREF = "delete_bonds.html">delete_bonds</A> commands are used to do this. For finite-size particles this also means the particles can be highly overlapped when creating the rigid body. </P> <HR> <P>The <I>rigid</I> and <I>rigid/small</I> and <I>rigid/nve</I> styles perform constant NVE time integration. The only difference is that the <I>rigid</I> and <I>rigid/small</I> styles use an integration technique based on Richardson iterations. The <I>rigid/nve</I> style uses the methods described in the paper by <A HREF = "#Miller">Miller</A>, which are thought to provide better energy conservation than an iterative approach. </P> <P>The <I>rigid/nvt</I> and <I>rigid/nvt/small</I> styles performs constant NVT integration using a Nose/Hoover thermostat with chains as described originally in <A HREF = "#Hoover">(Hoover)</A> and <A HREF = "#Martyna">(Martyna)</A>, which thermostats both the translational and rotational degrees of freedom of the rigid bodies. The rigid-body algorithm used by <I>rigid/nvt</I> is described in the paper by <A HREF = "#Kamberaj">Kamberaj</A>. </P> <P>The <I>rigid/npt</I> and <I>rigid/nph</I> (and their /small counterparts) styles perform constant NPT or NPH integration using a Nose/Hoover barostat with chains. For the NPT case, the same Nose/Hoover thermostat is also used as with <I>rigid/nvt</I>. </P> <P>The barostat parameters are specified using one or more of the <I>iso</I>, <I>aniso</I>, <I>x</I>, <I>y</I>, <I>z</I> and <I>couple</I> keywords. These keywords give you the ability to specify 3 diagonal components of the external stress tensor, and to couple these components together so that the dimensions they represent are varied together during a constant-pressure simulation. The effects of these keywords are similar to those defined in <A HREF = "fix_nh.html">fix npt/nph</A> </P> <P>NOTE: Currently the <I>rigid/npt</I> and <I>rigid/nph</I> (and their /small counterparts) styles do not support triclinic (non-orthongonal) boxes. </P> <P>The target pressures for each of the 6 components of the stress tensor can be specified independently via the <I>x</I>, <I>y</I>, <I>z</I> keywords, which correspond to the 3 simulation box dimensions. For each component, the external pressure or tensor component at each timestep is a ramped value during the run from <I>Pstart</I> to <I>Pstop</I>. If a target pressure is specified for a component, then the corresponding box dimension will change during a simulation. For example, if the <I>y</I> keyword is used, the y-box length will change. A box dimension will not change if that component is not specified, although you have the option to change that dimension via the <A HREF = "fix_deform.html">fix deform</A> command. </P> <P>For all barostat keywords, the <I>Pdamp</I> parameter operates like the <I>Tdamp</I> parameter, determining the time scale on which pressure is relaxed. For example, a value of 10.0 means to relax the pressure in a timespan of (roughly) 10 time units (e.g. tau or fmsec or psec - see the <A HREF = "units.html">units</A> command). </P> <P>Regardless of what atoms are in the fix group (the only atoms which are time integrated), a global pressure or stress tensor is computed for all atoms. 
Similarly, when the size of the simulation box is changed, all atoms are re-scaled to new positions, unless the keyword <I>dilate</I> is specified with a <I>dilate-group-ID</I> for a group that represents a subset of the atoms. This can be useful, for example, to leave the coordinates of atoms in a solid substrate unchanged and controlling the pressure of a surrounding fluid. Another example is a system consisting of rigid bodies and point particles where the barostat is only coupled with the rigid bodies. This option should be used with care, since it can be unphysical to dilate some atoms and not others, because it can introduce large, instantaneous displacements between a pair of atoms (one dilated, one not) that are far from the dilation origin. </P> <P>The <I>couple</I> keyword allows two or three of the diagonal components of the pressure tensor to be "coupled" together. The value specified with the keyword determines which are coupled. For example, <I>xz</I> means the <I>Pxx</I> and <I>Pzz</I> components of the stress tensor are coupled. <I>Xyz</I> means all 3 diagonal components are coupled. Coupling means two things: the instantaneous stress will be computed as an average of the corresponding diagonal components, and the coupled box dimensions will be changed together in lockstep, meaning coupled dimensions will be dilated or contracted by the same percentage every timestep. The <I>Pstart</I>, <I>Pstop</I>, <I>Pdamp</I> parameters for any coupled dimensions must be identical. <I>Couple xyz</I> can be used for a 2d simulation; the <I>z</I> dimension is simply ignored. </P> <P>The <I>iso</I> and <I>aniso</I> keywords are simply shortcuts that are equivalent to specifying several other keywords together. </P> <P>The keyword <I>iso</I> means couple all 3 diagonal components together when pressure is computed (hydrostatic pressure), and dilate/contract the dimensions together. Using "iso Pstart Pstop Pdamp" is the same as specifying these 4 keywords: </P> <PRE>x Pstart Pstop Pdamp y Pstart Pstop Pdamp z Pstart Pstop Pdamp couple xyz </PRE> <P>The keyword <I>aniso</I> means <I>x</I>, <I>y</I>, and <I>z</I> dimensions are controlled independently using the <I>Pxx</I>, <I>Pyy</I>, and <I>Pzz</I> components of the stress tensor as the driving forces, and the specified scalar external pressure. Using "aniso Pstart Pstop Pdamp" is the same as specifying these 4 keywords: </P> <PRE>x Pstart Pstop Pdamp y Pstart Pstop Pdamp z Pstart Pstop Pdamp couple none </PRE> <HR> <P>The keyword/value option pairs are used in the following ways. </P> <P>The <I>langevin</I> and <I>temp</I> and <I>tparam</I> keywords perform thermostatting of the rigid bodies, altering both their translational and rotational degrees of freedom. What is meant by "temperature" of a collection of rigid bodies and how it can be monitored via the fix output is discussed below. </P> <P>The <I>langevin</I> keyword applies a Langevin thermostat to the constant NVE time integration performed by either the <I>rigid</I> or <I>rigid/small</I> or <I>rigid/nve</I> styles. It cannot be used with the <I>rigid/nvt</I> style. The desired temperature at each timestep is a ramped value during the run from <I>Tstart</I> to <I>Tstop</I>. The <I>Tdamp</I> parameter is specified in time units and determines how rapidly the temperature is relaxed. For example, a value of 100.0 means to relax the temperature in a timespan of (roughly) 100 time units (tau or fmsec or psec - see the <A HREF = "units.html">units</A> command). 
The random # <I>seed</I> must be a positive integer. </P> <P>The way that Langevin thermostatting operates is explained on the <A HREF = "fix_langevin.html">fix langevin</A> doc page. If you wish to simply viscously damp the rotational motion without thermostatting, you can set <I>Tstart</I> and <I>Tstop</I> to 0.0, which means only the viscous drag term in the Langevin thermostat will be applied. See the discussion on the <A HREF = "doc/fix_viscous.html">fix viscous</A> doc page for details. </P> <P>IMPORTANT NOTE: When the <I>langevin</I> keyword is used with fix rigid versus fix rigid/small, different dynamics will result for parallel runs. This is because of the way random numbers are used in the two cases. The dynamics for the two cases should be statistically similar, but will not be identical, even for a single timestep. </P> <P>The <I>temp</I> and <I>tparam</I> keywords apply a Nose/Hoover thermostat to the NVT time integration performed by the <I>rigid/nvt</I> style. They cannot be used with the <I>rigid</I> or <I>rigid/small</I> or <I>rigid/nve</I> styles. The desired temperature at each timestep is a ramped value during the run from <I>Tstart</I> to <I>Tstop</I>. The <I>Tdamp</I> parameter is specified in time units and determines how rapidly the temperature is relaxed. For example, a value of 100.0 means to relax the temperature in a timespan of (roughly) 100 time units (tau or fmsec or psec - see the <A HREF = "units.html">units</A> command). </P> <P>Nose/Hoover chains are used in conjunction with this thermostat. The <I>tparam</I> keyword can optionally be used to change the chain settings used. <I>Tchain</I> is the number of thermostats in the Nose Hoover chain. This value, along with <I>Tdamp</I> can be varied to dampen undesirable oscillations in temperature that can occur in a simulation. As a rule of thumb, increasing the chain length should lead to smaller oscillations. The keyword <I>pchain</I> specifies the number of thermostats in the chain thermostatting the barostat degrees of freedom. </P> <P>IMPORTANT NOTE: There are alternate ways to thermostat a system of rigid bodies. You can use <A HREF = "fix_langevin.html">fix langevin</A> to treat the individual particles in the rigid bodies as effectively immersed in an implicit solvent, e.g. a Brownian dynamics model. For hybrid systems with both rigid bodies and solvent particles, you can thermostat only the solvent particles that surround one or more rigid bodies by appropriate choice of groups in the compute and fix commands for temperature and thermostatting. The solvent interactions with the rigid bodies should then effectively thermostat the rigid body temperature as well without use of the Langevin or Nose/Hoover options associated with the fix rigid commands. </P> <HR> <P>The <I>mol</I> keyword can only be used with fix rigid/small. It should be used when other commands, such as <A HREF = "fix_deposit.html">fix deposit</A> or <A HREF = "fix_pour.html">fix pour</A>, add rigid bodies on-the-fly during a simulation. You specify a <I>template-ID</I> previously defined using the <A HREF = "molecule.html">molecule</A> command, which reads a file that defines the molecule. You must use the same <I>template-ID</I> that the command adding rigid bodies uses. The coordinates, atom types, atom diameters, center-of-mass, and moments of inertia can be specified in the molecule file. See the <A HREF = "molecule.html">molecule</A> command for details. 
The only settings required to be in this file are the coordinates and types of atoms in the molecule. </P> <HR> <P>The <I>infile</I> keyword allows a file of rigid body attributes to be read in from a file, rather then having LAMMPS compute them. There are 3 such attributes: the total mass of the rigid body, its center-of-mass position, and its 6 moments of inertia. For rigid bodies consisting of point particles or non-overlapping finite-size particles, LAMMPS can compute these values accurately. However, for rigid bodies consisting of finite-size particles which overlap each other, LAMMPS will ignore the overlaps when computing these 3 attributes. The amount of error this induces depends on the amount of overlap. To avoid this issue, the values can be pre-computed (e.g. using Monte Carlo integration). </P> <P>The format of the file is as follows. Note that the file does not have to list attributes for every rigid body integrated by fix rigid. Only bodies which the file specifies will have their computed attributes overridden. The file can contain initial blank lines or comment lines starting with "#" which are ignored. The first non-blank, non-comment line should list N = the number of lines to follow. The N successive lines contain the following information: </P> <PRE>ID1 masstotal xcm ycm zcm ixx iyy izz ixy ixz iyz ID2 masstotal xcm ycm zcm ixx iyy izz ixy ixz iyz ... IDN masstotal xcm ycm zcm ixx iyy izz ixy ixz iyz </PRE> <P>The rigid body IDs are all positive integers. For the <I>single</I> bodystyle, only an ID of 1 can be used. For the <I>group</I> bodystyle, IDs from 1 to Ng can be used where Ng is the number of specified groups. For the <I>molecule</I> bodystyle, use the molecule ID for the atoms in a specific rigid body as the rigid body ID. </P> <P>The masstotal and center-of-mass coordinates (xcm,ycm,zcm) are self-explanatory. The center-of-mass should be consistent with what is calculated for the position of the rigid body with all its atoms unwrapped by their respective image flags. If this produces a center-of-mass that is outside the simulation box, LAMMPS wraps it back into the box. The 6 moments of inertia (ixx,iyy,izz,ixy,ixz,iyz) should be the values consistent with the current orientation of the rigid body around its center of mass. The values are with respect to the simulation box XYZ axes, not with respect to the prinicpal axes of the rigid body itself. LAMMPS performs the latter calculation internally. </P> <P>IMPORTANT NOTE: If you use the <I>infile</I> keyword and write restart files during a simulation, then each time a restart file is written, the fix also write an auxiliary restart file with the name rfile.rigid, where "rfile" is the name of the restart file, e.g. tmp.restart.10000 and tmp.restart.10000.rigid. This auxiliary file is in the same format described above and contains info on the current center-of-mass and 6 moments of inertia. Thus it can be used in a new input script that restarts the run and re-specifies a rigid fix using an <I>infile</I> keyword and the appropriate filename. Note that the auxiliary file will contain one line for every rigid body, even if the original file only listed a subset of the rigid bodies. </P> <P>IMPORTANT NOTE: If you are using fix rigid/small and defining a system that has no rigid bodies initially, because they will be added on-the-fly by commands such as <A HREF = "fix_deposit.html">fix deposit</A> or <A HREF = "fix_pour.html">fix pour</A>, you may still wish to use the <I>infile</I> keyword. 
This is so that restart files written during the simulation will output an auxiliary restart file as described above with information on the new rigid bodies. In this case the initial <I>infile</I> file should use N = 0. </P> <HR> <P>If you use a <A HREF = "compute.html">temperature compute</A> with a group that includes particles in rigid bodies, the degrees-of-freedom removed by each rigid body are accounted for in the temperature (and pressure) computation, but only if the temperature group includes all the particles in a particular rigid body. </P> <P>A 3d rigid body has 6 degrees of freedom (3 translational, 3 rotational), except for a collection of point particles lying on a straight line, which has only 5, e.g a dimer. A 2d rigid body has 3 degrees of freedom (2 translational, 1 rotational). </P> <P>IMPORTANT NOTE: You may wish to explicitly subtract additional degrees-of-freedom if you use the <I>force</I> and <I>torque</I> keywords to eliminate certain motions of one or more rigid bodies. LAMMPS does not do this automatically. </P> <P>The rigid body contribution to the pressure of the system (virial) is also accounted for by this fix. </P> <P>IMPORTANT NOTE: The periodic image flags of atoms in rigid bodies are altered so that the rigid body can be reconstructed correctly when it straddles periodic boundaries. The atom image flags are not incremented/decremented as they would be for non-rigid atoms as the rigid body crosses periodic boundaries. Specifically, they are set so that the center-of-mass (COM) of the rigid body always remains inside the simulation box. </P> <P>This means that if you output per-atom image flags you cannot interpret them as you normally would. I.e. the image flag values written to a <A HREF = "dump.html">dump file</A> will be different than they would be if the atoms were not in a rigid body. Likewise the <A HREF = "compute_msd.html">compute msd</A> will not compute the expected mean-squared displacement for such atoms if the body moves across periodic boundaries. It also means that if you have bonds between a pair of rigid bodies and the bond straddles a periodic boundary, you cannot use the <A HREF = "replicate.html">replicate</A> command to increase the system size. </P> <P>Here are details on how, you can post-process a dump file to calculate a diffusion coefficient for rigid bodies, using the altered per-atom image flags written to a dump file. The image flags for atoms in the same rigid body can be used to unwrap the body and calculate its center-of-mass (COM). As mentioned above, this COM will always be inside the simulation box. Thus it will "jump" from one side of the box to the other when the COM crosses a periodic boundary. If you keep track of the jumps, you can effectively "unwrap" the COM and use that value to track the displacement of each rigid body, and thus the mean-squared displacement (MSD) of an ensemble of bodies, and thus a diffusion coefficient. </P> <P>Note that fix rigid does define image flags for each rigid body, which are incremented when the center-of-mass of the rigid body crosses a periodic boundary in the usual way. These image flags have the same meaning as atom images (see the "dump" command) and can be accessed and output as described below. </P> <HR> <P>If your simlulation is a hybrid model with a mixture of rigid bodies and non-rigid particles (e.g. 
solvent) there are several ways these rigid fixes can be used in tandem with <A HREF = "fix_nve.html">fix nve</A>, <A HREF = "fix_nh.html">fix nvt</A>, <A HREF = "fix_nh.html">fix npt</A>, and <A HREF = "fix_nh.html">fix nph</A>. </P> <P>If you wish to perform NVE dynamics (no thermostatting or barostatting), use fix rigid or fix rigid/nve to integrate the rigid bodies, and <A HREF = "fix_nve.html">fix nve</A> to integrate the non-rigid particles. </P> <P>If you wish to perform NVT dynamics (thermostatting, but no barostatting), you can use fix rigid/nvt for the rigid bodies, and any thermostatting fix for the non-rigid particles (<A HREF = "fix_nh.html">fix nvt</A>, <A HREF = "fix_langevin.html">fix langevin</A>, <A HREF = "fix_temp_berendsen.html">fix temp/berendsen</A>). You can also use fix rigid or fix rigid/nve for the rigid bodies and thermostat them using <A HREF = "fix_langevin.html">fix langevin</A> on the group that contains all the particles in the rigid bodies. The net force added by <A HREF = "fix_langevin.html">fix langevin</A> to each rigid body effectively thermostats its translational center-of-mass motion. It is less clear how well this thermostats the rotational motion of the bodies. </P> <P>If you wish to perform NPT or NPH dynamics (barostatting), you cannot use both <A HREF = "fix_nh.html">fix npt</A> and fix rigid/npt (or the nph variants). This is because there can only be one fix which monitors the global pressure and changes the simulation box dimensions. So you have 3 choices: </P> <UL><LI>Use fix rigid/npt for the rigid bodies. Use the <I>dilate</I> all option so that it will dilate the positions of the non-rigid particles as well. Use <A HREF = "fix_nh.html">fix nvt</A> (or any other thermostat) for the non-rigid particles. <LI>Use <A HREF = "fix_nh.html">fix npt</A> for the group of non-rigid particles. Use the <I>dilate</I> all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use fix rigid/nvt for the rigid bodies. <LI>Use <A HREF = "fix_press_berendsen.html">fix press/berendsen</A> to compute the pressure and change the box dimensions. Use fix rigid/nvt for the rigid bodies. Use <A HREF = "fix_nh.html">fix nvt</A> (or any other thermostat) for the non-rigid particles. </UL> <P>In all cases, the rigid bodies and non-rigid particles both contribute to the global pressure and the box is scaled the same by any of the barostatting fixes. </P> <P>You could even use the 2nd and 3rd options for a non-hybrid simulation consisting of only rigid bodies, assuming you give <A HREF = "fix_nh.html">fix npt</A> an empty group, though it's an odd thing to do. The barostatting fixes (<A HREF = "fix_nh.html">fix npt</A> and <A HREF = "fix_press_berendsen.html">fix press/berendsen</A>) will monitor the pressure and change the box dimensions, but not time integrate any particles. The integration of the rigid bodies will be performed by fix rigid/nvt. </P> <HR> <P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally the same as the corresponding style without the suffix. They have been optimized to run faster, depending on your available hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual. The accelerated styles take the same arguments and should produce the same results, except for round-off and precision issues. </P> <P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT packages, respectively.
They are only enabled if LAMMPS was built with those packages. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. </P> <P>You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A> when you invoke LAMMPS, or you can use the <A HREF = "suffix.html">suffix</A> command in your input script. </P> <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for more instructions on how to use the accelerated styles effectively. </P> <HR> <P><B>Restart, fix_modify, output, run start/stop, minimize info:</B> </P> <P>No information about the <I>rigid</I> and <I>rigid/small</I> and <I>rigid/nve</I> fixes are written to <A HREF = "restart.html">binary restart files</A>. For style <I>rigid/nvt</I> the state of the Nose/Hoover thermostat is written to <A HREF = "restart.html">binary restart files</A>. See the <A HREF = "read_restart.html">read_restart</A> command for info on how to re-specify a fix in an input script that reads a restart file, so that the operation of the fix continues in an uninterrupted fashion. </P> <P>The <A HREF = "fix_modify.html">fix_modify</A> <I>energy</I> option is supported by the rigid/nvt fix to add the energy change induced by the thermostatting to the system's potential energy as part of <A HREF = "thermo_style.html">thermodynamic output</A>. </P> <P>The <A HREF = "fix_modify.html">fix_modify</A> <I>temp</I> and <I>press</I> options are supported by the rigid/npt and rigid/nph fixes to change the computes used to calculate the instantaneous pressure tensor. Note that the rigid/nvt fix does not use any external compute to compute instantaneous temperature. </P> <P>The <I>rigid</I> and <I>rigid/small</I> and <I>rigid/nve</I> fixes compute a global scalar which can be accessed by various <A HREF = "Section_howto.html#howto_15">output commands</A>. The scalar value calculated by these fixes is "intensive". The scalar is the current temperature of the collection of rigid bodies. This is averaged over all rigid bodies and their translational and rotational degrees of freedom. The translational energy of a rigid body is 1/2 m v^2, where m = total mass of the body and v = the velocity of its center of mass. The rotational energy of a rigid body is 1/2 I w^2, where I = the moment of inertia tensor of the body and w = its angular velocity. Degrees of freedom constrained by the <I>force</I> and <I>torque</I> keywords are removed from this calculation, but only for the <I>rigid</I> and <I>rigid/nve</I> fixes. </P> <P>The <I>rigid/nvt</I>, <I>rigid/npt</I>, and <I>rigid/nph</I> fixes compute a global scalar which can be accessed by various <A HREF = "Section_howto.html#howto_15">output commands</A>. The scalar value calculated by these fixes is "extensive". The scalar is the cumulative energy change due to the thermostatting and barostatting the fix performs. </P> <P>All of the <I>rigid</I> fixes except <I>rigid/small</I> compute a global array of values which can be accessed by various <A HREF = "Section_howto.html#howto_15">output commands</A>. The number of rows in the array is equal to the number of rigid bodies. The number of columns is 15. 
Thus for each rigid body, 15 values are stored: the xyz coords of the center of mass (COM), the xyz components of the COM velocity, the xyz components of the force acting on the COM, the xyz components of the torque acting on the COM, and the xyz image flags of the COM, which have the same meaning as image flags for atom positions (see the "dump" command). The force and torque values in the array are not affected by the <I>force</I> and <I>torque</I> keywords in the fix rigid command; they reflect values before any changes are made by those keywords. </P> <P>The ordering of the rigid bodies (by row in the array) is as follows. For the <I>single</I> keyword there is just one rigid body. For the <I>molecule</I> keyword, the bodies are ordered by ascending molecule ID. For the <I>group</I> keyword, the list of group IDs determines the ordering of bodies. </P> <P>The array values calculated by these fixes are "intensive", meaning they are independent of the number of atoms in the simulation. </P> <P>No parameter of these fixes can be used with the <I>start/stop</I> keywords of the <A HREF = "run.html">run</A> command. These fixes are not invoked during <A HREF = "minimize.html">energy minimization</A>. </P> <HR> <P><B>Restrictions:</B> </P> <P>These fixes are all part of the RIGID package. It is only enabled if LAMMPS was built with that package. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. </P> +<P>Assigning a temperature via the <A HREF = "velocity.html">velocity create</A> +command to a system with <A HREF = "fix_rigid.html">rigid bodies</A> may not have +the desired outcome for two reasons. First, the velocity command can +be invoked before the rigid-body fix is invoked or initialized and the +number of adjusted degrees of freedom (DOFs) is known. Thus it is not +possible to compute the target temperature correctly. Second, the +assigned velocities may be partially canceled when constraints are +first enforced, leading to a different temperature than desired. A +workaround for this is to perform a <A HREF = "run.html">run 0</A> command, which +insures all DOFs are accounted for properly, and then rescale the +temperature to the desired value before performing a simulation. For +example: +</P> +<PRE>velocity all create 300.0 12345 +run 0 # temperature may not be 300K +velocity all scale 300.0 # now it should be +</PRE> <P><B>Related commands:</B> </P> <P><A HREF = "delete_bonds.html">delete_bonds</A>, <A HREF = "neigh_modify.html">neigh_modify</A> -exclude +exclude, <A HREF = "fix_shake.html">fix shake</A> </P> <P><B>Default:</B> </P> <P>The option defaults are force * on on on and torque * on on on, meaning all rigid bodies are acted on by center-of-mass force and torque. Also Tchain = Pchain = 10, Titer = 1, Torder = 3. </P> <HR> <A NAME = "Hoover"></A> <P><B>(Hoover)</B> Hoover, Phys Rev A, 31, 1695 (1985). </P> <A NAME = "Kamberaj"></A> <P><B>(Kamberaj)</B> Kamberaj, Low, Neal, J Chem Phys, 122, 224114 (2005). </P> <A NAME = "Martyna"></A> <P><B>(Martyna)</B> Martyna, Klein, Tuckerman, J Chem Phys, 97, 2635 (1992); Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117. </P> <A NAME = "Miller"></A> <P><B>(Miller)</B> Miller, Eleftheriou, Pattnaik, Ndirango, and Newns, J Chem Phys, 116, 8649 (2002). </P> <A NAME = "Zhang"></A> <P><B>(Zhang)</B> Zhang, Glotzer, Nanoletters, 4, 1407-1413 (2004). 
</P> </HTML> diff --git a/doc/fix_rigid.txt b/doc/fix_rigid.txt index 66a6c7bd6..3d0fc4afc 100644 --- a/doc/fix_rigid.txt +++ b/doc/fix_rigid.txt @@ -1,777 +1,794 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line fix rigid command :h3 fix rigid/nve command :h3 fix rigid/nvt command :h3 fix rigid/npt command :h3 fix rigid/nph command :h3 fix rigid/small command :h3 fix rigid/nve/small command :h3 fix rigid/nvt/small command :h3 fix rigid/npt/small command :h3 fix rigid/nph/small command :h3 [Syntax:] fix ID group-ID style bodystyle args keyword values ... :pre ID, group-ID are documented in "fix"_fix.html command :ulb,l style = {rigid} or {rigid/nve} or {rigid/nvt} or {rigid/npt} or {rigid/nph} or {rigid/small} or {rigid/nve/small} or {rigid/nvt/small} or {rigid/npt/small} or {rigid/nph/small} :l bodystyle = {single} or {molecule} or {group} :l {single} args = none {molecule} args = none {group} args = N groupID1 groupID2 ... N = # of groups groupID1, groupID2, ... = list of N group IDs :pre zero or more keyword/value pairs may be appended :l keyword = {langevin} or {temp} or {iso} or {aniso} or {x} or {y} or {z} or {couple} or {tparam} or {pchain} or {dilate} or {force} or {torque} or {infile} :l {langevin} values = Tstart Tstop Tperiod seed Tstart,Tstop = desired temperature at start/stop of run (temperature units) Tdamp = temperature damping parameter (time units) seed = random number seed to use for white noise (positive integer) {temp} values = Tstart Tstop Tdamp Tstart,Tstop = desired temperature at start/stop of run (temperature units) Tdamp = temperature damping parameter (time units) {iso} or {aniso} values = Pstart Pstop Pdamp Pstart,Pstop = scalar external pressure at start/end of run (pressure units) Pdamp = pressure damping parameter (time units) {x} or {y} or {z} values = Pstart Pstop Pdamp Pstart,Pstop = external stress tensor component at start/end of run (pressure units) Pdamp = stress damping parameter (time units) {couple} = {none} or {xyz} or {xy} or {yz} or {xz} {tparam} values = Tchain Titer Torder Tchain = length of Nose/Hoover thermostat chain Titer = number of thermostat iterations performed Torder = 3 or 5 = Yoshida-Suzuki integration parameters {pchain} values = Pchain Pchain = length of the Nose/Hoover thermostat chain coupled with the barostat {dilate} value = dilate-group-ID dilate-group-ID = only dilate atoms in this group due to barostat volume changes {force} values = M xflag yflag zflag M = which rigid body from 1-Nbody (see asterisk form below) xflag,yflag,zflag = off/on if component of center-of-mass force is active {torque} values = M xflag yflag zflag M = which rigid body from 1-Nbody (see asterisk form below) xflag,yflag,zflag = off/on if component of center-of-mass torque is active {infile} filename filename = file with per-body values of mass, center-of-mass, moments of inertia {mol} value = template-ID template-ID = ID of molecule template specified in a separate "molecule"_molecule.html command :pre :ule [Examples:] fix 1 clump rigid single fix 1 clump rigid/small molecule fix 1 clump rigid single force 1 off off on langevin 1.0 1.0 1.0 428984 fix 1 polychains rigid/nvt molecule temp 1.0 1.0 5.0 fix 1 polychains rigid molecule force 1*5 off off off force 6*10 off off on fix 1 polychains rigid/small molecule langevin 1.0 1.0 1.0 428984 fix 2 fluid rigid group 3 clump1 clump2 clump3 torque * off off off fix 1 rods 
rigid/npt molecule temp 300.0 300.0 100.0 iso 0.5 0.5 10.0 fix 1 particles rigid/npt molecule temp 1.0 1.0 5.0 x 0.5 0.5 1.0 z 0.5 0.5 1.0 couple xz fix 1 water rigid/nph molecule iso 0.5 0.5 1.0 fix 1 particles rigid/npt/small molecule temp 1.0 1.0 1.0 iso 0.5 0.5 1.0 :pre [Description:] Treat one or more sets of atoms as independent rigid bodies. This means that each timestep the total force and torque on each rigid body is computed as the sum of the forces and torques on its constituent particles. The coordinates, velocities, and orientations of the atoms in each body are then updated so that the body moves and rotates as a single entity. Examples of large rigid bodies are a colloidal particle, or portions of a biomolecule such as a protein. Examples of small rigid bodies are patchy nanoparticles, such as those modeled in "this paper"_#Zhang by Sharon Glotzer's group, clumps of granular particles, lipid molecules consisting of one or more point dipoles connected to other spheroids or ellipsoids, irregular particles built from line segments (2d) or triangles (3d), and coarse-grain models of nano or colloidal particles consisting of a small number of constituent particles. Note that the "fix shake"_fix_shake.html command can also be used to rigidify small molecules of 2, 3, or 4 atoms, e.g. water molecules. That fix treats the constituent atoms as point masses. These fixes also update the positions and velocities of the atoms in each rigid body via time integration, in the NVE, NVT, NPT, or NPH ensemble, as described below. There are two main variants of this fix, fix rigid and fix rigid/small. The NVE/NVT/NPT/NPH versions belong to one of the two variants, as their style names indicate. IMPORTANT NOTE: Not all of the bodystyle options and keyword/value options are available for both the {rigid} and {rigid/small} variants. See details below. The {rigid} variant is typically the best choice for a system with a small number of large rigid bodies, each of which can extend across the domain of many processors. It operates by creating a single global list of rigid bodies, which all processors contribute to. MPI_Allreduce operations are performed each timestep to sum the contributions from each processor to the force and torque on all the bodies. This operation will not scale well in parallel if large numbers of rigid bodies are simulated. The {rigid/small} variant is typically best for a system with a large number of small rigid bodies. Each body is assigned to the atom closest to the geometrical center of the body. The fix operates using local lists of rigid bodies owned by each processor and information is exchanged and summed via local communication between neighboring processors when ghost atom info is accumulated. IMPORTANT NOTE: To use {rigid/small} the ghost atom cutoff must be large enough to span the distance between the atom that owns the body and every other atom in the body. This distance value is printed out when the rigid bodies are defined. If the "pair_style"_pair_style.html cutoff plus neighbor skin does not span this distance, then you should use the "communicate cutoff"_communicate.html command with a setting epsilon larger than the distance. Which of the two variants is faster for a particular problem is hard to predict. The best way to decide is to perform a short test run. Both variants should give identical numerical answers for short runs. Long runs should give statistically similar results, but round-off differences may accumulate to produce divergent trajectories.
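+For example, a minimal sketch of using {rigid/small} together with an enlarged ghost cutoff (the group name and the 12.0 cutoff are placeholders; the required distance is printed when the bodies are defined):
+
+fix 1 clumps rigid/small molecule
+communicate single cutoff 12.0 :pre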
IMPORTANT NOTE: You should not update the atoms in rigid bodies via other time-integration fixes (e.g. "fix nve"_fix_nve.html, "fix nvt"_fix_nvt.html, "fix npt"_fix_npt.html), or you will be integrating their motion more than once each timestep. When performing a hybrid simulation with some atoms in rigid bodies, and some not, a separate time integration fix like "fix nve"_fix_nve.html or "fix nvt"_fix_nh.html should be used for the non-rigid particles. IMPORTANT NOTE: These fixes are overkill if you simply want to hold a collection of atoms stationary or have them move with a constant velocity. A simpler way to hold atoms stationary is to not include those atoms in your time integration fix. E.g. use "fix 1 mobile nve" instead of "fix 1 all nve", where "mobile" is the group of atoms that you want to move. You can move atoms with a constant velocity by assigning them an initial velocity (via the "velocity"_velocity.html command), setting the force on them to 0.0 (via the "fix setforce"_fix_setforce.html command), and integrating them as usual (e.g. via the "fix nve"_fix_nve.html command). IMPORTANT NOTE: The aggregate properties of each rigid body are calculated at the start of each simulation run. These include its center of mass, moments of inertia, and net velocity and angular momentum. This means that before or between runs, per-atom properties can be changed, e.g. via the "set"_set.html or "velocity"_velocity.html command, which will affect the bodies. An exception is if the {infile} keyword is used, then all the body properties (except net velocity and angular momentum) are only calculated once so that values from the file are valid. :line Each rigid body must have two or more atoms. An atom can belong to at most one rigid body. Which atoms are in which bodies can be defined via several options. IMPORTANT NOTE: With fix rigid/small, which requires bodystyle {molecule}, you can define a system that has no rigid bodies initially. This is useful when you are adding rigid bodies on-the-fly via commands such as "fix deposit"_fix_deposit.html or "fix pour"_fix_pour.html. For bodystyle {single} the entire fix group of atoms is treated as one rigid body. This option is only allowed for fix rigid and its sub-styles. For bodystyle {molecule}, each set of atoms in the fix group with a different molecule ID is treated as a rigid body. This option is allowed for fix rigid and fix rigid/small, and their sub-styles. Note that atoms with a molecule ID = 0 will be treated as a single rigid body. For a system with atomic solvent (typically this is atoms with molecule ID = 0) surrounding rigid bodies, this may not be what you want. Thus you should be careful to use a fix group that only includes atoms you want to be part of rigid bodies. For bodystyle {group}, each of the listed groups is treated as a separate rigid body. Only atoms that are also in the fix group are included in each rigid body. This option is only allowed for fix rigid and its sub-styles. IMPORTANT NOTE: To compute the initial center-of-mass position and other properties of each rigid body, the image flags for each atom in the body are used to "unwrap" the atom coordinates. Thus you must insure that these image flags are consistent so that the unwrapping creates a valid rigid body (one where the atoms are close together), particularly if the atoms in a single rigid body straddle a periodic boundary. 
This means the input data file or restart file must define the image flags for each atom consistently, or you must have used the "set"_set.html command to specify them correctly. If a dimension is non-periodic then the image flag of each atom must be 0 in that dimension, else an error is generated. The {force} and {torque} keywords discussed next are only allowed for fix rigid and its sub-styles. By default, each rigid body is acted on by other atoms which induce an external force and torque on its center of mass, causing it to translate and rotate. Components of the external center-of-mass force and torque can be turned off by the {force} and {torque} keywords. This may be useful if you wish a body to rotate but not translate, or vice versa, or if you wish it to rotate or translate continuously unaffected by interactions with other particles. Note that if you expect a rigid body not to move or rotate by using these keywords, you must insure its initial center-of-mass translational or angular velocity is 0.0. Otherwise the initial translational or angular momentum the body has will persist. An xflag, yflag, or zflag set to {off} means turn off the component of force or torque in that dimension. A setting of {on} means turn on the component, which is the default. Which rigid body(s) the settings apply to is determined by the first argument of the {force} and {torque} keywords. It can be an integer M from 1 to Nbody, where Nbody is the number of rigid bodies defined. A wild-card asterisk can be used in place of, or in conjunction with, the M argument to set the flags for multiple rigid bodies. This takes the form "*" or "*n" or "n*" or "m*n". If N = the number of rigid bodies, then an asterisk with no numeric values means all bodies from 1 to N. A leading asterisk means all bodies from 1 to n (inclusive). A trailing asterisk means all bodies from n to N (inclusive). A middle asterisk means all bodies from m to n (inclusive). Note that you can use the {force} or {torque} keywords as many times as you like. If a particular rigid body has its component flags set multiple times, the settings from the final keyword are used. IMPORTANT NOTE: For computational efficiency, you may wish to turn off pairwise and bond interactions within each rigid body, as they no longer contribute to the motion. The "neigh_modify exclude"_neigh_modify.html and "delete_bonds"_delete_bonds.html commands are used to do this. If the rigid bodies have strongly overlapping atoms, you may need to turn off these interactions to avoid numerical problems due to large equal/opposite intra-body forces swamping the contribution of small inter-body forces. For computational efficiency, you should typically define one fix rigid or fix rigid/small command which includes all the desired rigid bodies. LAMMPS will allow multiple rigid fixes to be defined, but it is more expensive. :line The constituent particles within a rigid body can be point particles (the default in LAMMPS) or finite-size particles, such as spheres or ellipsoids or line segments or triangles. See the "atom_style sphere and ellipsoid and line and tri"_atom_style.html commands for more details on these kinds of particles. Finite-size particles contribute differently to the moment of inertia of a rigid body than do point particles. Finite-size particles can also experience torque (e.g. due to "frictional granular interactions"_pair_gran.html) and have an orientation. These contributions are accounted for by these fixes.
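+To illustrate the wild-card forms of the {force} and {torque} keywords described earlier, the following sketch (the group name and body numbers are hypothetical) turns off only the z-component of torque on bodies 1-3 and all torque components on the remaining bodies:
+
+fix 1 clumps rigid molecule torque 1*3 on on off torque 4* off off off :pre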
Forces between particles within a body do not contribute to the external force or torque on the body. Thus for computational efficiency, you may wish to turn off pairwise and bond interactions between particles within each rigid body. The "neigh_modify exclude"_neigh_modify.html and "delete_bonds"_delete_bonds.html commands are used to do this. For finite-size particles this also means the particles can be highly overlapped when creating the rigid body. :line The {rigid} and {rigid/small} and {rigid/nve} styles perform constant NVE time integration. The only difference is that the {rigid} and {rigid/small} styles use an integration technique based on Richardson iterations. The {rigid/nve} style uses the methods described in the paper by "Miller"_#Miller, which are thought to provide better energy conservation than an iterative approach. The {rigid/nvt} and {rigid/nvt/small} styles perform constant NVT integration using a Nose/Hoover thermostat with chains as described originally in "(Hoover)"_#Hoover and "(Martyna)"_#Martyna, which thermostats both the translational and rotational degrees of freedom of the rigid bodies. The rigid-body algorithm used by {rigid/nvt} is described in the paper by "Kamberaj"_#Kamberaj. The {rigid/npt} and {rigid/nph} (and their /small counterparts) styles perform constant NPT or NPH integration using a Nose/Hoover barostat with chains. For the NPT case, the same Nose/Hoover thermostat is also used as with {rigid/nvt}. The barostat parameters are specified using one or more of the {iso}, {aniso}, {x}, {y}, {z} and {couple} keywords. These keywords give you the ability to specify 3 diagonal components of the external stress tensor, and to couple these components together so that the dimensions they represent are varied together during a constant-pressure simulation. The effects of these keywords are similar to those defined in "fix npt/nph"_fix_nh.html. NOTE: Currently the {rigid/npt} and {rigid/nph} (and their /small counterparts) styles do not support triclinic (non-orthogonal) boxes. The target pressures for each of the 6 components of the stress tensor can be specified independently via the {x}, {y}, {z} keywords, which correspond to the 3 simulation box dimensions. For each component, the external pressure or tensor component at each timestep is a ramped value during the run from {Pstart} to {Pstop}. If a target pressure is specified for a component, then the corresponding box dimension will change during a simulation. For example, if the {y} keyword is used, the y-box length will change. A box dimension will not change if that component is not specified, although you have the option to change that dimension via the "fix deform"_fix_deform.html command. For all barostat keywords, the {Pdamp} parameter operates like the {Tdamp} parameter, determining the time scale on which pressure is relaxed. For example, a value of 10.0 means to relax the pressure in a timespan of (roughly) 10 time units (e.g. tau or fmsec or psec - see the "units"_units.html command). Regardless of what atoms are in the fix group (the only atoms which are time integrated), a global pressure or stress tensor is computed for all atoms. Similarly, when the size of the simulation box is changed, all atoms are re-scaled to new positions, unless the keyword {dilate} is specified with a {dilate-group-ID} for a group that represents a subset of the atoms.
This can be useful, for example, to leave the coordinates of atoms in a solid substrate unchanged while controlling the pressure of a surrounding fluid. Another example is a system consisting of rigid bodies and point particles where the barostat is only coupled with the rigid bodies. This option should be used with care, since it can be unphysical to dilate some atoms and not others, because it can introduce large, instantaneous displacements between a pair of atoms (one dilated, one not) that are far from the dilation origin. The {couple} keyword allows two or three of the diagonal components of the pressure tensor to be "coupled" together. The value specified with the keyword determines which are coupled. For example, {xz} means the {Pxx} and {Pzz} components of the stress tensor are coupled. {xyz} means all 3 diagonal components are coupled. Coupling means two things: the instantaneous stress will be computed as an average of the corresponding diagonal components, and the coupled box dimensions will be changed together in lockstep, meaning coupled dimensions will be dilated or contracted by the same percentage every timestep. The {Pstart}, {Pstop}, {Pdamp} parameters for any coupled dimensions must be identical. A {couple xyz} setting can be used for a 2d simulation; the {z} dimension is simply ignored. The {iso} and {aniso} keywords are simply shortcuts that are equivalent to specifying several other keywords together. The keyword {iso} means couple all 3 diagonal components together when pressure is computed (hydrostatic pressure), and dilate/contract the dimensions together. Using "iso Pstart Pstop Pdamp" is the same as specifying these 4 keywords: x Pstart Pstop Pdamp y Pstart Pstop Pdamp z Pstart Pstop Pdamp couple xyz :pre The keyword {aniso} means {x}, {y}, and {z} dimensions are controlled independently using the {Pxx}, {Pyy}, and {Pzz} components of the stress tensor as the driving forces, and the specified scalar external pressure. Using "aniso Pstart Pstop Pdamp" is the same as specifying these 4 keywords: x Pstart Pstop Pdamp y Pstart Pstop Pdamp z Pstart Pstop Pdamp couple none :pre :line The keyword/value option pairs are used in the following ways. The {langevin} and {temp} and {tparam} keywords perform thermostatting of the rigid bodies, altering both their translational and rotational degrees of freedom. What is meant by "temperature" of a collection of rigid bodies and how it can be monitored via the fix output is discussed below. The {langevin} keyword applies a Langevin thermostat to the constant NVE time integration performed by either the {rigid} or {rigid/small} or {rigid/nve} styles. It cannot be used with the {rigid/nvt} style. The desired temperature at each timestep is a ramped value during the run from {Tstart} to {Tstop}. The {Tdamp} parameter is specified in time units and determines how rapidly the temperature is relaxed. For example, a value of 100.0 means to relax the temperature in a timespan of (roughly) 100 time units (tau or fmsec or psec - see the "units"_units.html command). The random # {seed} must be a positive integer. The way that Langevin thermostatting operates is explained on the "fix langevin"_fix_langevin.html doc page. If you wish to simply viscously damp the rotational motion without thermostatting, you can set {Tstart} and {Tstop} to 0.0, which means only the viscous drag term in the Langevin thermostat will be applied. See the discussion on the "fix viscous"_fix_viscous.html doc page for details.
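+As a concrete sketch of the viscous-damping case just mentioned (the group name, damping time, and random seed are placeholders), setting {Tstart} = {Tstop} = 0.0 applies only the drag term:
+
+fix 1 clumps rigid/nve molecule langevin 0.0 0.0 10.0 428984 :pre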
IMPORTANT NOTE: When the {langevin} keyword is used with fix rigid versus fix rigid/small, different dynamics will result for parallel runs. This is because of the way random numbers are used in the two cases. The dynamics for the two cases should be statistically similar, but will not be identical, even for a single timestep. The {temp} and {tparam} keywords apply a Nose/Hoover thermostat to the NVT time integration performed by the {rigid/nvt} style. They cannot be used with the {rigid} or {rigid/small} or {rigid/nve} styles. The desired temperature at each timestep is a ramped value during the run from {Tstart} to {Tstop}. The {Tdamp} parameter is specified in time units and determines how rapidly the temperature is relaxed. For example, a value of 100.0 means to relax the temperature in a timespan of (roughly) 100 time units (tau or fmsec or psec - see the "units"_units.html command). Nose/Hoover chains are used in conjunction with this thermostat. The {tparam} keyword can optionally be used to change the chain settings used. {Tchain} is the number of thermostats in the Nose/Hoover chain. This value, along with {Tdamp}, can be varied to dampen undesirable oscillations in temperature that can occur in a simulation. As a rule of thumb, increasing the chain length should lead to smaller oscillations. The keyword {pchain} specifies the number of thermostats in the chain thermostatting the barostat degrees of freedom. IMPORTANT NOTE: There are alternate ways to thermostat a system of rigid bodies. You can use "fix langevin"_fix_langevin.html to treat the individual particles in the rigid bodies as effectively immersed in an implicit solvent, e.g. a Brownian dynamics model. For hybrid systems with both rigid bodies and solvent particles, you can thermostat only the solvent particles that surround one or more rigid bodies by appropriate choice of groups in the compute and fix commands for temperature and thermostatting. The solvent interactions with the rigid bodies should then effectively thermostat the rigid body temperature as well without use of the Langevin or Nose/Hoover options associated with the fix rigid commands. :line The {mol} keyword can only be used with fix rigid/small. It should be used when other commands, such as "fix deposit"_fix_deposit.html or "fix pour"_fix_pour.html, add rigid bodies on-the-fly during a simulation. You specify a {template-ID} previously defined using the "molecule"_molecule.html command, which reads a file that defines the molecule. You must use the same {template-ID} that the command adding rigid bodies uses. The coordinates, atom types, atom diameters, center-of-mass, and moments of inertia can be specified in the molecule file. See the "molecule"_molecule.html command for details. The only settings required to be in this file are the coordinates and types of atoms in the molecule. :line The {infile} keyword allows a file of rigid body attributes to be read in, rather than having LAMMPS compute them. There are 3 such attributes: the total mass of the rigid body, its center-of-mass position, and its 6 moments of inertia. For rigid bodies consisting of point particles or non-overlapping finite-size particles, LAMMPS can compute these values accurately. However, for rigid bodies consisting of finite-size particles which overlap each other, LAMMPS will ignore the overlaps when computing these 3 attributes. The amount of error this induces depends on the amount of overlap. To avoid this issue, the values can be pre-computed (e.g.
using Monte Carlo integration). The format of the file is as follows. Note that the file does not have to list attributes for every rigid body integrated by fix rigid. Only bodies which the file specifies will have their computed attributes overridden. The file can contain initial blank lines or comment lines starting with "#" which are ignored. The first non-blank, non-comment line should list N = the number of lines to follow. The N successive lines contain the following information: ID1 masstotal xcm ycm zcm ixx iyy izz ixy ixz iyz ID2 masstotal xcm ycm zcm ixx iyy izz ixy ixz iyz ... IDN masstotal xcm ycm zcm ixx iyy izz ixy ixz iyz :pre The rigid body IDs are all positive integers. For the {single} bodystyle, only an ID of 1 can be used. For the {group} bodystyle, IDs from 1 to Ng can be used where Ng is the number of specified groups. For the {molecule} bodystyle, use the molecule ID for the atoms in a specific rigid body as the rigid body ID. The masstotal and center-of-mass coordinates (xcm,ycm,zcm) are self-explanatory. The center-of-mass should be consistent with what is calculated for the position of the rigid body with all its atoms unwrapped by their respective image flags. If this produces a center-of-mass that is outside the simulation box, LAMMPS wraps it back into the box. The 6 moments of inertia (ixx,iyy,izz,ixy,ixz,iyz) should be the values consistent with the current orientation of the rigid body around its center of mass. The values are with respect to the simulation box XYZ axes, not with respect to the principal axes of the rigid body itself. LAMMPS performs the latter calculation internally. IMPORTANT NOTE: If you use the {infile} keyword and write restart files during a simulation, then each time a restart file is written, the fix also writes an auxiliary restart file with the name rfile.rigid, where "rfile" is the name of the restart file, e.g. tmp.restart.10000 and tmp.restart.10000.rigid. This auxiliary file is in the same format described above and contains info on the current center-of-mass and 6 moments of inertia. Thus it can be used in a new input script that restarts the run and re-specifies a rigid fix using an {infile} keyword and the appropriate filename. Note that the auxiliary file will contain one line for every rigid body, even if the original file only listed a subset of the rigid bodies. IMPORTANT NOTE: If you are using fix rigid/small and defining a system that has no rigid bodies initially, because they will be added on-the-fly by commands such as "fix deposit"_fix_deposit.html or "fix pour"_fix_pour.html, you may still wish to use the {infile} keyword. This is so that restart files written during the simulation will output an auxiliary restart file as described above with information on the new rigid bodies. In this case the initial {infile} file should use N = 0. :line If you use a "temperature compute"_compute.html with a group that includes particles in rigid bodies, the degrees-of-freedom removed by each rigid body are accounted for in the temperature (and pressure) computation, but only if the temperature group includes all the particles in a particular rigid body. A 3d rigid body has 6 degrees of freedom (3 translational, 3 rotational), except for a collection of point particles lying on a straight line, which has only 5, e.g. a dimer. A 2d rigid body has 3 degrees of freedom (2 translational, 1 rotational).
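+For example, to monitor the temperature of the rigid bodies with the reduced degree-of-freedom count just described, a temperature compute could be defined on a group containing all the atoms of the rigid bodies (the group name here is hypothetical):
+
+compute bodytemp bodies temp :pre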
IMPORTANT NOTE: You may wish to explicitly subtract additional degrees-of-freedom if you use the {force} and {torque} keywords to eliminate certain motions of one or more rigid bodies. LAMMPS does not do this automatically. The rigid body contribution to the pressure of the system (virial) is also accounted for by this fix. IMPORTANT NOTE: The periodic image flags of atoms in rigid bodies are altered so that the rigid body can be reconstructed correctly when it straddles periodic boundaries. The atom image flags are not incremented/decremented as they would be for non-rigid atoms as the rigid body crosses periodic boundaries. Specifically, they are set so that the center-of-mass (COM) of the rigid body always remains inside the simulation box. This means that if you output per-atom image flags you cannot interpret them as you normally would. I.e. the image flag values written to a "dump file"_dump.html will be different than they would be if the atoms were not in a rigid body. Likewise the "compute msd"_compute_msd.html will not compute the expected mean-squared displacement for such atoms if the body moves across periodic boundaries. It also means that if you have bonds between a pair of rigid bodies and the bond straddles a periodic boundary, you cannot use the "replicate"_replicate.html command to increase the system size. Here are details on how you can post-process a dump file to calculate a diffusion coefficient for rigid bodies, using the altered per-atom image flags written to a dump file. The image flags for atoms in the same rigid body can be used to unwrap the body and calculate its center-of-mass (COM). As mentioned above, this COM will always be inside the simulation box. Thus it will "jump" from one side of the box to the other when the COM crosses a periodic boundary. If you keep track of the jumps, you can effectively "unwrap" the COM and use that value to track the displacement of each rigid body, and thus the mean-squared displacement (MSD) of an ensemble of bodies, and thus a diffusion coefficient. Note that fix rigid does define image flags for each rigid body, which are incremented when the center-of-mass of the rigid body crosses a periodic boundary in the usual way. These image flags have the same meaning as atom images (see the "dump" command) and can be accessed and output as described below. :line If your simulation is a hybrid model with a mixture of rigid bodies and non-rigid particles (e.g. solvent) there are several ways these rigid fixes can be used in tandem with "fix nve"_fix_nve.html, "fix nvt"_fix_nh.html, "fix npt"_fix_nh.html, and "fix nph"_fix_nh.html. If you wish to perform NVE dynamics (no thermostatting or barostatting), use fix rigid or fix rigid/nve to integrate the rigid bodies, and "fix nve"_fix_nve.html to integrate the non-rigid particles. If you wish to perform NVT dynamics (thermostatting, but no barostatting), you can use fix rigid/nvt for the rigid bodies, and any thermostatting fix for the non-rigid particles ("fix nvt"_fix_nh.html, "fix langevin"_fix_langevin.html, "fix temp/berendsen"_fix_temp_berendsen.html). You can also use fix rigid or fix rigid/nve for the rigid bodies and thermostat them using "fix langevin"_fix_langevin.html on the group that contains all the particles in the rigid bodies. The net force added by "fix langevin"_fix_langevin.html to each rigid body effectively thermostats its translational center-of-mass motion. It is not clear how well it thermostats its rotational motion.
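+A minimal sketch of the NVT case just described, assuming two hypothetical groups named bodies (the atoms in rigid bodies) and solvent (all other particles):
+
+fix 1 bodies rigid/nvt molecule temp 1.0 1.0 5.0
+fix 2 solvent nvt temp 1.0 1.0 5.0 :pre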
If you wish to perform NPT or NPH dynamics (barostatting), you cannot use both "fix npt"_fix_nh.html and fix rigid/npt (or the nph variants). This is because there can only be one fix which monitors the global pressure and changes the simulation box dimensions. So you have 3 choices: Use fix rigid/npt for the rigid bodies. Use the {dilate} all option so that it will dilate the positions of the non-rigid particles as well. Use "fix nvt"_fix_nh.html (or any other thermostat) for the non-rigid particles. :ulb,l Use "fix npt"_fix_nh.html for the group of non-rigid particles. Use the {dilate} all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use fix rigid/nvt for the rigid bodies. :l Use "fix press/berendsen"_fix_press_berendsen.html to compute the pressure and change the box dimensions. Use fix rigid/nvt for the rigid bodies. Use "fix nvt"_fix_nh.html (or any other thermostat) for the non-rigid particles. :l,ule In all cases, the rigid bodies and non-rigid particles both contribute to the global pressure and the box is scaled the same by any of the barostatting fixes. You could even use the 2nd and 3rd options for a non-hybrid simulation consisting of only rigid bodies, assuming you give "fix npt"_fix_nh.html an empty group, though it's an odd thing to do. The barostatting fixes ("fix npt"_fix_nh.html and "fix press/berendsen"_fix_press_berendsen.html) will monitor the pressure and change the box dimensions, but not time integrate any particles. The integration of the rigid bodies will be performed by fix rigid/nvt. :line Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally the same as the corresponding style without the suffix. They have been optimized to run faster, depending on your available hardware, as discussed in "Section_accelerate"_Section_accelerate.html of the manual. The accelerated styles take the same arguments and should produce the same results, except for round-off and precision issues. These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT packages, respectively. They are only enabled if LAMMPS was built with those packages. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the "-suffix command-line switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can use the "suffix"_suffix.html command in your input script. See "Section_accelerate"_Section_accelerate.html of the manual for more instructions on how to use the accelerated styles effectively. :line [Restart, fix_modify, output, run start/stop, minimize info:] No information about the {rigid} and {rigid/small} and {rigid/nve} fixes is written to "binary restart files"_restart.html. For style {rigid/nvt} the state of the Nose/Hoover thermostat is written to "binary restart files"_restart.html. See the "read_restart"_read_restart.html command for info on how to re-specify a fix in an input script that reads a restart file, so that the operation of the fix continues in an uninterrupted fashion. The "fix_modify"_fix_modify.html {energy} option is supported by the rigid/nvt fix to add the energy change induced by the thermostatting to the system's potential energy as part of "thermodynamic output"_thermo_style.html. The "fix_modify"_fix_modify.html {temp} and {press} options are supported by the rigid/npt and rigid/nph fixes to change the computes used to calculate the instantaneous pressure tensor.
Note that the rigid/nvt fix does not use any external compute to compute instantaneous temperature. The {rigid} and {rigid/small} and {rigid/nve} fixes compute a global scalar which can be accessed by various "output commands"_Section_howto.html#howto_15. The scalar value calculated by these fixes is "intensive". The scalar is the current temperature of the collection of rigid bodies. This is averaged over all rigid bodies and their translational and rotational degrees of freedom. The translational energy of a rigid body is 1/2 m v^2, where m = total mass of the body and v = the velocity of its center of mass. The rotational energy of a rigid body is 1/2 I w^2, where I = the moment of inertia tensor of the body and w = its angular velocity. Degrees of freedom constrained by the {force} and {torque} keywords are removed from this calculation, but only for the {rigid} and {rigid/nve} fixes. The {rigid/nvt}, {rigid/npt}, and {rigid/nph} fixes compute a global scalar which can be accessed by various "output commands"_Section_howto.html#howto_15. The scalar value calculated by these fixes is "extensive". The scalar is the cumulative energy change due to the thermostatting and barostatting the fix performs. All of the {rigid} fixes except {rigid/small} compute a global array of values which can be accessed by various "output commands"_Section_howto.html#howto_15. The number of rows in the array is equal to the number of rigid bodies. The number of columns is 15. Thus for each rigid body, 15 values are stored: the xyz coords of the center of mass (COM), the xyz components of the COM velocity, the xyz components of the force acting on the COM, the xyz components of the torque acting on the COM, and the xyz image flags of the COM, which have the same meaning as image flags for atom positions (see the "dump" command). The force and torque values in the array are not affected by the {force} and {torque} keywords in the fix rigid command; they reflect values before any changes are made by those keywords. The ordering of the rigid bodies (by row in the array) is as follows. For the {single} keyword there is just one rigid body. For the {molecule} keyword, the bodies are ordered by ascending molecule ID. For the {group} keyword, the list of group IDs determines the ordering of bodies. The array values calculated by these fixes are "intensive", meaning they are independent of the number of atoms in the simulation. No parameter of these fixes can be used with the {start/stop} keywords of the "run"_run.html command. These fixes are not invoked during "energy minimization"_minimize.html. :line [Restrictions:] These fixes are all part of the RIGID package. It is only enabled if LAMMPS was built with that package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. +Assigning a temperature via the "velocity create"_velocity.html +command to a system with "rigid bodies"_fix_rigid.html may not have +the desired outcome for two reasons. First, the velocity command can +be invoked before the rigid-body fix is invoked or initialized and the +number of adjusted degrees of freedom (DOFs) is known. Thus it is not +possible to compute the target temperature correctly. Second, the +assigned velocities may be partially canceled when constraints are +first enforced, leading to a different temperature than desired. 
A +workaround for this is to perform a "run 0"_run.html command, which +insures all DOFs are accounted for properly, and then rescale the +temperature to the desired value before performing a simulation. For +example: + +velocity all create 300.0 12345 +run 0 # temperature may not be 300K +velocity all scale 300.0 # now it should be :pre + [Related commands:] "delete_bonds"_delete_bonds.html, "neigh_modify"_neigh_modify.html -exclude +exclude, "fix shake"_fix_shake.html [Default:] The option defaults are force * on on on and torque * on on on, meaning all rigid bodies are acted on by center-of-mass force and torque. Also Tchain = Pchain = 10, Titer = 1, Torder = 3. :line :link(Hoover) [(Hoover)] Hoover, Phys Rev A, 31, 1695 (1985). :link(Kamberaj) [(Kamberaj)] Kamberaj, Low, Neal, J Chem Phys, 122, 224114 (2005). :link(Martyna) [(Martyna)] Martyna, Klein, Tuckerman, J Chem Phys, 97, 2635 (1992); Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117. :link(Miller) [(Miller)] Miller, Eleftheriou, Pattnaik, Ndirango, and Newns, J Chem Phys, 116, 8649 (2002). :link(Zhang) [(Zhang)] Zhang, Glotzer, Nanoletters, 4, 1407-1413 (2004). diff --git a/doc/package.html b/doc/package.html index d707037da..939fee6ff 100644 --- a/doc/package.html +++ b/doc/package.html @@ -1,291 +1,363 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>package command </H3> <P><B>Syntax:</B> </P> <PRE>package style args </PRE> -<UL><LI>style = <I>gpu</I> or <I>cuda</I> or <I>omp</I> +<UL><LI>style = <I>cuda</I> or <I>gpu</I> or <I>kokkos</I> or <I>omp</I> <LI>args = arguments specific to the style -<PRE> <I>gpu</I> args = mode first last split keyword value ... +<PRE> <I>cuda</I> args = keyword value ... + one or more keyword/value pairs may be appended + keywords = <I>gpu/node</I> or <I>gpu/node/special</I> or <I>timing</I> or <I>test</I> or <I>override/bpa</I> + <I>gpu/node</I> value = N + N = number of GPUs to be used per node + <I>gpu/node/special</I> values = N gpu1 .. gpuN + N = number of GPUs to be used per node + gpu1 .. gpuN = N IDs of the GPUs to use + <I>timing</I> values = none + <I>test</I> values = id + id = atom-ID of a test particle + <I>override/bpa</I> values = flag + flag = 0 for TpA algorithm, 1 for BpA algorithm + <I>gpu</I> args = mode first last split keyword value ... mode = force or force/neigh first = ID of first GPU to be used on each node last = ID of last GPU to be used on each node split = fraction of particles assigned to the GPU zero or more keyword/value pairs may be appended keywords = <I>threads_per_atom</I> or <I>cellsize</I> or <I>device</I> <I>threads_per_atom</I> value = Nthreads Nthreads = # of GPU threads used per atom <I>cellsize</I> value = dist dist = length (distance units) in each dimension for neighbor bins <I>device</I> value = device_type device_type = <I>kepler</I> or <I>fermi</I> or <I>cypress</I> or <I>generic</I> - <I>cuda</I> args = keyword value ... + <I>kokkos</I> args = keyword value ... one or more keyword/value pairs may be appended - keywords = <I>gpu/node</I> or <I>gpu/node/special</I> or <I>timing</I> or <I>test</I> or <I>override/bpa</I> - <I>gpu/node</I> value = N - N = number of GPUs to be used per node - <I>gpu/node/special</I> values = N gpu1 .. gpuN - N = number of GPUs to be used per node - gpu1 .. 
gpuN = N IDs of the GPUs to use - <I>timing</I> values = none - <I>test</I> values = id - id = atom-ID of a test particle - <I>override/bpa</I> values = flag - flag = 0 for TpA algorithm, 1 for BpA algorithm + keywords = <I>neigh</I> or <I>comm/exchange</I> or <I>comm/forward</I> + <I>neigh</I> value = <I>full</I> or <I>half/thread</I> or <I>half</I> or <I>n2</I> or <I>full/cluster</I> + <I>comm/exchange</I> value = <I>no</I> or <I>host</I> or <I>device</I> + <I>comm/forward</I> value = <I>no</I> or <I>host</I> or <I>device</I> <I>omp</I> args = Nthreads mode Nthreads = # of OpenMP threads to associate with each MPI process mode = force or force/neigh (optional) </PRE> </UL> <P><B>Examples:</B> </P> <PRE>package gpu force 0 0 1.0 package gpu force 0 0 0.75 package gpu force/neigh 0 0 1.0 package gpu force/neigh 0 1 -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 +package kokkos neigh half/thread comm/forward device package omp * force/neigh package omp 4 force </PRE> <P><B>Description:</B> </P> <P>This command invokes package-specific settings. Currently the -following packages use it: GPU, USER-CUDA, and USER-OMP. +following packages use it: USER-CUDA, GPU, KOKKOS, and USER-OMP. </P> <P>To use the accelerated GPU and USER-OMP styles, the use of the package command is required. However, as described in the "Defaults" section below, if you use the "-sf gpu" or "-sf omp" <A HREF = "Section_start.html#start_7">command-line options</A> to enable use of these styles, then default package settings are enabled. In that case you only need to use the package command if you want to change the defaults. </P> -<P>To use the accelerate USER-CUDA styles, the package command is not -required as defaults are assigned internally. You only need to use -the package command if you want to change the defaults. +<P>To use the accelerated USER-CUDA and KOKKOS styles, the package +command is not required as defaults are assigned internally. You only +need to use the package command if you want to change the defaults. </P> <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for more details about using these various packages for accelerating LAMMPS calculations. </P> <HR> +<P>The <I>cuda</I> style invokes options associated with the use of the +USER-CUDA package. +</P> +<P>The <I>gpu/node</I> keyword specifies the number <I>N</I> of GPUs to be used on +each node. An MPI process with rank <I>K</I> will use the GPU (K mod N). +This implies that processes should be assigned with successive ranks +on each node, which is the default with most (or even all) MPI +implementations. The default value for <I>N</I> is 2. +</P> +<P>The <I>gpu/node/special</I> keyword also specifies the number (N) of GPUs +to be used on each node, but allows more control over their +specification. An MPI process with rank <I>K</I> will use the GPU <I>gpuI</I> +with l = (K mod N) + 1. This implies that processes should be assigned +with successive ranks on each node, which is the default with most (or +even all) MPI implementations. For example if you have three GPUs on +a machine, one of which is used for the X-Server (the GPU with the ID +1) while the others (with IDs 0 and 2) are used for computations you +would specify: +</P> +<PRE>package cuda gpu/node/special 2 0 2 +</PRE> +<P>A main purpose of the <I>gpu/node/special</I> optoin is to allow two (or +more) simulations to be run on one workstation. In that case one +would set the first simulation to use GPU 0 and the second to use GPU +1. 
This is not necessary, though, if the GPUs are in what is called +<I>compute exclusive</I> mode. Using that setting, every process will get +its own GPU automatically. This <I>compute exclusive</I> mode can be set +as root using the <I>nvidia-smi</I> tool which is part of the CUDA +installation. +</P> +<P>Note that if the <I>gpu/node/special</I> keyword is not used, the USER-CUDA +package sorts existing GPUs on each node according to their number of +multiprocessors. This way, compute GPUs will be prioritized over +X-Server GPUs. +</P> +<P>Use of the <I>timing</I> keyword will output detailed timing information +for various subroutines. +</P> +<P>The <I>test</I> keyword will output info for the specified atom at +several points during each time step. This is mainly useful for +debugging purposes. Note that the simulation will be severely slowed +down if this option is used. +</P> +<P>The <I>override/bpa</I> keyword can be used to specify which mode is used +for pair-force evaluation. TpA = one thread per atom; BpA = one block +per atom. If this keyword is not used, a short test at the beginning of +each run will determine which method is more effective (the result of +this test is part of the LAMMPS output). Therefore it is usually not +necessary to use this keyword. +</P> +<HR> + <P>The <I>gpu</I> style invokes options associated with the use of the GPU package. </P> <P>The <I>mode</I> setting specifies where neighbor list calculations will be performed. If <I>mode</I> is force, neighbor list calculation is performed on the CPU. If <I>mode</I> is force/neigh, neighbor list calculation is performed on the GPU. GPU neighbor list calculation currently cannot be used with a triclinic box. GPU neighbor list calculation currently cannot be used with <A HREF = "pair_hybrid.html">hybrid</A> pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines. In these cases, it will typically be more efficient to only use CPU neighbor list builds. </P> <P>The <I>first</I> and <I>last</I> settings specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from <I>first</I> to <I>last</I> will be used. </P> <P>The <I>split</I> setting can be used for load balancing force calculation work between CPU and GPU cores in GPU-enabled pair styles. If 0 < <I>split</I> < 1.0, a fixed fraction of particles is offloaded to the GPU while force calculation for the other particles occurs simultaneously on the CPU. If <I>split</I> < 0, the optimal fraction (based on CPU and GPU timings) is calculated every 25 timesteps. If <I>split</I> = 1.0, all force calculations for GPU accelerated pair styles are performed on the GPU. In this case, <A HREF = "pair_hybrid.html">hybrid</A>, <A HREF = "bond_style.html">bond</A>, <A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>, <A HREF = "improper_style.html">improper</A>, and <A HREF = "kspace_style.html">long-range</A> calculations can be performed on the CPU while the GPU is performing force calculations for the GPU-enabled pair style. If all CPU force computations complete before the GPU, LAMMPS will block until the GPU has finished before continuing the timestep.
</P> <P>As an example, if you have two GPUs per node and 8 CPU cores per node, and would like to run on 4 nodes (32 cores) with dynamic balancing of force calculation across CPU and GPU cores, you could specify </P> <PRE>package gpu force/neigh 0 1 -1 </PRE> <P>In this case, all CPU cores and GPU devices on the nodes would be utilized. Each GPU device would be shared by 4 CPU cores. The CPU cores would perform force calculations for some fraction of the particles at the same time the GPUs performed force calculation for the other particles. </P> <P>The <I>threads_per_atom</I> keyword allows control of the number of GPU threads used per-atom to perform the short range force calculation. By default, the value will be chosen based on the pair style, however, the value can be set with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32. </P> <P>The <I>cellsize</I> keyword can be used to control the size of the cells used for binning atoms in neighbor list calculations. Setting this value is normally not needed; the optimal value is close to the default (equal to the cutoff distance for the short range interactions plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs than CPUs and this can be used to reduce the time required for long-range calculations or in some cases to eliminate them with models such as <A HREF = "pair_coul.html">coul/wolf</A> or <A HREF = "pair_coul.html">coul/dsf</A>. For very large cutoffs, it can be more efficient to use smaller values for cellsize in parallel simulations. For example, with a cutoff of 20*sigma and a neighbor skin of sigma, a cellsize of 5.25*sigma can be efficient for parallel simulations. </P> <P>The <I>device</I> keyword can be used to tune parameters to optimize for a specific accelerator when using OpenCL. For CUDA, the <I>device</I> keyword is ignored. Currently, the device type is limited to NVIDIA Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices will be added soon. The default device type can be specified when building LAMMPS with the GPU library. </P> <HR> -<P>The <I>cuda</I> style invokes options associated with the use of the -USER-CUDA package. -</P> -<P>The <I>gpu/node</I> keyword specifies the number <I>N</I> of GPUs to be used on -each node. An MPI process with rank <I>K</I> will use the GPU (K mod N). -This implies that processes should be assigned with successive ranks -on each node, which is the default with most (or even all) MPI -implementations. The default value for <I>N</I> is 2. -</P> -<P>The <I>gpu/node/special</I> keyword also specifies the number (N) of GPUs -to be used on each node, but allows more control over their -specification. An MPI process with rank <I>K</I> will use the GPU <I>gpuI</I> -with l = (K mod N) + 1. This implies that processes should be assigned -with successive ranks on each node, which is the default with most (or -even all) MPI implementations. For example if you have three GPUs on -a machine, one of which is used for the X-Server (the GPU with the ID -1) while the others (with IDs 0 and 2) are used for computations you -would specify: -</P> -<PRE>package cuda gpu/node/special 2 0 2 -</PRE> -<P>A main purpose of the <I>gpu/node/special</I> optoin is to allow two (or -more) simulations to be run on one workstation. 
In that case one -would set the first simulation to use GPU 0 and the second to use GPU -1. This is not necessary though, if the GPUs are in what is called -<I>compute exclusive</I> mode. Using that setting, every process will get -its own GPU automatically. This <I>compute exclusive</I> mode can be set -as root using the <I>nvidia-smi</I> tool which is part of the CUDA -installation. -</P> -<P>Note that if the <I>gpu/node/special</I> keyword is not used, the USER-CUDA -package sorts existing GPUs on each node according to their number of -multiprocessors. This way, compute GPUs will be priorized over -X-Server GPUs. -</P> -<P>Use of the <I>timing</I> keyword will output detailed timing information -for various subroutines. -</P> -<P>The <I>test</I> keyword will output info for the the specified atom at -several points during each time step. This is mainly usefull for -debugging purposes. Note that the simulation will be severly slowed -down if this option is used. -</P> -<P>The <I>override/bpa</I> keyword can be used to specify which mode is used -for pair-force evaluation. TpA = one thread per atom; BpA = one block -per atom. If this keyword is not used, a short test at the begin of -each run will determine which method is more effective (the result of -this test is part of the LAMMPS output). Therefore it is usually not -necessary to use this keyword. +<P>The <I>kokkos</I> style invokes options associated with the use of the +KOKKOS package. +</P> +<P>The <I>neigh</I> keyword determines what kinds of neighbor lists are built. +A value of <I>half</I> uses half-neighbor lists, the same as used by most +pair styles in LAMMPS. A value of <I>half/thread</I> uses a threadsafe +variant of the half-neighbor list. It should be used instead of +<I>half</I> when running with threads on a CPU. A value of <I>full</I> uses a +full neighbor list, i.e. f_ij and f_ji are both calculated. This +performs twice as much computation as the <I>half</I> option; however, that +can be a win because it is threadsafe and doesn't require atomic +operations. A value of <I>full/cluster</I> is an experimental neighbor +style, where particles interact with all particles within a small +cluster, if at least one of the cluster's particles is within the +neighbor cutoff range. This potentially allows for better +vectorization on architectures such as the Intel Phi. It also reduces +the size of the neighbor list by roughly a factor of the cluster size, +thus reducing the total memory footprint considerably. +</P> +<P>The <I>comm/exchange</I> and <I>comm/forward</I> keywords determine whether the +host or device performs the packing and unpacking of data when +communicating information between processors. "Exchange" +communication happens only on timesteps that neighbor lists are +rebuilt. The data is only for atoms that migrate to new processors. +"Forward" communication happens every timestep. The data is for atom +coordinates and any other atom properties that need to be updated for +ghost atoms owned by each processor. +</P> +<P>The value options for these keywords are <I>no</I> or <I>host</I> or <I>device</I>. +A value of <I>no</I> means to use the standard non-KOKKOS method of +packing/unpacking data for the communication. A value of <I>host</I> means +to use the host, typically a multi-core CPU, and perform the +packing/unpacking in parallel with threads. A value of <I>device</I> means +to use the device, typically a GPU, to perform the packing/unpacking +operation.
+</P> +<P>The optimal choice for these keywords depends on the input script and +the hardware used. The <I>no</I> value is useful for verifying that Kokkos +code is working correctly. It may also be the fastest choice when +using Kokkos styles in MPI-only mode (i.e. with a thread count of 1). +When running on CPUs or Xeon Phi, the <I>host</I> and <I>device</I> values work +identically. When using GPUs, the <I>device</I> value will typically be +optimal if all of your styles used in your input script are supported +by the KOKKOS package. In this case data can stay on the GPU for many +timesteps without being moved between the host and GPU, if you use the +<I>device</I> value. This requires that your MPI is able to access GPU +memory directly. Currently that is true for OpenMPI 1.8 (or later +versions), Mvapich2 1.9 (or later), and CrayMPI. If your script uses +styles (e.g. fixes) which are not yet supported by the KOKKOS package, +then data has to be move between the host and device anyway, so it is +typically faster to let the host handle communication, by using the +<I>host</I> value. Using <I>host</I> instead of <I>no</I> will enable use of +multiple threads to pack/unpack communicated data. </P> <HR> <P>The <I>omp</I> style invokes options associated with the use of the USER-OMP package. </P> <P>The first argument allows to explicitly set the number of OpenMP threads to be allocated for each MPI process. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could run MPI on 2 cores on each node (e.g. using options for the mpirun command), and set the <I>Nthreads</I> setting to 4. This would effectively use all 8 cores on each node. Since each MPI process would spawn 4 threads (one of which runs as part of the MPI process itself). </P> <P>For performance reasons, you should not set <I>Nthreads</I> to more threads than there are physical cores (per MPI task), but LAMMPS cannot check for this. </P> <P>An <I>Nthreads</I> value of '*' instructs LAMMPS to use whatever is the default for the given OpenMP environment. This is usually determined via the <I>OMP_NUM_THREADS</I> environment variable or the compiler runtime. Please note that in most cases the default for OpenMP capable compilers is to use one thread for each available CPU core when <I>OMP_NUM_THREADS</I> is not set, which can lead to extremely bad performance. </P> <P>Which combination of threads and MPI tasks gives the best performance is difficult to predict and can depend on many components of your input. Not all features of LAMMPS support OpenMP and the parallel efficiency can be very different, too. </P> <P>The <I>mode</I> setting specifies where neighbor list calculations will be multi-threaded as well. If <I>mode</I> is force, neighbor list calculation is performed in serial. If <I>mode</I> is force/neigh, a multi-threaded neighbor list build is used. Using the force/neigh setting is almost always faster and should produce idential neighbor lists at the expense of using some more memory (neighbor list pages are always allocated for all threads at the same time and each thread works on its own pages). </P> <HR> <P><B>Restrictions:</B> </P> <P>This command cannot be used after the simulation box is defined by a <A HREF = "read_data.html">read_data</A> or <A HREF = "create_box.html">create_box</A> command. </P> <P>The cuda style of this command can only be invoked if LAMMPS was built with the USER-CUDA package. 
See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. </P> <P>The gpu style of this command can only be invoked if LAMMPS was built with the GPU package. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. </P> +<P>The kk style of this command can only be invoked if LAMMPS was built +with the KOKKOS package. See the <A HREF = "Section_start.html#start_3">Making +LAMMPS</A> section for more info. +</P> <P>The omp style of this command can only be invoked if LAMMPS was built with the USER-OMP package. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. </P> <P><B>Related commands:</B> </P> <P><A HREF = "suffix.html">suffix</A> </P> <P><B>Default:</B> </P> +<P>The default settings for the USER-CUDA package are "package cuda gpu +2". This is the case whether the "-sf cuda" <A HREF = "Section_start.html#start_7">command-line +switch</A> is used or not. +</P> <P>If the "-sf gpu" <A HREF = "Section_start.html#start_7">command-line switch</A> is used then it is as if the command "package gpu force/neigh 0 0 1" were invoked, to specify default settings for the GPU package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. </P> -<P>The default settings for the USER CUDA package are "package cuda gpu -2". This is the case whether the "-sf cuda" <A HREF = "Section_start.html#start_7">command-line -switch</A> is used or not. +<P>The default settings for the KOKKOS package are "package kk neigh full +comm/exchange host comm/forward host". This is the case whether the +"-sf kk" <A HREF = "Section_start.html#start_7">command-line switch</A> is used or +not. </P> <P>If the "-sf omp" <A HREF = "Section_start.html#start_7">command-line switch</A> is used then it is as if the command "package omp *" were invoked, to specify default settings for the USER-OMP package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. </P> </HTML> diff --git a/doc/package.txt b/doc/package.txt index 54f534313..49b383da6 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -1,287 +1,357 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line package command :h3 [Syntax:] package style args :pre -style = {gpu} or {cuda} or {omp} :ulb,l +style = {cuda} or {gpu} or {kokkos} or {omp} :ulb,l args = arguments specific to the style :l + {cuda} args = keyword value ... + one or more keyword/value pairs may be appended + keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa} + {gpu/node} value = N + N = number of GPUs to be used per node + {gpu/node/special} values = N gpu1 .. gpuN + N = number of GPUs to be used per node + gpu1 .. gpuN = N IDs of the GPUs to use + {timing} values = none + {test} values = id + id = atom-ID of a test particle + {override/bpa} values = flag + flag = 0 for TpA algorithm, 1 for BpA algorithm {gpu} args = mode first last split keyword value ... 
mode = force or force/neigh first = ID of first GPU to be used on each node last = ID of last GPU to be used on each node split = fraction of particles assigned to the GPU zero or more keyword/value pairs may be appended keywords = {threads_per_atom} or {cellsize} or {device} {threads_per_atom} value = Nthreads Nthreads = # of GPU threads used per atom {cellsize} value = dist dist = length (distance units) in each dimension for neighbor bins {device} value = device_type device_type = {kepler} or {fermi} or {cypress} or {phi} or {intel} or {generic} - {cuda} args = keyword value ... + {kokkos} args = keyword value ... one or more keyword/value pairs may be appended - keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa} - {gpu/node} value = N - N = number of GPUs to be used per node - {gpu/node/special} values = N gpu1 .. gpuN - N = number of GPUs to be used per node - gpu1 .. gpuN = N IDs of the GPUs to use - {timing} values = none - {test} values = id - id = atom-ID of a test particle - {override/bpa} values = flag - flag = 0 for TpA algorithm, 1 for BpA algorithm + keywords = {neigh} or {comm/exchange} or {comm/forward} + {neigh} value = {full} or {half/thread} or {half} or {n2} or {full/cluster} + {comm/exchange} value = {no} or {host} or {device} + {comm/forward} value = {no} or {host} or {device} {omp} args = Nthreads mode Nthreads = # of OpenMP threads to associate with each MPI process mode = force or force/neigh (optional) :pre :ule [Examples:] package gpu force 0 0 1.0 package gpu force 0 0 0.75 package gpu force/neigh 0 0 1.0 package gpu force/neigh 0 1 -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 +package kokkos neigh half/thread comm/forward device package omp * force/neigh package omp 4 force :pre [Description:] This command invokes package-specific settings. Currently the -following packages use it: GPU, USER-CUDA, and USER-OMP. +following packages use it: USER-CUDA, GPU, KOKKOS, and USER-OMP. To use the accelerated GPU and USER-OMP styles, the use of the package command is required. However, as described in the "Defaults" section below, if you use the "-sf gpu" or "-sf omp" "command-line options"_Section_start.html#start_7 to enable use of these styles, then default package settings are enabled. In that case you only need to use the package command if you want to change the defaults. -To use the accelerate USER-CUDA styles, the package command is not -required as defaults are assigned internally. You only need to use -the package command if you want to change the defaults. +To use the accelerated USER-CUDA and KOKKOS styles, the package +command is not required as defaults are assigned internally. You only +need to use the package command if you want to change the defaults. See "Section_accelerate"_Section_accelerate.html of the manual for more details about using these various packages for accelerating LAMMPS calculations. :line +The {cuda} style invokes options associated with the use of the +USER-CUDA package. + +The {gpu/node} keyword specifies the number {N} of GPUs to be used on +each node. An MPI process with rank {K} will use the GPU (K mod N). +This implies that processes should be assigned with successive ranks +on each node, which is the default with most (or even all) MPI +implementations. The default value for {N} is 2. + +The {gpu/node/special} keyword also specifies the number (N) of GPUs +to be used on each node, but allows more control over their +specification. 
An MPI process with rank {K} will use the GPU {gpuI} +with l = (K mod N) + 1. This implies that processes should be assigned +with successive ranks on each node, which is the default with most (or +even all) MPI implementations. For example if you have three GPUs on +a machine, one of which is used for the X-Server (the GPU with the ID +1) while the others (with IDs 0 and 2) are used for computations you +would specify: + +package cuda gpu/node/special 2 0 2 :pre + +A main purpose of the {gpu/node/special} optoin is to allow two (or +more) simulations to be run on one workstation. In that case one +would set the first simulation to use GPU 0 and the second to use GPU +1. This is not necessary though, if the GPUs are in what is called +{compute exclusive} mode. Using that setting, every process will get +its own GPU automatically. This {compute exclusive} mode can be set +as root using the {nvidia-smi} tool which is part of the CUDA +installation. + +Note that if the {gpu/node/special} keyword is not used, the USER-CUDA +package sorts existing GPUs on each node according to their number of +multiprocessors. This way, compute GPUs will be priorized over +X-Server GPUs. + +Use of the {timing} keyword will output detailed timing information +for various subroutines. + +The {test} keyword will output info for the the specified atom at +several points during each time step. This is mainly usefull for +debugging purposes. Note that the simulation will be severly slowed +down if this option is used. + +The {override/bpa} keyword can be used to specify which mode is used +for pair-force evaluation. TpA = one thread per atom; BpA = one block +per atom. If this keyword is not used, a short test at the begin of +each run will determine which method is more effective (the result of +this test is part of the LAMMPS output). Therefore it is usually not +necessary to use this keyword. + +:line + The {gpu} style invokes options associated with the use of the GPU package. The {mode} setting specifies where neighbor list calculations will be performed. If {mode} is force, neighbor list calculation is performed on the CPU. If {mode} is force/neigh, neighbor list calculation is performed on the GPU. GPU neighbor list calculation currently cannot be used with a triclinic box. GPU neighbor list calculation currently cannot be used with "hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines. In these cases, it will typically be more efficient to only use CPU neighbor list builds. The {first} and {last} settings specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from {first} to {last} will be used. The {split} setting can be used for load balancing force calculation work between CPU and GPU cores in GPU-enabled pair styles. If 0 < {split} < 1.0, a fixed fraction of particles is offloaded to the GPU while force calculation for the other particles occurs simulataneously on the CPU. If {split}<0, the optimal fraction (based on CPU and GPU timings) is calculated every 25 timesteps. If {split} = 1.0, all force calculations for GPU accelerated pair styles are performed on the GPU. 
In this case, "hybrid"_pair_hybrid.html, "bond"_bond_style.html, "angle"_angle_style.html, "dihedral"_dihedral_style.html, "improper"_improper_style.html, and "long-range"_kspace_style.html calculations can be performed on the CPU while the GPU is performing force calculations for the GPU-enabled pair style. If all CPU force computations complete before the GPU, LAMMPS will block until the GPU has finished before continuing the timestep. As an example, if you have two GPUs per node and 8 CPU cores per node, and would like to run on 4 nodes (32 cores) with dynamic balancing of force calculation across CPU and GPU cores, you could specify package gpu force/neigh 0 1 -1 :pre In this case, all CPU cores and GPU devices on the nodes would be utilized. Each GPU device would be shared by 4 CPU cores. The CPU cores would perform force calculations for some fraction of the particles at the same time the GPUs performed force calculation for the other particles. The {threads_per_atom} keyword allows control of the number of GPU threads used per-atom to perform the short range force calculation. By default, the value will be chosen based on the pair style, however, the value can be set with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32. The {cellsize} keyword can be used to control the size of the cells used for binning atoms in neighbor list calculations. Setting this value is normally not needed; the optimal value is close to the default (equal to the cutoff distance for the short range interactions plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs than CPUs and this can be used to reduce the time required for long-range calculations or in some cases to eliminate them with models such as "coul/wolf"_pair_coul.html or "coul/dsf"_pair_coul.html. For very large cutoffs, it can be more efficient to use smaller values for cellsize in parallel simulations. For example, with a cutoff of 20*sigma and a neighbor skin of sigma, a cellsize of 5.25*sigma can be efficient for parallel simulations. The {device} keyword can be used to tune parameters to optimize for a specific accelerator when using OpenCL. For CUDA, the {device} keyword is ignored. Currently, the device type is limited to NVIDIA Kepler, NVIDIA Fermi, AMD Cypress, Intel CPU, Intel Phi, or a generic device. More devices will be added soon. The default device type can be specified when building LAMMPS with the GPU library. :line -The {cuda} style invokes options associated with the use of the -USER-CUDA package. - -The {gpu/node} keyword specifies the number {N} of GPUs to be used on -each node. An MPI process with rank {K} will use the GPU (K mod N). -This implies that processes should be assigned with successive ranks -on each node, which is the default with most (or even all) MPI -implementations. The default value for {N} is 2. - -The {gpu/node/special} keyword also specifies the number (N) of GPUs -to be used on each node, but allows more control over their -specification. An MPI process with rank {K} will use the GPU {gpuI} -with l = (K mod N) + 1. This implies that processes should be assigned -with successive ranks on each node, which is the default with most (or -even all) MPI implementations. 
For example if you have three GPUs on -a machine, one of which is used for the X-Server (the GPU with the ID -1) while the others (with IDs 0 and 2) are used for computations you -would specify: - -package cuda gpu/node/special 2 0 2 :pre - -A main purpose of the {gpu/node/special} optoin is to allow two (or -more) simulations to be run on one workstation. In that case one -would set the first simulation to use GPU 0 and the second to use GPU -1. This is not necessary though, if the GPUs are in what is called -{compute exclusive} mode. Using that setting, every process will get -its own GPU automatically. This {compute exclusive} mode can be set -as root using the {nvidia-smi} tool which is part of the CUDA -installation. - -Note that if the {gpu/node/special} keyword is not used, the USER-CUDA -package sorts existing GPUs on each node according to their number of -multiprocessors. This way, compute GPUs will be priorized over -X-Server GPUs. - -Use of the {timing} keyword will output detailed timing information -for various subroutines. - -The {test} keyword will output info for the the specified atom at -several points during each time step. This is mainly usefull for -debugging purposes. Note that the simulation will be severly slowed -down if this option is used. - -The {override/bpa} keyword can be used to specify which mode is used -for pair-force evaluation. TpA = one thread per atom; BpA = one block -per atom. If this keyword is not used, a short test at the begin of -each run will determine which method is more effective (the result of -this test is part of the LAMMPS output). Therefore it is usually not -necessary to use this keyword. +The {kokkos} style invokes options associated with the use of the +KOKKOS package. + +The {neigh} keyword determines what kinds of neighbor lists are built. +A value of {half} uses half-neighbor lists, the same as used by most +pair styles in LAMMPS. A value of {half/thread} uses a threadsafe +variant of the half-neighbor list. It should be used instead of +{half} when running with threads on a CPU. A value of {full} uses a +full neighbor list, i.e. f_ij and f_ji are both calculated. This +performs twice as much computation as the {half} option; however, that +can be a win because it is threadsafe and doesn't require atomic +operations. A value of {full/cluster} is an experimental neighbor +style, where particles interact with all particles within a small +cluster, if at least one of the cluster's particles is within the +neighbor cutoff range. This potentially allows for better +vectorization on architectures such as the Intel Phi. It also reduces +the size of the neighbor list by roughly a factor of the cluster size, +thus reducing the total memory footprint considerably. + +The {comm/exchange} and {comm/forward} keywords determine whether the +host or device performs the packing and unpacking of data when +communicating information between processors. "Exchange" +communication happens only on timesteps that neighbor lists are +rebuilt. The data is only for atoms that migrate to new processors. +"Forward" communication happens every timestep. The data is for atom +coordinates and any other atom properties that need to be updated for +ghost atoms owned by each processor. + +The value options for these keywords are {no} or {host} or {device}. +A value of {no} means to use the standard non-KOKKOS method of +packing/unpacking data for the communication.
A value of {host} means +to use the host, typically a multi-core CPU, and perform the +packing/unpacking in parallel with threads. A value of {device} means +to use the device, typically a GPU, to perform the packing/unpacking +operation. + +The optimal choice for these keywords depends on the input script and +the hardware used. The {no} value is useful for verifying that Kokkos +code is working correctly. It may also be the fastest choice when +using Kokkos styles in MPI-only mode (i.e. with a thread count of 1). +When running on CPUs or Xeon Phi, the {host} and {device} values work +identically. When using GPUs, the {device} value will typically be +optimal if all of the styles used in your input script are supported +by the KOKKOS package. In this case data can stay on the GPU for many +timesteps without being moved between the host and GPU, if you use the +{device} value. This requires that your MPI is able to access GPU +memory directly. Currently that is true for OpenMPI 1.8 (or later +versions), Mvapich2 1.9 (or later), and CrayMPI. If your script uses +styles (e.g. fixes) which are not yet supported by the KOKKOS package, +then data has to be moved between the host and device anyway, so it is +typically faster to let the host handle communication, by using the +{host} value. Using {host} instead of {no} will enable use of +multiple threads to pack/unpack communicated data. :line The {omp} style invokes options associated with the use of the USER-OMP package. The first argument allows you to explicitly set the number of OpenMP threads to be allocated for each MPI process. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could run MPI on 2 cores on each node (e.g. using options for the mpirun command), and set the {Nthreads} setting to 4. This would effectively use all 8 cores on each node, since each MPI process would spawn 4 threads (one of which runs as part of the MPI process itself). For performance reasons, you should not set {Nthreads} to more threads than there are physical cores (per MPI task), but LAMMPS cannot check for this. An {Nthreads} value of '*' instructs LAMMPS to use whatever is the default for the given OpenMP environment. This is usually determined via the {OMP_NUM_THREADS} environment variable or the compiler runtime. Please note that in most cases the default for OpenMP capable compilers is to use one thread for each available CPU core when {OMP_NUM_THREADS} is not set, which can lead to extremely bad performance. Which combination of threads and MPI tasks gives the best performance is difficult to predict and can depend on many components of your input. Not all features of LAMMPS support OpenMP and the parallel efficiency can be very different, too. The {mode} setting specifies whether neighbor list calculations will be multi-threaded as well. If {mode} is force, neighbor list calculation is performed in serial. If {mode} is force/neigh, a multi-threaded neighbor list build is used. Using the force/neigh setting is almost always faster and should produce identical neighbor lists at the expense of using some more memory (neighbor list pages are always allocated for all threads at the same time and each thread works on its own pages). :line [Restrictions:] This command cannot be used after the simulation box is defined by a "read_data"_read_data.html or "create_box"_create_box.html command. The cuda style of this command can only be invoked if LAMMPS was built with the USER-CUDA package.
See the "Making LAMMPS"_Section_start.html#start_3 section for more info. The gpu style of this command can only be invoked if LAMMPS was built with the GPU package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. -When using the "r-RESPA run style"_run_style.html, GPU accelerated -styles can only be used on the outermost RESPA level. + +The kk style of this command can only be invoked if LAMMPS was built +with the KOKKOS package. See the "Making +LAMMPS"_Section_start.html#start_3 section for more info. The omp style of this command can only be invoked if LAMMPS was built with the USER-OMP package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. [Related commands:] "suffix"_suffix.html [Default:] +The default settings for the USER-CUDA package are "package cuda gpu +2". This is the case whether the "-sf cuda" "command-line +switch"_Section_start.html#start_7 is used or not. + If the "-sf gpu" "command-line switch"_Section_start.html#start_7 is used then it is as if the command "package gpu force/neigh 0 0 1" were invoked, to specify default settings for the GPU package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. -The default settings for the USER CUDA package are "package cuda gpu -2". This is the case whether the "-sf cuda" "command-line -switch"_Section_start.html#start_7 is used or not. +The default settings for the KOKKOS package are "package kk neigh full +comm/exchange host comm/forward host". This is the case whether the +"-sf kk" "command-line switch"_Section_start.html#start_7 is used or +not. If the "-sf omp" "command-line switch"_Section_start.html#start_7 is used then it is as if the command "package omp *" were invoked, to specify default settings for the USER-OMP package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. 
diff --git a/doc/pair_lj.html b/doc/pair_lj.html index 767022c00..70eb931a1 100644 --- a/doc/pair_lj.html +++ b/doc/pair_lj.html @@ -1,348 +1,351 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>pair_style lj/cut command </H3> <H3>pair_style lj/cut/cuda command </H3> <H3>pair_style lj/cut/experimental/cuda command </H3> <H3>pair_style lj/cut/gpu command </H3> +<H3>pair_style lj/cut/kk command +</H3> <H3>pair_style lj/cut/opt command </H3> <H3>pair_style lj/cut/omp command </H3> <H3>pair_style lj/cut/coul/cut command </H3> <H3>pair_style lj/cut/coul/cut/cuda command </H3> <H3>pair_style lj/cut/coul/cut/gpu command </H3> <H3>pair_style lj/cut/coul/cut/omp command </H3> <H3>pair_style lj/cut/coul/debye command </H3> <H3>pair_style lj/cut/coul/debye/cuda command </H3> <H3>pair_style lj/cut/coul/debye/gpu command </H3> <H3>pair_style lj/cut/coul/debye/omp command </H3> <H3>pair_style lj/cut/coul/dsf command </H3> <H3>pair_style lj/cut/coul/dsf/gpu command </H3> <H3>pair_style lj/cut/coul/dsf/omp command </H3> <H3>pair_style lj/cut/coul/long command </H3> <H3>pair_style lj/cut/coul/long/cuda command </H3> <H3>pair_style lj/cut/coul/long/gpu command </H3> <H3>pair_style lj/cut/coul/long/opt command </H3> <H3>pair_style lj/cut/coul/long/omp command </H3> <H3>pair_style lj/cut/coul/msm command </H3> <H3>pair_style lj/cut/coul/msm/gpu command </H3> <H3>pair_style lj/cut/coul/msm/omp command </H3> <H3>pair_style lj/cut/tip4p/cut command </H3> <H3>pair_style lj/cut/tip4p/cut/omp command </H3> <H3>pair_style lj/cut/tip4p/long command </H3> <H3>pair_style lj/cut/tip4p/long/omp command </H3> <H3>pair_style lj/cut/tip4p/long/opt command </H3> <P><B>Syntax:</B> </P> <PRE>pair_style style args </PRE> <UL><LI>style = <I>lj/cut</I> or <I>lj/cut/coul/cut</I> or <I>lj/cut/coul/debye</I> or <I>lj/cut/coul/dsf</I> or <I>lj/cut/coul/long</I> or <I>lj/cut/coul/msm</I> or <I>lj/cut/tip4p/long</I> <LI>args = list of arguments for a particular style </UL> <PRE> <I>lj/cut</I> args = cutoff cutoff = global cutoff for Lennard Jones interactions (distance units) <I>lj/cut/coul/cut</I> args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) <I>lj/cut/coul/debye</I> args = kappa cutoff (cutoff2) kappa = inverse of the Debye length (inverse distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) <I>lj/cut/coul/dsf</I> args = alpha cutoff (cutoff2) alpha = damping parameter (inverse distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (distance units) <I>lj/cut/coul/long</I> args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) <I>lj/cut/coul/msm</I> args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) <I>lj/cut/tip4p/cut</I> args = otype htype btype atype qdist cutoff (cutoff2) otype,htype = atom types for TIP4P O and H btype,atype = bond and angle types for TIP4P waters qdist = distance from O atom to massless charge (distance units) cutoff = global cutoff for 
LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) <I>lj/cut/tip4p/long</I> args = otype htype btype atype qdist cutoff (cutoff2) otype,htype = atom types for TIP4P O and H btype,atype = bond and angle types for TIP4P waters qdist = distance from O atom to massless charge (distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) </PRE> <P><B>Examples:</B> </P> <PRE>pair_style lj/cut 2.5 pair_coeff * * 1 1 pair_coeff 1 1 1 1.1 2.8 </PRE> <PRE>pair_style lj/cut/coul/cut 10.0 pair_style lj/cut/coul/cut 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 pair_coeff 1 1 100.0 3.5 9.0 9.0 </PRE> <PRE>pair_style lj/cut/coul/debye 1.5 3.0 pair_style lj/cut/coul/debye 1.5 2.5 5.0 pair_coeff * * 1.0 1.0 pair_coeff 1 1 1.0 1.5 2.5 pair_coeff 1 1 1.0 1.5 2.5 5.0 </PRE> <PRE>pair_style lj/cut/coul/dsf 0.05 2.5 10.0 pair_coeff * * 1.0 1.0 pair_coeff 1 1 1.0 1.0 2.5 </PRE> <PRE>pair_style lj/cut/coul/long 10.0 pair_style lj/cut/coul/long 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 </PRE> <PRE>pair_style lj/cut/coul/msm 10.0 pair_style lj/cut/coul/msm 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 </PRE> <PRE>pair_style lj/cut/tip4p/cut 1 2 7 8 0.15 12.0 pair_style lj/cut/tip4p/cut 1 2 7 8 0.15 12.0 10.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 </PRE> <PRE>pair_style lj/cut/tip4p/long 1 2 7 8 0.15 12.0 pair_style lj/cut/tip4p/long 1 2 7 8 0.15 12.0 10.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 </PRE> <P><B>Description:</B> </P> <P>The <I>lj/cut</I> styles compute the standard 12/6 Lennard-Jones potential, given by </P> <CENTER><IMG SRC = "Eqs/pair_lj.jpg"> </CENTER> <P>Rc is the cutoff. </P> <P>Style <I>lj/cut/coul/cut</I> adds a Coulombic pairwise interaction given by </P> <CENTER><IMG SRC = "Eqs/pair_coulomb.jpg"> </CENTER> <P>where C is an energy-conversion constant, Qi and Qj are the charges on the 2 atoms, and epsilon is the dielectric constant which can be set by the <A HREF = "dielectric.html">dielectric</A> command. If one cutoff is specified in the pair_style command, it is used for both the LJ and Coulombic terms. If two cutoffs are specified, they are used as cutoffs for the LJ and Coulombic terms respectively. </P> <P>Style <I>lj/cut/coul/debye</I> adds an additional exp() damping factor to the Coulombic term, given by </P> <CENTER><IMG SRC = "Eqs/pair_debye.jpg"> </CENTER> <P>where kappa is the inverse of the Debye length. This potential is another way to mimic the screening effect of a polar solvent. </P> <P>Style <I>lj/cut/coul/dsf</I> computes the Coulombic term via the damped shifted force model described in <A HREF = "#Fennell">Fennell</A>, given by: </P> <CENTER><IMG SRC = "Eqs/pair_coul_dsf.jpg"> </CENTER> <P>where <I>alpha</I> is the damping parameter and erfc() is the complementary error-function. This potential is essentially a short-range, spherically-truncated, charge-neutralized, shifted, pairwise <I>1/r</I> summation. The potential is based on Wolf summation, proposed as an alternative to Ewald summation for condensed phase systems where charge screening causes electrostatic interactions to become effectively short-ranged. In order for the electrostatic sum to be absolutely convergent, charge neutralization within the cutoff radius is enforced by shifting the potential through placement of image charges on the cutoff sphere. 
Convergence can often be improved by setting <I>alpha</I> to a small non-zero value. </P> <P>Styles <I>lj/cut/coul/long</I> and <I>lj/cut/coul/msm</I> compute the same Coulombic interactions as style <I>lj/cut/coul/cut</I> except that an additional damping factor is applied to the Coulombic term so it can be used in conjunction with the <A HREF = "kspace_style.html">kspace_style</A> command and its <I>ewald</I> or <I>pppm</I> option. The Coulombic cutoff specified for this style means that pairwise interactions within this distance are computed directly; interactions outside that distance are computed in reciprocal space. </P> <P>Styles <I>lj/cut/tip4p/cut</I> and <I>lj/cut/tip4p/long</I> implement the TIP4P water model of <A HREF = "#Jorgensen">(Jorgensen)</A>, which introduces a massless site located a short distance away from the oxygen atom along the bisector of the HOH angle. The atomic types of the oxygen and hydrogen atoms, the bond and angle types for OH and HOH interactions, and the distance to the massless charge site are specified as pair_style arguments. Style <I>lj/cut/tip4p/cut</I> uses a cutoff for Coulomb interactions; style <I>lj/cut/tip4p/long</I> is for use with a long-range Coulombic solver (Ewald or PPPM). </P> <P>IMPORTANT NOTE: For each TIP4P water molecule in your system, the atom IDs for the O and 2 H atoms must be consecutive, with the O atom first. This is to enable LAMMPS to "find" the 2 H atoms associated with each O atom. For example, if the atom ID of an O atom in a TIP4P water molecule is 500, then its 2 H atoms must have IDs 501 and 502. </P> <P>See the <A HREF = "Section_howto.html#howto_8">howto section</A> for more information on how to use the TIP4P pair styles and lists of parameters to set. Note that the neighobr list cutoff for Coulomb interactions is effectively extended by a distance 2*qdist when using the TIP4P pair style, to account for the offset distance of the fictitious charges on O atoms in water molecules. Thus it is typically best in an efficiency sense to use a LJ cutoff >= Coulomb cutoff + 2*qdist, to shrink the size of the neighbor list. This leads to slightly larger cost for the long-range calculation, so you can test the trade-off for your model. </P> <P>For all of the <I>lj/cut</I> pair styles, the following coefficients must be defined for each pair of atoms types via the <A HREF = "pair_coeff.html">pair_coeff</A> command as in the examples above, or in the data file or restart files read by the <A HREF = "read_data.html">read_data</A> or <A HREF = "read_restart.html">read_restart</A> commands, or by mixing as described below: </P> <UL><LI>epsilon (energy units) <LI>sigma (distance units) <LI>cutoff1 (distance units) <LI>cutoff2 (distance units) </UL> <P>Note that sigma is defined in the LJ formula as the zero-crossing distance for the potential, not as the energy minimum at 2^(1/6) sigma. </P> <P>The latter 2 coefficients are optional. If not specified, the global LJ and Coulombic cutoffs specified in the pair_style command are used. If only one cutoff is specified, it is used as the cutoff for both LJ and Coulombic interactions for this type pair. If both coefficients are specified, they are used as the LJ and Coulombic cutoffs for this type pair. You cannot specify 2 cutoffs for style <I>lj/cut</I>, since it has no Coulombic terms. 
</P> <P>For <I>lj/cut/coul/long</I> and <I>lj/cut/coul/msm</I> and <I>lj/cut/tip4p/cut</I> and <I>lj/cut/tip4p/long</I> only the LJ cutoff can be specified since a Coulombic cutoff cannot be specified for an individual I,J type pair. All type pairs use the same global Coulombic cutoff specified in the pair_style command. </P> <HR> -<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally -the same as the corresponding style without the suffix. They have -been optimized to run faster, depending on your available hardware, as -discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the -manual. The accelerated styles take the same arguments and should -produce the same results, except for round-off and precision issues. -</P> -<P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT -packages, respectively. They are only enabled if LAMMPS was built with -those packages. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> -section for more info. +<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I> suffix are +functionally the same as the corresponding style without the suffix. +They have been optimized to run faster, depending on your available +hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> +of the manual. The accelerated styles take the same arguments and +should produce the same results, except for round-off and precision +issues. +</P> +<P>These accelerated styles are part of the USER-CUDA, GPU, KOKKOS, +USER-OMP and OPT packages, respectively. They are only enabled if +LAMMPS was built with those packages. See the <A HREF = "Section_start.html#start_3">Making +LAMMPS</A> section for more info. </P> <P>You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A> when you invoke LAMMPS, or you can use the <A HREF = "suffix.html">suffix</A> command in your input script. </P> <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for more instructions on how to use the accelerated styles effectively. </P> <HR> <P><B>Mixing, shift, table, tail correction, restart, rRESPA info</B>: </P> <P>For atom type pairs I,J and I != J, the epsilon and sigma coefficients and cutoff distance for all of the lj/cut pair styles can be mixed. The default mix value is <I>geometric</I>. See the "pair_modify" command for details. </P> <P>All of the <I>lj/cut</I> pair styles support the <A HREF = "pair_modify.html">pair_modify</A> shift option for the energy of the Lennard-Jones portion of the pair interaction. </P> <P>The <I>lj/cut/coul/long</I> and <I>lj/cut/tip4p/long</I> pair styles support the <A HREF = "pair_modify.html">pair_modify</A> table option since they can tabulate the short-range portion of the long-range Coulombic interaction. </P> <P>All of the <I>lj/cut</I> pair styles support the <A HREF = "pair_modify.html">pair_modify</A> tail option for adding a long-range tail correction to the energy and pressure for the Lennard-Jones portion of the pair interaction. </P> <P>All of the <I>lj/cut</I> pair styles write their information to <A HREF = "restart.html">binary restart files</A>, so pair_style and pair_coeff commands do not need to be specified in an input script that reads a restart file. 
</P> <P>The <I>lj/cut</I> and <I>lj/cut/coul/long</I> pair styles support the use of the <I>inner</I>, <I>middle</I>, and <I>outer</I> keywords of the <A HREF = "run_style.html">run_style respa</A> command, meaning the pairwise forces can be partitioned by distance at different levels of the rRESPA hierarchy. The other styles only support the <I>pair</I> keyword of run_style respa. See the <A HREF = "run_style.html">run_style</A> command for details. </P> <HR> <P><B>Restrictions:</B> </P> <P>The <I>lj/cut/coul/long</I> and <I>lj/cut/tip4p/long</I> styles are part of the KSPACE package. The <I>lj/cut/tip4p/cut</I> style is part of the MOLECULE package. These styles are only enabled if LAMMPS was built with those packages. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> section for more info. Note that the KSPACE and MOLECULE packages are installed by default. </P> <P><B>Related commands:</B> </P> <P><A HREF = "pair_coeff.html">pair_coeff</A> </P> <P><B>Default:</B> none </P> <HR> <A NAME = "Jorgensen"></A> <P><B>(Jorgensen)</B> Jorgensen, Chandrasekhar, Madura, Impey, Klein, J Chem Phys, 79, 926 (1983). </P> <A NAME = "Fennell"></A> <P><B>(Fennell)</B> C. J. Fennell, J. D. Gezelter, J Chem Phys, 124, 234104 (2006). </P> </HTML> diff --git a/doc/pair_lj.txt b/doc/pair_lj.txt index a5613c121..fed4af04f 100644 --- a/doc/pair_lj.txt +++ b/doc/pair_lj.txt @@ -1,311 +1,313 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line pair_style lj/cut command :h3 pair_style lj/cut/cuda command :h3 pair_style lj/cut/experimental/cuda command :h3 pair_style lj/cut/gpu command :h3 +pair_style lj/cut/kk command :h3 pair_style lj/cut/opt command :h3 pair_style lj/cut/omp command :h3 pair_style lj/cut/coul/cut command :h3 pair_style lj/cut/coul/cut/cuda command :h3 pair_style lj/cut/coul/cut/gpu command :h3 pair_style lj/cut/coul/cut/omp command :h3 pair_style lj/cut/coul/debye command :h3 pair_style lj/cut/coul/debye/cuda command :h3 pair_style lj/cut/coul/debye/gpu command :h3 pair_style lj/cut/coul/debye/omp command :h3 pair_style lj/cut/coul/dsf command :h3 pair_style lj/cut/coul/dsf/gpu command :h3 pair_style lj/cut/coul/dsf/omp command :h3 pair_style lj/cut/coul/long command :h3 pair_style lj/cut/coul/long/cuda command :h3 pair_style lj/cut/coul/long/gpu command :h3 pair_style lj/cut/coul/long/opt command :h3 pair_style lj/cut/coul/long/omp command :h3 pair_style lj/cut/coul/msm command :h3 pair_style lj/cut/coul/msm/gpu command :h3 pair_style lj/cut/coul/msm/omp command :h3 pair_style lj/cut/tip4p/cut command :h3 pair_style lj/cut/tip4p/cut/omp command :h3 pair_style lj/cut/tip4p/long command :h3 pair_style lj/cut/tip4p/long/omp command :h3 pair_style lj/cut/tip4p/long/opt command :h3 [Syntax:] pair_style style args :pre style = {lj/cut} or {lj/cut/coul/cut} or {lj/cut/coul/debye} or {lj/cut/coul/dsf} or {lj/cut/coul/long} or {lj/cut/coul/msm} or {lj/cut/tip4p/long} args = list of arguments for a particular style :ul {lj/cut} args = cutoff cutoff = global cutoff for Lennard Jones interactions (distance units) {lj/cut/coul/cut} args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/coul/debye} args = kappa cutoff (cutoff2) kappa = inverse of the Debye length (inverse distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) 
(distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/coul/dsf} args = alpha cutoff (cutoff2) alpha = damping parameter (inverse distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (distance units) {lj/cut/coul/long} args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/coul/msm} args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/tip4p/cut} args = otype htype btype atype qdist cutoff (cutoff2) otype,htype = atom types for TIP4P O and H btype,atype = bond and angle types for TIP4P waters qdist = distance from O atom to massless charge (distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/tip4p/long} args = otype htype btype atype qdist cutoff (cutoff2) otype,htype = atom types for TIP4P O and H btype,atype = bond and angle types for TIP4P waters qdist = distance from O atom to massless charge (distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) :pre [Examples:] pair_style lj/cut 2.5 pair_coeff * * 1 1 pair_coeff 1 1 1 1.1 2.8 :pre pair_style lj/cut/coul/cut 10.0 pair_style lj/cut/coul/cut 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 pair_coeff 1 1 100.0 3.5 9.0 9.0 :pre pair_style lj/cut/coul/debye 1.5 3.0 pair_style lj/cut/coul/debye 1.5 2.5 5.0 pair_coeff * * 1.0 1.0 pair_coeff 1 1 1.0 1.5 2.5 pair_coeff 1 1 1.0 1.5 2.5 5.0 :pre pair_style lj/cut/coul/dsf 0.05 2.5 10.0 pair_coeff * * 1.0 1.0 pair_coeff 1 1 1.0 1.0 2.5 :pre pair_style lj/cut/coul/long 10.0 pair_style lj/cut/coul/long 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 :pre pair_style lj/cut/coul/msm 10.0 pair_style lj/cut/coul/msm 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 :pre pair_style lj/cut/tip4p/cut 1 2 7 8 0.15 12.0 pair_style lj/cut/tip4p/cut 1 2 7 8 0.15 12.0 10.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 :pre pair_style lj/cut/tip4p/long 1 2 7 8 0.15 12.0 pair_style lj/cut/tip4p/long 1 2 7 8 0.15 12.0 10.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 :pre [Description:] The {lj/cut} styles compute the standard 12/6 Lennard-Jones potential, given by :c,image(Eqs/pair_lj.jpg) Rc is the cutoff. Style {lj/cut/coul/cut} adds a Coulombic pairwise interaction given by :c,image(Eqs/pair_coulomb.jpg) where C is an energy-conversion constant, Qi and Qj are the charges on the 2 atoms, and epsilon is the dielectric constant which can be set by the "dielectric"_dielectric.html command. If one cutoff is specified in the pair_style command, it is used for both the LJ and Coulombic terms. If two cutoffs are specified, they are used as cutoffs for the LJ and Coulombic terms respectively. Style {lj/cut/coul/debye} adds an additional exp() damping factor to the Coulombic term, given by :c,image(Eqs/pair_debye.jpg) where kappa is the inverse of the Debye length. This potential is another way to mimic the screening effect of a polar solvent. 
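As a worked illustration (the numbers are hypothetical and chosen only for this example): for a Debye screening length of 2.0 distance units, kappa = 1/2.0 = 0.5, so the style could be declared with a 10.0 LJ cutoff and an 8.0 Coulombic cutoff as

# kappa = 1/(Debye length) = 1/2.0
pair_style lj/cut/coul/debye 0.5 10.0 8.0
pair_coeff * * 1.0 1.0 :pre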
Style {lj/cut/coul/dsf} computes the Coulombic term via the damped shifted force model described in "Fennell"_#Fennell, given by: :c,image(Eqs/pair_coul_dsf.jpg) where {alpha} is the damping parameter and erfc() is the complementary error-function. This potential is essentially a short-range, spherically-truncated, charge-neutralized, shifted, pairwise {1/r} summation. The potential is based on Wolf summation, proposed as an alternative to Ewald summation for condensed phase systems where charge screening causes electrostatic interactions to become effectively short-ranged. In order for the electrostatic sum to be absolutely convergent, charge neutralization within the cutoff radius is enforced by shifting the potential through placement of image charges on the cutoff sphere. Convergence can often be improved by setting {alpha} to a small non-zero value. Styles {lj/cut/coul/long} and {lj/cut/coul/msm} compute the same Coulombic interactions as style {lj/cut/coul/cut} except that an additional damping factor is applied to the Coulombic term so it can be used in conjunction with the "kspace_style"_kspace_style.html command and its {ewald} or {pppm} option. The Coulombic cutoff specified for this style means that pairwise interactions within this distance are computed directly; interactions outside that distance are computed in reciprocal space. Styles {lj/cut/tip4p/cut} and {lj/cut/tip4p/long} implement the TIP4P water model of "(Jorgensen)"_#Jorgensen, which introduces a massless site located a short distance away from the oxygen atom along the bisector of the HOH angle. The atomic types of the oxygen and hydrogen atoms, the bond and angle types for OH and HOH interactions, and the distance to the massless charge site are specified as pair_style arguments. Style {lj/cut/tip4p/cut} uses a cutoff for Coulomb interactions; style {lj/cut/tip4p/long} is for use with a long-range Coulombic solver (Ewald or PPPM). IMPORTANT NOTE: For each TIP4P water molecule in your system, the atom IDs for the O and 2 H atoms must be consecutive, with the O atom first. This is to enable LAMMPS to "find" the 2 H atoms associated with each O atom. For example, if the atom ID of an O atom in a TIP4P water molecule is 500, then its 2 H atoms must have IDs 501 and 502. See the "howto section"_Section_howto.html#howto_8 for more information on how to use the TIP4P pair styles and lists of parameters to set. Note that the neighobr list cutoff for Coulomb interactions is effectively extended by a distance 2*qdist when using the TIP4P pair style, to account for the offset distance of the fictitious charges on O atoms in water molecules. Thus it is typically best in an efficiency sense to use a LJ cutoff >= Coulomb cutoff + 2*qdist, to shrink the size of the neighbor list. This leads to slightly larger cost for the long-range calculation, so you can test the trade-off for your model. For all of the {lj/cut} pair styles, the following coefficients must be defined for each pair of atoms types via the "pair_coeff"_pair_coeff.html command as in the examples above, or in the data file or restart files read by the "read_data"_read_data.html or "read_restart"_read_restart.html commands, or by mixing as described below: epsilon (energy units) sigma (distance units) cutoff1 (distance units) cutoff2 (distance units) :ul Note that sigma is defined in the LJ formula as the zero-crossing distance for the potential, not as the energy minimum at 2^(1/6) sigma. The latter 2 coefficients are optional. 
If not specified, the global LJ and Coulombic cutoffs specified in the pair_style command are used. If only one cutoff is specified, it is used as the cutoff for both LJ and Coulombic interactions for this type pair. If both coefficients are specified, they are used as the LJ and Coulombic cutoffs for this type pair. You cannot specify 2 cutoffs for style {lj/cut}, since it has no Coulombic terms. For {lj/cut/coul/long} and {lj/cut/coul/msm} and {lj/cut/tip4p/cut} and {lj/cut/tip4p/long} only the LJ cutoff can be specified since a Coulombic cutoff cannot be specified for an individual I,J type pair. All type pairs use the same global Coulombic cutoff specified in the pair_style command. :line -Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally -the same as the corresponding style without the suffix. They have -been optimized to run faster, depending on your available hardware, as -discussed in "Section_accelerate"_Section_accelerate.html of the -manual. The accelerated styles take the same arguments and should -produce the same results, except for round-off and precision issues. - -These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT -packages, respectively. They are only enabled if LAMMPS was built with -those packages. See the "Making LAMMPS"_Section_start.html#start_3 -section for more info. +Styles with a {cuda}, {gpu}, {kk}, {omp}, or {opt} suffix are +functionally the same as the corresponding style without the suffix. +They have been optimized to run faster, depending on your available +hardware, as discussed in "Section_accelerate"_Section_accelerate.html +of the manual. The accelerated styles take the same arguments and +should produce the same results, except for round-off and precision +issues. + +These accelerated styles are part of the USER-CUDA, GPU, KOKKOS, +USER-OMP and OPT packages, respectively. They are only enabled if +LAMMPS was built with those packages. See the "Making +LAMMPS"_Section_start.html#start_3 section for more info. You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the "-suffix command-line switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can use the "suffix"_suffix.html command in your input script. See "Section_accelerate"_Section_accelerate.html of the manual for more instructions on how to use the accelerated styles effectively. :line [Mixing, shift, table, tail correction, restart, rRESPA info]: For atom type pairs I,J and I != J, the epsilon and sigma coefficients and cutoff distance for all of the lj/cut pair styles can be mixed. The default mix value is {geometric}. See the "pair_modify" command for details. All of the {lj/cut} pair styles support the "pair_modify"_pair_modify.html shift option for the energy of the Lennard-Jones portion of the pair interaction. The {lj/cut/coul/long} and {lj/cut/tip4p/long} pair styles support the "pair_modify"_pair_modify.html table option since they can tabulate the short-range portion of the long-range Coulombic interaction. All of the {lj/cut} pair styles support the "pair_modify"_pair_modify.html tail option for adding a long-range tail correction to the energy and pressure for the Lennard-Jones portion of the pair interaction. All of the {lj/cut} pair styles write their information to "binary restart files"_restart.html, so pair_style and pair_coeff commands do not need to be specified in an input script that reads a restart file. 
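As a minimal sketch of the restart behavior just described (the file name lj.restart and the run length are assumptions for this example), a follow-up input script can omit the pair_style and pair_coeff lines entirely:

# pair_style and pair_coeff settings are recovered from the restart file
read_restart lj.restart
run 1000 :pre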
The {lj/cut} and {lj/cut/coul/long} pair styles support the use of the {inner}, {middle}, and {outer} keywords of the "run_style respa"_run_style.html command, meaning the pairwise forces can be partitioned by distance at different levels of the rRESPA hierarchy. The other styles only support the {pair} keyword of run_style respa. See the "run_style"_run_style.html command for details. :line [Restrictions:] The {lj/cut/coul/long} and {lj/cut/tip4p/long} styles are part of the KSPACE package. The {lj/cut/tip4p/cut} style is part of the MOLECULE package. These styles are only enabled if LAMMPS was built with those packages. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. Note that the KSPACE and MOLECULE packages are installed by default. [Related commands:] "pair_coeff"_pair_coeff.html [Default:] none :line :link(Jorgensen) [(Jorgensen)] Jorgensen, Chandrasekhar, Madura, Impey, Klein, J Chem Phys, 79, 926 (1983). :link(Fennell) [(Fennell)] C. J. Fennell, J. D. Gezelter, J Chem Phys, 124, 234104 (2006). diff --git a/doc/pair_table.html b/doc/pair_table.html index c86d12afc..21d9fc9d1 100644 --- a/doc/pair_table.html +++ b/doc/pair_table.html @@ -1,260 +1,263 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>pair_style table command </H3> <H3>pair_style table/gpu command </H3> +<H3>pair_style table/kk command +</H3> <H3>pair_style table/omp command </H3> <P><B>Syntax:</B> </P> <PRE>pair_style table style N keyword ... </PRE> <UL><LI>style = <I>lookup</I> or <I>linear</I> or <I>spline</I> or <I>bitmap</I> = method of interpolation <LI>N = use N values in <I>lookup</I>, <I>linear</I>, <I>spline</I> tables <LI>N = use 2^N values in <I>bitmap</I> tables <LI>zero or more keywords may be appended <LI>keyword = <I>ewald</I> or <I>pppm</I> or <I>msm</I> or <I>dispersion</I> or <I>tip4p</I> </UL> <P><B>Examples:</B> </P> <PRE>pair_style table linear 1000 pair_style table linear 1000 pppm pair_style table bitmap 12 pair_coeff * 3 morse.table ENTRY1 pair_coeff * 3 morse.table ENTRY1 7.0 </PRE> <P><B>Description:</B> </P> <P>Style <I>table</I> creates interpolation tables of length <I>N</I> from pair potential and force values listed in a file(s) as a function of distance. The files are read by the <A HREF = "pair_coeff.html">pair_coeff</A> command. </P> <P>The interpolation tables are created by fitting cubic splines to the file values and interpolating energy and force values at each of <I>N</I> distances. During a simulation, these tables are used to interpolate energy and force values as needed. The interpolation is done in one of 4 styles: <I>lookup</I>, <I>linear</I>, <I>spline</I>, or <I>bitmap</I>. </P> <P>For the <I>lookup</I> style, the distance between 2 atoms is used to find the nearest table entry, which is the energy or force. </P> <P>For the <I>linear</I> style, the pair distance is used to find 2 surrounding table values from which an energy or force is computed by linear interpolation. </P> <P>For the <I>spline</I> style, a cubic spline coefficients are computed and stored at each of the <I>N</I> values in the table. The pair distance is used to find the appropriate set of coefficients which are used to evaluate a cubic polynomial which computes the energy or force. </P> <P>For the <I>bitmap</I> style, the N means to create interpolation tables that are 2^N in length. 
<The pair distance is used to index into the table via a fast bit-mapping technique <A HREF = "#Wolff">(Wolff)</A> and a linear interpolation is performed between adjacent table values. </P> <P>The following coefficients must be defined for each pair of atoms types via the <A HREF = "pair_coeff.html">pair_coeff</A> command as in the examples above. </P> <UL><LI>filename <LI>keyword <LI>cutoff (distance units) </UL> <P>The filename specifies a file containing tabulated energy and force values. The keyword specifies a section of the file. The cutoff is an optional coefficient. If not specified, the outer cutoff in the table itself (see below) will be used to build an interpolation table that extend to the largest tabulated distance. If specified, only file values up to the cutoff are used to create the interpolation table. The format of this file is described below. </P> <P>If your tabulated potential(s) are designed to be used as the short-range part of one of the long-range solvers specified by the <A HREF = "kspace_style.html">kspace_style</A> command, then you must use one or more of the optional keywords listed above for the pair_style command. These are <I>ewald</I> or <I>pppm</I> or <I>msm</I> or <I>dispersion</I> or <I>tip4p</I>. This is so LAMMPS can insure the short-range potential and long-range solver are compatible with each other, as it does for other short-range pair styles, such as <A HREF = "pair_lj.html">pair_style lj/cut/coul/long</A>. Note that it is up to you to insure the tabulated values for each pair of atom types has the correct functional form to be compatible with the matching long-range solver. </P> <HR> <P>Here are some guidelines for using the pair_style table command to best effect: </P> <UL><LI>Vary the number of table points; you may need to use more than you think to get good resolution. <LI>Always use the <A HREF = "pair_write.html">pair_write</A> command to produce a plot of what the final interpolated potential looks like. This can show up interpolation "features" you may not like. <LI>Start with the linear style; it's the style least likely to have problems. <LI>Use <I>N</I> in the pair_style command equal to the "N" in the tabulation file, and use the "RSQ" or "BITMAP" parameter, so additional interpolation is not needed. See discussion below. <LI>Make sure that your tabulated forces and tabulated energies are consistent (dE/dr = -F) along the entire range of r values. <LI>Use as large an inner cutoff as possible. This avoids fitting splines to very steep parts of the potential. </UL> <HR> <P>The format of a tabulated file is as follows (without the parenthesized comments): </P> <PRE># Morse potential for Fe (one or more comment or blank lines) </PRE> <PRE>MORSE_FE (keyword is first text on line) N 500 R 1.0 10.0 (N, R, RSQ, BITMAP, FPRIME parameters) (blank) 1 1.0 25.5 102.34 (index, r, energy, force) 2 1.02 23.4 98.5 ... 500 10.0 0.001 0.003 </PRE> <P>A section begins with a non-blank line whose 1st character is not a "#"; blank lines or lines starting with "#" can be used as comments between sections. The first line begins with a keyword which identifies the section. The line can contain additional text, but the initial text must match the argument specified in the pair_coeff command. The next line lists (in any order) one or more parameters for the table. Each parameter is a keyword followed by one or more numeric values. </P> <P>The parameter "N" is required and its value is the number of table entries that follow. 
Note that this may be different than the <I>N</I> specified in the <A HREF = "pair_style.html">pair_style table</A> command. Let Ntable = <I>N</I> in the pair_style command, and Nfile = "N" in the tabulated file. What LAMMPS does is a preliminary interpolation by creating splines using the Nfile tabulated values as nodal points. It uses these to interpolate as needed to generate energy and force values at Ntable different points. The resulting tables of length Ntable are then used as described above, when computing energy and force for individual pair distances. This means that if you want the interpolation tables of length Ntable to match exactly what is in the tabulated file (with effectively no preliminary interpolation), you should set Ntable = Nfile, and use the "RSQ" or "BITMAP" parameter. The internal table abscissa is RSQ (separation distance squared). </P> <P>All other parameters are optional. If "R" or "RSQ" or "BITMAP" does not appear, then the distances in each line of the table are used as-is to perform spline interpolation. In this case, the table values can be spaced in <I>r</I> uniformly or however you wish to position table values in regions of large gradients. </P> <P>If used, the parameters "R" or "RSQ" are followed by 2 values <I>rlo</I> and <I>rhi</I>. If specified, the distance associated with each energy and force value is computed from these 2 values (at high accuracy), rather than using the (low-accuracy) value listed in each line of the table. The distance values in the table file are ignored in this case. For "R", distances uniformly spaced between <I>rlo</I> and <I>rhi</I> are computed; for "RSQ", squared distances uniformly spaced between <I>rlo*rlo</I> and <I>rhi*rhi</I> are computed. </P> <P>If used, the parameter "BITMAP" is also followed by 2 values <I>rlo</I> and <I>rhi</I>. These values, along with the "N" value determine the ordering of the N lines that follow and what distance is associated with each. This ordering is complex, so it is not documented here, since this file is typically produced by the <A HREF = "pair_write.html">pair_write</A> command with its <I>bitmap</I> option. When the table is in BITMAP format, the "N" parameter in the file must be equal to 2^M where M is the value specified in the pair_style command. Also, a cutoff parameter cannot be used as an optional 3rd argument in the pair_coeff command; the entire table extent as specified in the file must be used. </P> <P>If used, the parameter "FPRIME" is followed by 2 values <I>fplo</I> and <I>fphi</I> which are the derivative of the force at the innermost and outermost distances listed in the table. These values are needed by the spline construction routines. If not specified by the "FPRIME" parameter, they are estimated (less accurately) by the first 2 and last 2 force values in the table. This parameter is not used by BITMAP tables. </P> <P>Following a blank line, the next N lines list the tabulated values. On each line, the 1st value is the index from 1 to N, the 2nd value is r (in distance units), the 3rd value is the energy (in energy units), and the 4th is the force (in force units). The r values must increase from one line to the next (unless the BITMAP parameter is specified). </P> <P>Note that one file can contain many sections, each with a tabulated potential. LAMMPS reads the file section by section until it finds one that matches the specified keyword. 
</P> <HR> -<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>omp</I>, or <I>opt</I> suffix are functionally -the same as the corresponding style without the suffix. They have -been optimized to run faster, depending on your available hardware, as -discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> of the -manual. The accelerated styles take the same arguments and should -produce the same results, except for round-off and precision issues. -</P> -<P>These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT -packages, respectively. They are only enabled if LAMMPS was built with -those packages. See the <A HREF = "Section_start.html#start_3">Making LAMMPS</A> -section for more info. +<P>Styles with a <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I> suffix are +functionally the same as the corresponding style without the suffix. +They have been optimized to run faster, depending on your available +hardware, as discussed in <A HREF = "Section_accelerate.html">Section_accelerate</A> +of the manual. The accelerated styles take the same arguments and +should produce the same results, except for round-off and precision +issues. +</P> +<P>These accelerated styles are part of the USER-CUDA, GPU, KOKKOS, +USER-OMP and OPT packages, respectively. They are only enabled if +LAMMPS was built with those packages. See the <A HREF = "Section_start.html#start_3">Making +LAMMPS</A> section for more info. </P> <P>You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A> when you invoke LAMMPS, or you can use the <A HREF = "suffix.html">suffix</A> command in your input script. </P> <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for more instructions on how to use the accelerated styles effectively. </P> <HR> <P><B>Mixing, shift, table, tail correction, restart, rRESPA info</B>: </P> <P>This pair style does not support mixing. Thus, coefficients for all I,J pairs must be specified explicitly. </P> <P>The <A HREF = "pair_modify.html">pair_modify</A> shift, table, and tail options are not relevant for this pair style. </P> <P>This pair style writes the settings for the "pair_style table" command to <A HREF = "restart.html">binary restart files</A>, so a pair_style command does not need to specified in an input script that reads a restart file. However, the coefficient information is not stored in the restart file, since it is tabulated in the potential files. Thus, pair_coeff commands do need to be specified in the restart input script. </P> <P>This pair style can only be used via the <I>pair</I> keyword of the <A HREF = "run_style.html">run_style respa</A> command. It does not support the <I>inner</I>, <I>middle</I>, <I>outer</I> keywords. </P> <HR> <P><B>Restrictions:</B> none </P> <P><B>Related commands:</B> </P> <P><A HREF = "pair_coeff.html">pair_coeff</A> </P> <P><B>Default:</B> none </P> <HR> <A NAME = "Wolff"></A> <P><B>(Wolff)</B> Wolff and Rudd, Comp Phys Comm, 120, 200-32 (1999). 
</P> </HTML> diff --git a/doc/pair_table.txt b/doc/pair_table.txt index 9f221190a..4b83d2a8f 100644 --- a/doc/pair_table.txt +++ b/doc/pair_table.txt @@ -1,252 +1,254 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line pair_style table command :h3 pair_style table/gpu command :h3 +pair_style table/kk command :h3 pair_style table/omp command :h3 [Syntax:] pair_style table style N keyword ... :pre style = {lookup} or {linear} or {spline} or {bitmap} = method of interpolation N = use N values in {lookup}, {linear}, {spline} tables N = use 2^N values in {bitmap} tables zero or more keywords may be appended keyword = {ewald} or {pppm} or {msm} or {dispersion} or {tip4p} :ul [Examples:] pair_style table linear 1000 pair_style table linear 1000 pppm pair_style table bitmap 12 pair_coeff * 3 morse.table ENTRY1 pair_coeff * 3 morse.table ENTRY1 7.0 :pre [Description:] Style {table} creates interpolation tables of length {N} from pair potential and force values listed in a file(s) as a function of distance. The files are read by the "pair_coeff"_pair_coeff.html command. The interpolation tables are created by fitting cubic splines to the file values and interpolating energy and force values at each of {N} distances. During a simulation, these tables are used to interpolate energy and force values as needed. The interpolation is done in one of 4 styles: {lookup}, {linear}, {spline}, or {bitmap}. For the {lookup} style, the distance between 2 atoms is used to find the nearest table entry, which is the energy or force. For the {linear} style, the pair distance is used to find 2 surrounding table values from which an energy or force is computed by linear interpolation. For the {spline} style, a cubic spline coefficients are computed and stored at each of the {N} values in the table. The pair distance is used to find the appropriate set of coefficients which are used to evaluate a cubic polynomial which computes the energy or force. For the {bitmap} style, the N means to create interpolation tables that are 2^N in length. <The pair distance is used to index into the table via a fast bit-mapping technique "(Wolff)"_#Wolff and a linear interpolation is performed between adjacent table values. The following coefficients must be defined for each pair of atoms types via the "pair_coeff"_pair_coeff.html command as in the examples above. filename keyword cutoff (distance units) :ul The filename specifies a file containing tabulated energy and force values. The keyword specifies a section of the file. The cutoff is an optional coefficient. If not specified, the outer cutoff in the table itself (see below) will be used to build an interpolation table that extend to the largest tabulated distance. If specified, only file values up to the cutoff are used to create the interpolation table. The format of this file is described below. If your tabulated potential(s) are designed to be used as the short-range part of one of the long-range solvers specified by the "kspace_style"_kspace_style.html command, then you must use one or more of the optional keywords listed above for the pair_style command. These are {ewald} or {pppm} or {msm} or {dispersion} or {tip4p}. This is so LAMMPS can insure the short-range potential and long-range solver are compatible with each other, as it does for other short-range pair styles, such as "pair_style lj/cut/coul/long"_pair_lj.html. 
Note that it is up to you to insure the tabulated values for each pair of atom types has the correct functional form to be compatible with the matching long-range solver. :line Here are some guidelines for using the pair_style table command to best effect: Vary the number of table points; you may need to use more than you think to get good resolution. :ulb,l Always use the "pair_write"_pair_write.html command to produce a plot of what the final interpolated potential looks like. This can show up interpolation "features" you may not like. :l Start with the linear style; it's the style least likely to have problems. :l Use {N} in the pair_style command equal to the "N" in the tabulation file, and use the "RSQ" or "BITMAP" parameter, so additional interpolation is not needed. See discussion below. :l Make sure that your tabulated forces and tabulated energies are consistent (dE/dr = -F) along the entire range of r values. :l Use as large an inner cutoff as possible. This avoids fitting splines to very steep parts of the potential. :l,ule :line The format of a tabulated file is as follows (without the parenthesized comments): # Morse potential for Fe (one or more comment or blank lines) :pre MORSE_FE (keyword is first text on line) N 500 R 1.0 10.0 (N, R, RSQ, BITMAP, FPRIME parameters) (blank) 1 1.0 25.5 102.34 (index, r, energy, force) 2 1.02 23.4 98.5 ... 500 10.0 0.001 0.003 :pre A section begins with a non-blank line whose 1st character is not a "#"; blank lines or lines starting with "#" can be used as comments between sections. The first line begins with a keyword which identifies the section. The line can contain additional text, but the initial text must match the argument specified in the pair_coeff command. The next line lists (in any order) one or more parameters for the table. Each parameter is a keyword followed by one or more numeric values. The parameter "N" is required and its value is the number of table entries that follow. Note that this may be different than the {N} specified in the "pair_style table"_pair_style.html command. Let Ntable = {N} in the pair_style command, and Nfile = "N" in the tabulated file. What LAMMPS does is a preliminary interpolation by creating splines using the Nfile tabulated values as nodal points. It uses these to interpolate as needed to generate energy and force values at Ntable different points. The resulting tables of length Ntable are then used as described above, when computing energy and force for individual pair distances. This means that if you want the interpolation tables of length Ntable to match exactly what is in the tabulated file (with effectively no preliminary interpolation), you should set Ntable = Nfile, and use the "RSQ" or "BITMAP" parameter. The internal table abscissa is RSQ (separation distance squared). All other parameters are optional. If "R" or "RSQ" or "BITMAP" does not appear, then the distances in each line of the table are used as-is to perform spline interpolation. In this case, the table values can be spaced in {r} uniformly or however you wish to position table values in regions of large gradients. If used, the parameters "R" or "RSQ" are followed by 2 values {rlo} and {rhi}. If specified, the distance associated with each energy and force value is computed from these 2 values (at high accuracy), rather than using the (low-accuracy) value listed in each line of the table. The distance values in the table file are ignored in this case. 
For "R", distances uniformly spaced between {rlo} and {rhi} are computed; for "RSQ", squared distances uniformly spaced between {rlo*rlo} and {rhi*rhi} are computed. If used, the parameter "BITMAP" is also followed by 2 values {rlo} and {rhi}. These values, along with the "N" value determine the ordering of the N lines that follow and what distance is associated with each. This ordering is complex, so it is not documented here, since this file is typically produced by the "pair_write"_pair_write.html command with its {bitmap} option. When the table is in BITMAP format, the "N" parameter in the file must be equal to 2^M where M is the value specified in the pair_style command. Also, a cutoff parameter cannot be used as an optional 3rd argument in the pair_coeff command; the entire table extent as specified in the file must be used. If used, the parameter "FPRIME" is followed by 2 values {fplo} and {fphi} which are the derivative of the force at the innermost and outermost distances listed in the table. These values are needed by the spline construction routines. If not specified by the "FPRIME" parameter, they are estimated (less accurately) by the first 2 and last 2 force values in the table. This parameter is not used by BITMAP tables. Following a blank line, the next N lines list the tabulated values. On each line, the 1st value is the index from 1 to N, the 2nd value is r (in distance units), the 3rd value is the energy (in energy units), and the 4th is the force (in force units). The r values must increase from one line to the next (unless the BITMAP parameter is specified). Note that one file can contain many sections, each with a tabulated potential. LAMMPS reads the file section by section until it finds one that matches the specified keyword. :line -Styles with a {cuda}, {gpu}, {omp}, or {opt} suffix are functionally -the same as the corresponding style without the suffix. They have -been optimized to run faster, depending on your available hardware, as -discussed in "Section_accelerate"_Section_accelerate.html of the -manual. The accelerated styles take the same arguments and should -produce the same results, except for round-off and precision issues. - -These accelerated styles are part of the USER-CUDA, GPU, USER-OMP and OPT -packages, respectively. They are only enabled if LAMMPS was built with -those packages. See the "Making LAMMPS"_Section_start.html#start_3 -section for more info. +Styles with a {cuda}, {gpu}, {kk}, {omp}, or {opt} suffix are +functionally the same as the corresponding style without the suffix. +They have been optimized to run faster, depending on your available +hardware, as discussed in "Section_accelerate"_Section_accelerate.html +of the manual. The accelerated styles take the same arguments and +should produce the same results, except for round-off and precision +issues. + +These accelerated styles are part of the USER-CUDA, GPU, KOKKOS, +USER-OMP and OPT packages, respectively. They are only enabled if +LAMMPS was built with those packages. See the "Making +LAMMPS"_Section_start.html#start_3 section for more info. You can specify the accelerated styles explicitly in your input script by including their suffix, or you can use the "-suffix command-line switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can use the "suffix"_suffix.html command in your input script. See "Section_accelerate"_Section_accelerate.html of the manual for more instructions on how to use the accelerated styles effectively. 
:line [Mixing, shift, table, tail correction, restart, rRESPA info]: This pair style does not support mixing. Thus, coefficients for all I,J pairs must be specified explicitly. The "pair_modify"_pair_modify.html shift, table, and tail options are not relevant for this pair style. This pair style writes the settings for the "pair_style table" command to "binary restart files"_restart.html, so a pair_style command does not need to specified in an input script that reads a restart file. However, the coefficient information is not stored in the restart file, since it is tabulated in the potential files. Thus, pair_coeff commands do need to be specified in the restart input script. This pair style can only be used via the {pair} keyword of the "run_style respa"_run_style.html command. It does not support the {inner}, {middle}, {outer} keywords. :line [Restrictions:] none [Related commands:] "pair_coeff"_pair_coeff.html [Default:] none :line :link(Wolff) [(Wolff)] Wolff and Rudd, Comp Phys Comm, 120, 200-32 (1999). diff --git a/doc/suffix.html b/doc/suffix.html index c599782f1..0543a8a32 100644 --- a/doc/suffix.html +++ b/doc/suffix.html @@ -1,87 +1,93 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>suffix command </H3> <P><B>Syntax:</B> </P> <PRE>suffix style </PRE> -<UL><LI>style = <I>off</I> or <I>on</I> or <I>opt</I> or <I>omp</I> or <I>gpu</I> or <I>cuda</I> +<UL><LI>style = <I>off</I> or <I>on</I> or <I>cuda</I> or <I>gpu</I> or <I>kk</I> or <I>omp</I> or <I>opt</I> </UL> <P><B>Examples:</B> </P> <PRE>suffix off suffix on -suffix gpu +suffix gpu +suffix kk </PRE> <P><B>Description:</B> </P> <P>This command allows you to use variants of various styles if they exist. In that respect it operates the same as the <A HREF = "Section_start.html#start_7">-suffix command-line switch</A>. It also has options -to turn off/on any suffix setting made via the command line. +to turn off or back on any suffix setting made via the command line. </P> -<P>The specified style can be <I>opt</I>, <I>omp</I>, <I>gpu</I>, or <I>cuda</I>. These refer to -optional packages that LAMMPS can be built with, as described in <A HREF = "Section_start.html#start_3">this -section of the manual</A>. The "opt" style -corrsponds to the OPT package, the "omp" style to the USER-OMP package, -the "gpu" style to the GPU package, and the "cuda" style to the -USER-CUDA package. +<P>The specified style can be <I>cuda</I>, <I>gpu</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I>. +These refer to optional packages that LAMMPS can be built with, as +described in <A HREF = "Section_start.html#start_3">this section of the manual</A>. 
+The "cuda" style corresponds to the USER-CUDA package, the "gpu" style +to the GPU package, the "kk" style to the KOKKOS package, the "omp" +style to the USER-OMP package, and the "opt" style to the OPT package, </P> <P>These are the variants these packages provide: </P> -<UL><LI>OPT = a handful of pair styles, cache-optimized for faster CPU -performance +<UL><LI>USER-CUDA = a collection of atom, pair, fix, compute, and intergrate +styles, optimized to run on one or more NVIDIA GPUs + +<LI>GPU = a handful of pair styles and the PPPM kspace_style, optimized to +run on one or more GPUs or multicore CPU/GPU nodes + +<LI>KOKKOS = a collection of atom, pair, and fix styles optimized to run +using the Kokkos library on various kinds of hardware, including GPUs +via Cuda and many-core chips via OpenMP or threading. <LI>USER-OMP = a collection of pair, bond, angle, dihedral, improper, kspace, compute, and fix styles with support for OpenMP multi-threading -<LI>GPU = a handful of pair styles and the PPPM kspace_style, optimized to -run on one or more GPUs or multicore CPU/GPU nodes - -<LI>USER-CUDA = a collection of atom, pair, fix, compute, and intergrate -styles, optimized to run on one or more NVIDIA GPUs +<LI>OPT = a handful of pair styles, cache-optimized for faster CPU +performance </UL> <P>As an example, all of the packages provide a <A HREF = "pair_lj.html">pair_style lj/cut</A> variant, with style names lj/cut/opt, lj/cut/omp, -lj/cut/gpu, or lj/cut/cuda. A variant styles can be specified -explicitly in your input script, e.g. pair_style lj/cut/gpu. If the -suffix command is used with the appropriate style, you do not need to -modify your input script. The specified suffix (opt,omp,gpu,cuda) is -automatically appended whenever your input script command creates a -new <A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, -<A HREF = "bond_style.html">bond</A>, <A HREF = "angle_style.html">angle</A>, -<A HREF = "dihedral_style.html">dihedral</A>, <A HREF = "improper_style.html">improper</A>, -<A HREF = "kspace_style.html">kspace</A>, <A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or -<A HREF = "run_style.html">run</A> style. If the variant version does not exist, -the standard version is created. +lj/cut/gpu, lj/cut/cuda, or lj/cut/kk. A variant styles can be +specified explicitly in your input script, e.g. pair_style lj/cut/gpu. +If the suffix command is used with the appropriate style, you do not +need to modify your input script. The specified suffix +(opt,omp,gpu,cuda,kk) is automatically appended whenever your input +script command creates a new <A HREF = "atom_style.html">atom</A>, +<A HREF = "pair_style.html">pair</A>, <A HREF = "bond_style.html">bond</A>, +<A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>, +<A HREF = "improper_style.html">improper</A>, <A HREF = "kspace_style.html">kspace</A>, +<A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or <A HREF = "run_style.html">run</A> style. +If the variant version does not exist, the standard version is +created. </P> <P>If the specified style is <I>off</I>, then any previously specified suffix is temporarily disabled, whether it was specified by a command-line switch or a previous suffix command. If the specified style is <I>on</I>, a disabled suffix is turned back on. The use of these 2 commands lets your input script use a standard LAMMPS style (i.e. a non-accelerated variant), which can be useful for testing or benchmarking purposes. 
Of course this is also possible by not using any suffix commands, and explicitly appending or not appending the suffix to the relevant commands in your input script. </P> <P><B>Restrictions:</B> none </P> <P><B>Related commands:</B> </P> <P><A HREF = "Section_start.html#start_7">Command-line switch -suffix</A> </P> <P><B>Default:</B> none </P> </HTML> diff --git a/doc/suffix.txt b/doc/suffix.txt index be2c1c26f..42675d252 100644 --- a/doc/suffix.txt +++ b/doc/suffix.txt @@ -1,82 +1,88 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line suffix command :h3 [Syntax:] suffix style :pre -style = {off} or {on} or {opt} or {omp} or {gpu} or {cuda} :ul +style = {off} or {on} or {cuda} or {gpu} or {kk} or {omp} or {opt} :ul [Examples:] suffix off suffix on -suffix gpu :pre +suffix gpu +suffix kk :pre [Description:] This command allows you to use variants of various styles if they exist. In that respect it operates the same as the "-suffix command-line switch"_Section_start.html#start_7. It also has options -to turn off/on any suffix setting made via the command line. +to turn off or back on any suffix setting made via the command line. -The specified style can be {opt}, {omp}, {gpu}, or {cuda}. These refer to -optional packages that LAMMPS can be built with, as described in "this -section of the manual"_Section_start.html#start_3. The "opt" style -corrsponds to the OPT package, the "omp" style to the USER-OMP package, -the "gpu" style to the GPU package, and the "cuda" style to the -USER-CUDA package. +The specified style can be {cuda}, {gpu}, {kk}, {omp}, or {opt}. +These refer to optional packages that LAMMPS can be built with, as +described in "this section of the manual"_Section_start.html#start_3. +The "cuda" style corresponds to the USER-CUDA package, the "gpu" style +to the GPU package, the "kk" style to the KOKKOS package, the "omp" +style to the USER-OMP package, and the "opt" style to the OPT package. These are the variants these packages provide: -OPT = a handful of pair styles, cache-optimized for faster CPU -performance :ulb,l +USER-CUDA = a collection of atom, pair, fix, compute, and integrate +styles, optimized to run on one or more NVIDIA GPUs :ulb,l + +GPU = a handful of pair styles and the PPPM kspace_style, optimized to +run on one or more GPUs or multicore CPU/GPU nodes :l + +KOKKOS = a collection of atom, pair, and fix styles optimized to run +using the Kokkos library on various kinds of hardware, including GPUs +via CUDA and many-core chips via OpenMP or threading. :l USER-OMP = a collection of pair, bond, angle, dihedral, improper, kspace, compute, and fix styles with support for OpenMP multi-threading :l -GPU = a handful of pair styles and the PPPM kspace_style, optimized to -run on one or more GPUs or multicore CPU/GPU nodes :l - -USER-CUDA = a collection of atom, pair, fix, compute, and intergrate -styles, optimized to run on one or more NVIDIA GPUs :l,ule +OPT = a handful of pair styles, cache-optimized for faster CPU +performance :ule,l As an example, all of the packages provide a "pair_style lj/cut"_pair_lj.html variant, with style names lj/cut/opt, lj/cut/omp, -lj/cut/gpu, or lj/cut/cuda. A variant styles can be specified -explicitly in your input script, e.g. pair_style lj/cut/gpu. If the -suffix command is used with the appropriate style, you do not need to -modify your input script.
The specified suffix (opt,omp,gpu,cuda) is -automatically appended whenever your input script command creates a -new "atom"_atom_style.html, "pair"_pair_style.html, -"bond"_bond_style.html, "angle"_angle_style.html, -"dihedral"_dihedral_style.html, "improper"_improper_style.html, -"kspace"_kspace_style.html, "fix"_fix.html, "compute"_compute.html, or -"run"_run_style.html style. If the variant version does not exist, -the standard version is created. +lj/cut/gpu, lj/cut/cuda, or lj/cut/kk. A variant style can be +specified explicitly in your input script, e.g. pair_style lj/cut/gpu. +If the suffix command is used with the appropriate style, you do not +need to modify your input script. The specified suffix +(opt,omp,gpu,cuda,kk) is automatically appended whenever your input +script command creates a new "atom"_atom_style.html, +"pair"_pair_style.html, "bond"_bond_style.html, +"angle"_angle_style.html, "dihedral"_dihedral_style.html, +"improper"_improper_style.html, "kspace"_kspace_style.html, +"fix"_fix.html, "compute"_compute.html, or "run"_run_style.html style. +If the variant version does not exist, the standard version is +created. If the specified style is {off}, then any previously specified suffix is temporarily disabled, whether it was specified by a command-line switch or a previous suffix command. If the specified style is {on}, a disabled suffix is turned back on. The use of these 2 commands lets your input script use a standard LAMMPS style (i.e. a non-accelerated variant), which can be useful for testing or benchmarking purposes. Of course this is also possible by not using any suffix commands, and explicitly appending or not appending the suffix to the relevant commands in your input script. [Restrictions:] none [Related commands:] "Command-line switch -suffix"_Section_start.html#start_7 [Default:] none diff --git a/doc/velocity.html b/doc/velocity.html index ac1a17289..53513fbe4 100644 --- a/doc/velocity.html +++ b/doc/velocity.html @@ -1,222 +1,239 @@ <HTML> <CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> </CENTER> <HR> <H3>velocity command </H3> <P><B>Syntax:</B> </P> <PRE>velocity group-ID style args keyword value ...
</PRE> <UL><LI>group-ID = ID of group of atoms whose velocity will be changed <LI>style = <I>create</I> or <I>set</I> or <I>scale</I> or <I>ramp</I> or <I>zero</I> <PRE> <I>create</I> args = temp seed temp = temperature value (temperature units) seed = random # seed (positive integer) <I>set</I> args = vx vy vz vx,vy,vz = velocity value or NULL (velocity units) any of vx,vy,vz van be a variable (see below) <I>scale</I> arg = temp temp = temperature value (temperature units) <I>ramp</I> args = vdim vlo vhi dim clo chi vdim = <I>vx</I> or <I>vy</I> or <I>vz</I> vlo,vhi = lower and upper velocity value (velocity units) dim = <I>x</I> or <I>y</I> or <I>z</I> clo,chi = lower and upper coordinate bound (distance units) <I>zero</I> arg = <I>linear</I> or <I>angular</I> <I>linear</I> = zero the linear momentum <I>angular</I> = zero the angular momentum </PRE> <LI>zero or more keyword/value pairs may be appended <LI>keyword = <I>dist</I> or <I>sum</I> or <I>mom</I> or <I>rot</I> or <I>temp</I> or <I>loop</I> or <I>units</I> <PRE> <I>dist</I> value = <I>uniform</I> or <I>gaussian</I> <I>sum</I> value = <I>no</I> or <I>yes</I> <I>mom</I> value = <I>no</I> or <I>yes</I> <I>rot</I> value = <I>no</I> or <I>yes</I> <I>temp</I> value = temperature ID <I>loop</I> value = <I>all</I> or <I>local</I> or <I>geom</I> <I>rigid</I> value = fix-ID fix-ID = ID of rigid body fix <I>units</I> value = <I>box</I> or <I>lattice</I> </PRE> </UL> <P><B>Examples:</B> </P> <PRE>velocity all create 300.0 4928459 rot yes dist gaussian velocity border set NULL 4.0 v_vz sum yes units box velocity flow scale 300.0 velocity flow ramp vx 0.0 5.0 y 5 25 temp mytemp velocity all zero linear </PRE> <P><B>Description:</B> </P> <P>Set or change the velocities of a group of atoms in one of several styles. For each style, there are required arguments and optional keyword/value parameters. Not all options are used by each style. Each option has a default as listed below. </P> <P>The <I>create</I> style generates an ensemble of velocities using a random number generator with the specified seed as the specified temperature. </P> <P>The <I>set</I> style sets the velocities of all atoms in the group to the specified values. If any component is specified as NULL, then it is not set. Any of the vx,vy,vz velocity components can be specified as an equal-style or atom-style <A HREF = "variable.html">variable</A>. If the value is a variable, it should be specified as v_name, where name is the variable name. In this case, the variable will be evaluated, and its value used to determine the velocity component. Note that if a variable is used, the velocity it calculates must be in box units, not lattice units; see the discussion of the <I>units</I> keyword below. </P> <P>Equal-style variables can specify formulas with various mathematical functions, and include <A HREF = "thermo_style.html">thermo_style</A> command keywords for the simulation box parameters or other parameters. </P> <P>Atom-style variables can specify the same formulas as equal-style variables but can also include per-atom values, such as atom coordinates. Thus it is easy to specify a spatially-dependent velocity field. </P> <P>The <I>scale</I> style computes the current temperature of the group of atoms and then rescales the velocities to the specified temperature. </P> <P>The <I>ramp</I> style is similar to that used by the <A HREF = "compute_temp_ramp.html">compute temp/ramp</A> command. Velocities ramped uniformly from vlo to vhi are applied to dimension vx, or vy, or vz. 
The value assigned to a particular atom depends on its relative coordinate value (in dim) from clo to chi. For the example above, an atom with y-coordinate of 10 (1/4 of the way from 5 to 25), would be assigned a x-velocity of 1.25 (1/4 of the way from 0.0 to 5.0). Atoms outside the coordinate bounds (less than 5 or greater than 25 in this case), are assigned velocities equal to vlo or vhi (0.0 or 5.0 in this case). </P> <P>The <I>zero</I> style adjusts the velocities of the group of atoms so that the aggregate linear or angular momentum is zero. No other changes are made to the velocities of the atoms. If the <I>rigid</I> option is specified (see below), then the zeroing is performed on individual rigid bodies, as defined by the <A HREF = "fix_rigid.html">fix rigid or fix rigid/small</A> commands. In other words, zero linear will set the linear momentum of each rigid body to zero, and zero angular will set the angular momentum of each rigid body to zero. This is done by adjusting the velocities of the atoms in each rigid body. </P> <P>All temperatures specified in the velocity command are in temperature units; see the <A HREF = "units.html">units</A> command. The units of velocities and coordinates depend on whether the <I>units</I> keyword is set to <I>box</I> or <I>lattice</I>, as discussed below. </P> <P>For all styles, no atoms are assigned z-component velocities if the simulation is 2d; see the <A HREF = "dimension.html">dimension</A> command. </P> <HR> <P>The keyword/value option pairs are used in the following ways by the various styles. </P> <P>The <I>dist</I> option is used by <I>create</I>. The ensemble of generated velocities can be a <I>uniform</I> distribution from some minimum to maximum value, scaled to produce the requested temperature. Or it can be a <I>gaussian</I> distribution with a mean of 0.0 and a sigma scaled to produce the requested temperature. </P> <P>The <I>sum</I> option is used by all styles, except <I>zero</I>. The new velocities will be added to the existing ones if sum = yes, or will replace them if sum = no. </P> <P>The <I>mom</I> and <I>rot</I> options are used by <I>create</I>. If mom = yes, the linear momentum of the newly created ensemble of velocities is zeroed; if rot = yes, the angular momentum is zeroed. </P> <P>The <I>temp</I> option is used by <I>create</I> and <I>scale</I> to specify a <A HREF = "compute.html">compute</A> that calculates temperature in a desired way. If this option is not specified, <I>create</I> and <I>scale</I> calculate temperature using a compute that is defined as follows: </P> <PRE>compute velocity_temp group-ID temp </PRE> <P>where group-ID is the same ID used in the velocity command. i.e. the group of atoms whose velocity is being altered. This compute is deleted when the velocity command is finished. See the <A HREF = "compute_temp.html">compute temp</A> command for details. If the computed temperature should have degrees-of-freedom removed due to fix constraints (e.g. SHAKE or rigid-body constraints), then the appropriate fix command must be specified before the velocity command is issued. </P> <HR> <P>The <I>loop</I> option is used by <I>create</I> in the following ways. </P> <P>If loop = all, then each processor loops over all atoms in the simulation to create velocities, but only stores velocities for atoms it owns. This can be a slow loop for a large simulation. If atoms were read from a data file, the velocity assigned to a particular atom will be the same, independent of how many processors are being used. 
This will not be the case if atoms were created using the <A HREF = "create_atoms.html">create_atoms</A> command, since atom IDs will likely be assigned to atoms differently. </P> <P>If loop = local, then each processor loops over only its atoms to produce velocities. The random number seed is adjusted to give a different set of velocities on each processor. This is a fast loop, but the velocity assigned to a particular atom will depend on which processor owns it. Thus the results will always be different when a simulation is run on a different number of processors. </P> <P>If loop = geom, then each processor loops over only its atoms. For each atom a unique random number seed is created, based on the atom's xyz coordinates. A velocity is generated using that seed. This is a fast loop and the velocity assigned to a particular atom will be the same, independent of how many processors are used. However, the set of generated velocities may be more correlated than if the <I>all</I> or <I>local</I> options are used. </P> <P>Note that the <I>loop geom</I> option will not necessarily assign identical velocities for two simulations run on different machines. This is because the computations based on xyz coordinates are sensitive to tiny differences in the double-precision value for a coordinate as stored on a particular machine. </P> <HR> <P>The <I>rigid</I> option only has meaning when used with the <I>zero</I> style. It allows specification of a fix-ID for one of the <A HREF = "fix_rigid.html">rigid-body fix</A> variants which defines a set of rigid bodies. The zeroing of linear or angular momentum is then performed for each rigid body defined by the fix, as described above. </P> <P>The <I>units</I> option is used by <I>set</I> and <I>ramp</I>. If units = box, the velocities and coordinates specified in the velocity command are in the standard units described by the <A HREF = "units.html">units</A> command (e.g. Angstroms/fmsec for real units). If units = lattice, velocities are in units of lattice spacings per time (e.g. spacings/fmsec) and coordinates are in lattice spacings. The <A HREF = "lattice.html">lattice</A> command must have been previously used to define the lattice spacing. </P> -<P><B>Restrictions:</B> none -</P> +<P><B>Restrictions:</B> +</P> +<P>Assigning a temperature via the <I>create</I> option to a system with +<A HREF = "fix_rigid.html">rigid bodies</A> or <A HREF = "fix_shake.html">SHAKE constraints</A> +may not have the desired outcome for two reasons. First, the velocity +command can be invoked before all of the relevant fixes are created +and initialized and the number of adjusted degrees of freedom (DOFs) +is known. Thus it is not possible to compute the target temperature +correctly. Second, the assigned velocities may be partially canceled +when constraints are first enforced, leading to a different +temperature than desired. A workaround for this is to perform a <A HREF = "run.html">run +0</A> command, which insures all DOFs are accounted for +properly, and then rescale the temperature to the desired value before +performing a simulation. 
For example: +</P> +<PRE>velocity all create 300.0 12345 +run 0 # temperature may not be 300K +velocity all scale 300.0 # now it should be +</PRE> <P><B>Related commands:</B> </P> -<P><A HREF = "fix_shake.html">fix shake</A>, <A HREF = "lattice.html">lattice</A> +<P><A HREF = "fix_rigid.html">fix rigid</A>, <A HREF = "fix_shake.html">fix shake</A>, <A HREF = "lattice.html">lattice</A> </P> <P><B>Default:</B> </P> <P>The option defaults are dist = uniform, sum = no, mom = yes, rot = no, temp = full style on group-ID, loop = all, and units = lattice. The rigid option is not defined by default. </P> </HTML> diff --git a/doc/velocity.txt b/doc/velocity.txt index 2a606cf55..19bfca633 100644 --- a/doc/velocity.txt +++ b/doc/velocity.txt @@ -1,212 +1,229 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line velocity command :h3 [Syntax:] velocity group-ID style args keyword value ... :pre group-ID = ID of group of atoms whose velocity will be changed :ulb,l style = {create} or {set} or {scale} or {ramp} or {zero} :l {create} args = temp seed temp = temperature value (temperature units) seed = random # seed (positive integer) {set} args = vx vy vz vx,vy,vz = velocity value or NULL (velocity units) any of vx,vy,vz can be a variable (see below) {scale} arg = temp temp = temperature value (temperature units) {ramp} args = vdim vlo vhi dim clo chi vdim = {vx} or {vy} or {vz} vlo,vhi = lower and upper velocity value (velocity units) dim = {x} or {y} or {z} clo,chi = lower and upper coordinate bound (distance units) {zero} arg = {linear} or {angular} {linear} = zero the linear momentum {angular} = zero the angular momentum :pre zero or more keyword/value pairs may be appended :l keyword = {dist} or {sum} or {mom} or {rot} or {temp} or {loop} or {rigid} or {units} :l {dist} value = {uniform} or {gaussian} {sum} value = {no} or {yes} {mom} value = {no} or {yes} {rot} value = {no} or {yes} {temp} value = temperature ID {loop} value = {all} or {local} or {geom} {rigid} value = fix-ID fix-ID = ID of rigid body fix {units} value = {box} or {lattice} :pre :ule [Examples:] velocity all create 300.0 4928459 rot yes dist gaussian velocity border set NULL 4.0 v_vz sum yes units box velocity flow scale 300.0 velocity flow ramp vx 0.0 5.0 y 5 25 temp mytemp velocity all zero linear :pre [Description:] Set or change the velocities of a group of atoms in one of several styles. For each style, there are required arguments and optional keyword/value parameters. Not all options are used by each style. Each option has a default as listed below. The {create} style generates an ensemble of velocities using a random number generator with the specified seed at the specified temperature. The {set} style sets the velocities of all atoms in the group to the specified values. If any component is specified as NULL, then it is not set. Any of the vx,vy,vz velocity components can be specified as an equal-style or atom-style "variable"_variable.html. If the value is a variable, it should be specified as v_name, where name is the variable name. In this case, the variable will be evaluated, and its value used to determine the velocity component. Note that if a variable is used, the velocity it calculates must be in box units, not lattice units; see the discussion of the {units} keyword below.
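A minimal sketch of the variable usage just described, using an
atom-style variable (covered in more detail below); the group name
{flow}, the variable name {vprof}, and the formula are hypothetical:

variable vprof atom 0.01*y
velocity flow set v_vprof 0.0 0.0 units box :pre

Each atom in the group gets an x-velocity proportional to its y
coordinate, while vy and vz are set to 0.0. The {units box} setting
keeps the numeric components consistent with the variable, which
always produces box-unit velocities.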
Equal-style variables can specify formulas with various mathematical functions, and include "thermo_style"_thermo_style.html command keywords for the simulation box parameters or other parameters. Atom-style variables can specify the same formulas as equal-style variables but can also include per-atom values, such as atom coordinates. Thus it is easy to specify a spatially-dependent velocity field. The {scale} style computes the current temperature of the group of atoms and then rescales the velocities to the specified temperature. The {ramp} style is similar to that used by the "compute temp/ramp"_compute_temp_ramp.html command. Velocities ramped uniformly from vlo to vhi are applied to dimension vx, or vy, or vz. The value assigned to a particular atom depends on its relative coordinate value (in dim) from clo to chi. For the example above, an atom with y-coordinate of 10 (1/4 of the way from 5 to 25), would be assigned a x-velocity of 1.25 (1/4 of the way from 0.0 to 5.0). Atoms outside the coordinate bounds (less than 5 or greater than 25 in this case), are assigned velocities equal to vlo or vhi (0.0 or 5.0 in this case). The {zero} style adjusts the velocities of the group of atoms so that the aggregate linear or angular momentum is zero. No other changes are made to the velocities of the atoms. If the {rigid} option is specified (see below), then the zeroing is performed on individual rigid bodies, as defined by the "fix rigid or fix rigid/small"_fix_rigid.html commands. In other words, zero linear will set the linear momentum of each rigid body to zero, and zero angular will set the angular momentum of each rigid body to zero. This is done by adjusting the velocities of the atoms in each rigid body. All temperatures specified in the velocity command are in temperature units; see the "units"_units.html command. The units of velocities and coordinates depend on whether the {units} keyword is set to {box} or {lattice}, as discussed below. For all styles, no atoms are assigned z-component velocities if the simulation is 2d; see the "dimension"_dimension.html command. :line The keyword/value option pairs are used in the following ways by the various styles. The {dist} option is used by {create}. The ensemble of generated velocities can be a {uniform} distribution from some minimum to maximum value, scaled to produce the requested temperature. Or it can be a {gaussian} distribution with a mean of 0.0 and a sigma scaled to produce the requested temperature. The {sum} option is used by all styles, except {zero}. The new velocities will be added to the existing ones if sum = yes, or will replace them if sum = no. The {mom} and {rot} options are used by {create}. If mom = yes, the linear momentum of the newly created ensemble of velocities is zeroed; if rot = yes, the angular momentum is zeroed. The {temp} option is used by {create} and {scale} to specify a "compute"_compute.html that calculates temperature in a desired way. If this option is not specified, {create} and {scale} calculate temperature using a compute that is defined as follows: compute velocity_temp group-ID temp :pre where group-ID is the same ID used in the velocity command. i.e. the group of atoms whose velocity is being altered. This compute is deleted when the velocity command is finished. See the "compute temp"_compute_temp.html command for details. If the computed temperature should have degrees-of-freedom removed due to fix constraints (e.g. 
SHAKE or rigid-body constraints), then the appropriate fix command must be specified before the velocity command is issued. :line The {loop} option is used by {create} in the following ways. If loop = all, then each processor loops over all atoms in the simulation to create velocities, but only stores velocities for atoms it owns. This can be a slow loop for a large simulation. If atoms were read from a data file, the velocity assigned to a particular atom will be the same, independent of how many processors are being used. This will not be the case if atoms were created using the "create_atoms"_create_atoms.html command, since atom IDs will likely be assigned to atoms differently. If loop = local, then each processor loops over only its atoms to produce velocities. The random number seed is adjusted to give a different set of velocities on each processor. This is a fast loop, but the velocity assigned to a particular atom will depend on which processor owns it. Thus the results will always be different when a simulation is run on a different number of processors. If loop = geom, then each processor loops over only its atoms. For each atom a unique random number seed is created, based on the atom's xyz coordinates. A velocity is generated using that seed. This is a fast loop and the velocity assigned to a particular atom will be the same, independent of how many processors are used. However, the set of generated velocities may be more correlated than if the {all} or {local} options are used. Note that the {loop geom} option will not necessarily assign identical velocities for two simulations run on different machines. This is because the computations based on xyz coordinates are sensitive to tiny differences in the double-precision value for a coordinate as stored on a particular machine. :line The {rigid} option only has meaning when used with the {zero} style. It allows specification of a fix-ID for one of the "rigid-body fix"_fix_rigid.html variants which defines a set of rigid bodies. The zeroing of linear or angular momentum is then performed for each rigid body defined by the fix, as described above. The {units} option is used by {set} and {ramp}. If units = box, the velocities and coordinates specified in the velocity command are in the standard units described by the "units"_units.html command (e.g. Angstroms/fmsec for real units). If units = lattice, velocities are in units of lattice spacings per time (e.g. spacings/fmsec) and coordinates are in lattice spacings. The "lattice"_lattice.html command must have been previously used to define the lattice spacing. -[Restrictions:] none +[Restrictions:] + +Assigning a temperature via the {create} option to a system with +"rigid bodies"_fix_rigid.html or "SHAKE constraints"_fix_shake.html +may not have the desired outcome for two reasons. First, the velocity +command can be invoked before all of the relevant fixes are created +and initialized and the number of adjusted degrees of freedom (DOFs) +is known. Thus it is not possible to compute the target temperature +correctly. Second, the assigned velocities may be partially canceled +when constraints are first enforced, leading to a different +temperature than desired. A workaround for this is to perform a "run +0"_run.html command, which insures all DOFs are accounted for +properly, and then rescale the temperature to the desired value before +performing a simulation. 
For example: + +velocity all create 300.0 12345 +run 0 # temperature may not be 300K +velocity all scale 300.0 # now it should be :pre [Related commands:] -"fix shake"_fix_shake.html, "lattice"_lattice.html +"fix rigid"_fix_rigid.html, "fix shake"_fix_shake.html, "lattice"_lattice.html [Default:] The option defaults are dist = uniform, sum = no, mom = yes, rot = no, temp = full style on group-ID, loop = all, and units = lattice. The rigid option is not defined by default. diff --git a/examples/README b/examples/README index 33a28a98b..fb039c5b6 100644 --- a/examples/README +++ b/examples/README @@ -1,142 +1,143 @@ LAMMPS example problems There are 3 flavors of sub-directories in this directory, each with sample problems you can run with LAMMPS. lower-case directories = simple test problems for LAMMPS and its packages upper-case directories = more complex problems USER directory with its own sub-directories = tests for USER packages Each is discussed below. ------------------------------------------ Lower-case directories Each of these sub-directories contains a sample problem you can run with LAMMPS. Most are 2d models so that they run quickly, requiring a few seconds to a few minutes to run on a desktop machine. Each problem has an input script (in.*) and produces a log file (log.*) and (optionally) a dump file (dump.*) or image files (image.*) or movie (movie.mpg) when it runs. Some use a data file (data.*) of initial coordinates as additional input. Some require that you install one or more optional LAMMPS packages. A few sample log file outputs on different machines and different numbers of processors are included in the directories to compare your answers to. E.g. a log file like log.crack.date.foo.P means it ran on P processors of machine "foo" with the dated version of LAMMPS. Note that these problems should get statistically similar answers when run on different machines or different numbers of processors, but not identical answers to those in the log or dump files included here. See the Errors section of the LAMMPS documentation for more discussion. Most of the example input scripts have commented-out lines that produce dump snapshots of the running simulation in any of 3 formats. If you uncomment the dump command in the input script, a text dump file will be produced, which can be animated by various visualization programs (see http://lammps.sandia.gov/viz.html) such as VMD or AtomEye. It can also be animated using the xmovie tool described in the Additional Tools section of the LAMMPS documentation. If you uncomment the dump image command in the input script, and assuming you have built LAMMPS with a JPG library, JPG snapshot images will be produced when the simulation runs. They can be quickly post-processed into a movie using commands described on the dump image doc page. If you uncomment the dump movie command in the input script, and assuming you have built LAMMPS with the FFMPEG library, an MPG movie will be produced when the simulation runs. The movie file can be played using various viewers, such as mplayer or QuickTime. Animations of many of these examples can be viewed on the Movies section of the LAMMPS WWW Site.
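As an illustration of the commented-out dump lines mentioned above,
un-commenting a line of this general form (the ID, group, interval,
and filename differ from script to script) produces a text dump file
as the run progresses:

dump id all atom 50 dump.melt

The dump image and dump movie lines work the same way, but require
LAMMPS to have been built with the JPG or FFMPEG support described
above.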
These are the sample problems and their output in the various sub-directories: body: body particles, 2d system colloid: big colloid particles in a small particle solvent, 2d system comb: models using the COMB potential crack: crack propagation in a 2d solid deposit: deposition of atoms and molecules onto a 3d substrate dipole: point dipolar particles, 2d system dreiding: methanol via Dreiding FF eim: NaCl using the EIM potential ellipse: ellipsoidal particles in spherical solvent, 2d system flow: Couette and Poiseuille flow in a 2d channel friction: frictional contact of spherical asperities between 2d surfaces gpu: use of the GPU package for GPU acceleration hugoniostat: Hugoniostat shock dynamics indent: spherical indenter into a 2d solid kim: use of potentials in Knowledge Base for Interatomic Models (KIM) +kokkos: use of the KOKKOS package for multi-threading and GPU acceleration meam: MEAM test for SiC and shear (same as shear examples) melt: rapid melt of 3d LJ system micelle: self-assembly of small lipid-like molecules into 2d bilayers min: energy minimization of 2d LJ melt msst: MSST shock dynamics nb3b: use of NB3B pair style neb: nudged elastic band (NEB) calculation for barrier finding nemd: non-equilibrium MD of 2d sheared system obstacle: flow around two voids in a 2d channel peptide: dynamics of a small solvated peptide chain (5-mer) peri: Peridynamic model of cylinder impacted by indenter pour: pouring of granular particles into a 3d box, then chute flow prd: parallel replica dynamics of vacancy diffusion in bulk Si reax: RDX and TATB models using the ReaxFF rigid: rigid bodies modeled as independent or coupled shear: sideways shear applied to 2d solid, with and without a void srd: stochastic rotation dynamics (SRD) particles as solvent tad: temperature-accelerated dynamics of vacancy diffusion in bulk Si voronoi: test of Voronoi tesselation in compute voronoi/atom Here is how you might run and visualize one of the sample problems: cd indent cp ../../src/lmp_linux . # copy LAMMPS executable to this dir lmp_linux < in.indent # run the problem Running the simulation produces the files {dump.indent} and {log.lammps}. You can visualize the dump file as follows: ../../tools/xmovie/xmovie -scale dump.indent If you uncomment the dump image line(s) in the input script a series of JPG images will be produced by the run. These can be viewed individually or turned into a movie or animated by tools like ImageMagick or QuickTime or various Windows-based tools. See the dump image doc page for more details. E.g. this Imagemagick command would create a GIF file suitable for viewing in a browser. % convert -loop 1 *.jpg foo.gif ------------------------------------------ Upper-case directories The ASPHERE directory has examples of how to model aspherical particles with or without solvent, in 3 styles LAMMPS provides. Namely point ellipsoids, rigid bodies, and generalized aspherical bodies built from line/triangle surface facets in 2d/3d. See the ASPHERE/README file to get started. The COUPLE directory has examples of how to use LAMMPS as a library, either by itself or in tandem with another code or library. See the COUPLE/README file to get started. The ELASTIC directory has an example script for computing elastic constants, using a zero temperature Si example. See the ELASTIC/in.elastic file for more info. The KAPPA directory has an example scripts for computing the thermal conductivity (kappa) of a LJ liquid using 4 different methods. See the KAPPA/README file for more info. 
The USER directory contains subdirectories of user-provided example scripts for USER packages. See the README files in those directories for more info. See the doc/Section_start.html file for more info about installing and building user packages. diff --git a/examples/gpu/README b/examples/gpu/README new file mode 100644 index 000000000..8fb8db00a --- /dev/null +++ b/examples/gpu/README @@ -0,0 +1,35 @@ +These are input scripts designed for use with the GPU package. + +To run them, you must first build LAMMPS with the GPU package +installed, following the steps explained in Section 2.3 of +doc/Section_start.html and lib/gpu/README. An overview of building +and running LAMMPS with the GPU package is given in Section 5.6 of +doc/Section_accelerate.html. Note that you can choose the precision +at which computations are performed on the GPU in the build process. + +Note that lines such as this in each of the input scripts: + +package gpu force/neigh 0 1 1 + +are set for running on a compute node with 2 GPUs. If you +have a single GPU, you should comment out the line, since +the default is 1 GPU per compute node. + +The scripts can be run in the usual manner: + +lmp_g++ < in.gpu.melt.2.5 +lmp_g++ < in.gpu.melt.5.0 +lmp_g++ < in.gpu.phosphate +lmp_g++ < in.gpu.rhodo + +mpirun -np 4 lmp_g++ < in.gpu.melt.2.5 +mpirun -np 4 lmp_g++ < in.gpu.melt.5.0 +mpirun -np 4 lmp_g++ < in.gpu.phosphate +mpirun -np 4 lmp_g++ < in.gpu.rhodo + +The first set of commands will run a single MPI task using a single +GPU (even if you have 2 GPUs). + +The second set of commands will run 4 MPI tasks, with 2 MPI tasks per +GPU (if you have 2 GPUs), or 4 MPI tasks per GPU (if you have a single +GPU). diff --git a/examples/kokkos/README b/examples/kokkos/README new file mode 100644 index 000000000..fe0ea4de7 --- /dev/null +++ b/examples/kokkos/README @@ -0,0 +1,42 @@ +The in.kokkos input script is a copy of the bench/in.lj script, +but can be run with the KOKKOS package. + +To run it, you must first build LAMMPS with the KOKKOS package +installed, following the steps explained in Section 2.3.4 of +doc/Section_start.html. An overview of building and running LAMMPS +with the KOKKOS package, for different compute-node hardware on your +machine, is given in Section 5.8 of doc/Section_accelerate.html. + +The example log files included in this directory are for a desktop box +with dual hex-core CPUs and 2 GPUs. + +Two executables were built in the following manner: + +make yes-kokkos +make g++ OMP=yes -> lmp_cpu +make cuda CUDA=yes -> lmp_cuda + +Then the following runs were made. The "->" means that the run +produced log.lammps which was then copied to the named log file. + +* MPI-only runs + +lmp_cpu -k off < in.kokkos -> log.kokkos.date.mpionly.1 +mpirun -np 4 lmp_cpu -k off < in.kokkos -> log.kokkos.date.mpionly.4 + +* OpenMP threaded runs on CPUs only + +lmp_cpu -k on t 1 -sf kk < in.kokkos.half -> log.kokkos.date.cpu.1 +lmp_cpu -k on t 4 -sf kk < in.kokkos -> log.kokkos.date.cpu.4 + +Note that in.kokkos.half was used for one of the runs; it uses the +package command to force the use of half neighbor lists, which are +faster when running on just 1 thread. + +* GPU runs on 1 or 2 GPUs + +lmp_cuda -k on t 6 -sf kk < in.kokkos -> log.kokkos.date.gpu.1 +mpirun -np 2 lmp_cuda -k on t 6 -sf kk < in.kokkos -> log.kokkos.date.gpu.2 + +Note that this is a very small problem (32K atoms) to run +on 1 or 2 GPUs.
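+
+If you want a larger test, you can override the x, y, and z index
+variables defined at the top of in.kokkos from the command line.  For
+example, a run such as this one (not one of the runs logged here)
+creates a system with 8x as many atoms:
+
+mpirun -np 2 lmp_cuda -k on t 6 -sf kk -v x 2 -v y 2 -v z 2 < in.kokkos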
diff --git a/examples/kokkos/in.kokkos b/examples/kokkos/in.kokkos new file mode 100644 index 000000000..01e12ef8a --- /dev/null +++ b/examples/kokkos/in.kokkos @@ -0,0 +1,30 @@ +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z + +units lj +atom_style atomic + +lattice fcc 0.8442 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 diff --git a/examples/kokkos/in.kokkos.half b/examples/kokkos/in.kokkos.half new file mode 100644 index 000000000..9847d18ef --- /dev/null +++ b/examples/kokkos/in.kokkos.half @@ -0,0 +1,32 @@ +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable yy equal 20*$y +variable zz equal 20*$z + +package kokkos neigh half + +units lj +atom_style atomic + +lattice fcc 0.8442 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +create_box 1 box +create_atoms 1 box +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 diff --git a/examples/kokkos/log.kokkos.1Feb14.cpu.1 b/examples/kokkos/log.kokkos.1Feb14.cpu.1 new file mode 100644 index 000000000..76c5f5747 --- /dev/null +++ b/examples/kokkos/log.kokkos.1Feb14.cpu.1 @@ -0,0 +1,68 @@ +LAMMPS (27 May 2014) +KOKKOS mode is enabled (../lammps.cpp:468) + using 1 OpenMP thread(s) per MPI task +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +package kokkos neigh half + +units lj +atom_style atomic + +lattice fcc 0.8442 +Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 +Memory usage per processor = 7.79551 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 1.44 -6.7733681 0 -4.6134356 -5.0197073 + 100 0.7574531 -5.7585055 0 -4.6223613 0.20726105 +Loop time of 2.29105 on 1 procs (1 MPI x 1 OpenMP) for 100 steps with 32000 atoms + +Pair time (%) = 1.82425 (79.6249) +Neigh time (%) = 0.338632 (14.7806) +Comm time (%) = 0.0366232 (1.59853) +Outpt time (%) = 0.000144005 (0.00628553) +Other time (%) = 0.0914049 (3.98965) + +Nlocal: 32000 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 19657 ave 19657 max 19657 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 1.20283e+06 ave 1.20283e+06 max 1.20283e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 1202833 +Ave neighs/atom = 37.5885 +Neighbor list builds = 5 +Dangerous builds = 0 diff --git a/examples/kokkos/log.kokkos.1Feb14.cpu.4 b/examples/kokkos/log.kokkos.1Feb14.cpu.4 new file mode 100644 index 000000000..2b6001025 --- /dev/null +++ 
b/examples/kokkos/log.kokkos.1Feb14.cpu.4 @@ -0,0 +1,68 @@ +LAMMPS (27 May 2014) +KOKKOS mode is enabled (../lammps.cpp:468) + using 4 OpenMP thread(s) per MPI task +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units lj +atom_style atomic + +lattice fcc 0.8442 +Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 +Memory usage per processor = 13.2888 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 1.44 -6.7733681 0 -4.6134356 -5.0197073 + 100 0.7574531 -5.7585055 0 -4.6223613 0.20726105 +Loop time of 0.983697 on 4 procs (1 MPI x 4 OpenMP) for 100 steps with 32000 atoms + +Pair time (%) = 0.767155 (77.9869) +Neigh time (%) = 0.14734 (14.9782) +Comm time (%) = 0.041466 (4.21532) +Outpt time (%) = 0.000172138 (0.0174991) +Other time (%) = 0.0275636 (2.80204) + +Nlocal: 32000 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 19657 ave 19657 max 19657 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 2.40567e+06 ave 2.40567e+06 max 2.40567e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 2405666 +Ave neighs/atom = 75.1771 +Neighbor list builds = 5 +Dangerous builds = 0 diff --git a/examples/kokkos/log.kokkos.1Feb14.gpu.1 b/examples/kokkos/log.kokkos.1Feb14.gpu.1 new file mode 100644 index 000000000..8dd9caca4 --- /dev/null +++ b/examples/kokkos/log.kokkos.1Feb14.gpu.1 @@ -0,0 +1,68 @@ +LAMMPS (27 May 2014) +KOKKOS mode is enabled (../lammps.cpp:468) + using 6 OpenMP thread(s) per MPI task +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units lj +atom_style atomic + +lattice fcc 0.8442 +Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 +Memory usage per processor = 16.9509 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 1.44 -6.7733681 0 -4.6134356 -5.0197073 + 100 0.7574531 -5.7585055 0 -4.6223613 0.20726105 +Loop time of 0.57192 on 6 procs (1 MPI x 6 OpenMP) for 100 steps with 32000 atoms + +Pair time (%) = 0.205416 (35.917) +Neigh time (%) = 0.112468 (19.665) +Comm time (%) = 0.174223 (30.4629) +Outpt time (%) = 0.000159025 (0.0278055) +Other time (%) = 0.0796535 (13.9274) + +Nlocal: 32000 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 
+Nghost: 19657 ave 19657 max 19657 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +FullNghs: 2.40567e+06 ave 2.40567e+06 max 2.40567e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 2405666 +Ave neighs/atom = 75.1771 +Neighbor list builds = 5 +Dangerous builds = 0 diff --git a/examples/kokkos/log.kokkos.1Feb14.gpu.2 b/examples/kokkos/log.kokkos.1Feb14.gpu.2 new file mode 100644 index 000000000..938485a35 --- /dev/null +++ b/examples/kokkos/log.kokkos.1Feb14.gpu.2 @@ -0,0 +1,68 @@ +LAMMPS (27 May 2014) +KOKKOS mode is enabled (../lammps.cpp:468) + using 6 OpenMP thread(s) per MPI task +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units lj +atom_style atomic + +lattice fcc 0.8442 +Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919) + 1 by 1 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 +Memory usage per processor = 8.95027 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 1.44 -6.7733681 0 -4.6134356 -5.0197073 + 100 0.7574531 -5.7585055 0 -4.6223613 0.20726105 +Loop time of 0.689608 on 12 procs (2 MPI x 6 OpenMP) for 100 steps with 32000 atoms + +Pair time (%) = 0.210953 (30.5903) +Neigh time (%) = 0.122991 (17.8349) +Comm time (%) = 0.25264 (36.6353) +Outpt time (%) = 0.000259042 (0.0375636) +Other time (%) = 0.102765 (14.9019) + +Nlocal: 16000 ave 16001 max 15999 min +Histogram: 1 0 0 0 0 0 0 0 0 1 +Nghost: 13632.5 ave 13635 max 13630 min +Histogram: 1 0 0 0 0 0 0 0 0 1 +Neighs: 0 ave 0 max 0 min +Histogram: 2 0 0 0 0 0 0 0 0 0 +FullNghs: 1.20283e+06 ave 1.20347e+06 max 1.2022e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 1 + +Total # of neighbors = 2405666 +Ave neighs/atom = 75.1771 +Neighbor list builds = 5 +Dangerous builds = 0 diff --git a/examples/kokkos/log.kokkos.1Feb14.mpionly.1 b/examples/kokkos/log.kokkos.1Feb14.mpionly.1 new file mode 100644 index 000000000..d7763feb7 --- /dev/null +++ b/examples/kokkos/log.kokkos.1Feb14.mpionly.1 @@ -0,0 +1,65 @@ +LAMMPS (27 May 2014) + using 1 OpenMP thread(s) per MPI task +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units lj +atom_style atomic + +lattice fcc 0.8442 +Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 32000 atoms +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 +Memory usage per processor = 8.21387 Mbytes +Step Temp E_pair E_mol 
TotEng Press + 0 1.44 -6.7733681 0 -4.6134356 -5.0197073 + 100 0.7574531 -5.7585055 0 -4.6223613 0.20726105 +Loop time of 2.57975 on 1 procs (1 MPI x 1 OpenMP) for 100 steps with 32000 atoms + +Pair time (%) = 2.20959 (85.6512) +Neigh time (%) = 0.269136 (10.4326) +Comm time (%) = 0.0252256 (0.977833) +Outpt time (%) = 0.000126123 (0.00488898) +Other time (%) = 0.0756752 (2.93343) + +Nlocal: 32000 ave 32000 max 32000 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 19657 ave 19657 max 19657 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 1.20283e+06 ave 1.20283e+06 max 1.20283e+06 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 1202833 +Ave neighs/atom = 37.5885 +Neighbor list builds = 5 +Dangerous builds = 0 diff --git a/examples/kokkos/log.kokkos.1Feb14.mpionly.4 b/examples/kokkos/log.kokkos.1Feb14.mpionly.4 new file mode 100644 index 000000000..1838aafd0 --- /dev/null +++ b/examples/kokkos/log.kokkos.1Feb14.mpionly.4 @@ -0,0 +1,65 @@ +LAMMPS (27 May 2014) + using 1 OpenMP thread(s) per MPI task +# 3d Lennard-Jones melt + +variable x index 1 +variable y index 1 +variable z index 1 + +variable xx equal 20*$x +variable xx equal 20*1 +variable yy equal 20*$y +variable yy equal 20*1 +variable zz equal 20*$z +variable zz equal 20*1 + +units lj +atom_style atomic + +lattice fcc 0.8442 +Lattice spacing in x,y,z = 1.6796 1.6796 1.6796 +region box block 0 ${xx} 0 ${yy} 0 ${zz} +region box block 0 20 0 ${yy} 0 ${zz} +region box block 0 20 0 20 0 ${zz} +region box block 0 20 0 20 0 20 +create_box 1 box +Created orthogonal box = (0 0 0) to (33.5919 33.5919 33.5919) + 1 by 2 by 2 MPI processor grid +create_atoms 1 box +Created 32000 atoms +mass 1 1.0 + +velocity all create 1.44 87287 loop geom + +pair_style lj/cut 2.5 +pair_coeff 1 1 1.0 1.0 2.5 + +neighbor 0.3 bin +neigh_modify delay 0 every 20 check no + +fix 1 all nve + +run 100 +Memory usage per processor = 4.09506 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 1.44 -6.7733681 0 -4.6134356 -5.0197073 + 100 0.7574531 -5.7585055 0 -4.6223613 0.20726105 +Loop time of 0.709072 on 4 procs (4 MPI x 1 OpenMP) for 100 steps with 32000 atoms + +Pair time (%) = 0.574495 (81.0206) +Neigh time (%) = 0.0709588 (10.0073) +Comm time (%) = 0.0474771 (6.69567) +Outpt time (%) = 6.62804e-05 (0.00934748) +Other time (%) = 0.0160753 (2.26708) + +Nlocal: 8000 ave 8037 max 7964 min +Histogram: 2 0 0 0 0 0 0 0 1 1 +Nghost: 9007.5 ave 9050 max 8968 min +Histogram: 1 1 0 0 0 0 0 1 0 1 +Neighs: 300708 ave 305113 max 297203 min +Histogram: 1 0 0 1 1 0 0 0 0 1 + +Total # of neighbors = 1202833 +Ave neighs/atom = 37.5885 +Neighbor list builds = 5 +Dangerous builds = 0 diff --git a/lib/README b/lib/README index 00a69b0b6..d4dac7f26 100644 --- a/lib/README +++ b/lib/README @@ -1,31 +1,33 @@ This directory contains libraries that can be linked with when building LAMMPS, if particular packages are included in a LAMMPS build. The library itself must be built first, so that a lib*.a file exists for LAMMPS to link against. Each library directory contains a README with additional info about how to build the library. This may require you to edit one of the provided Makefiles to make it suitable for your machine. 
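For example, building the gpu library and then a LAMMPS executable that
uses it might look like the following (a sketch only; it assumes the
provided lib/gpu/Makefile.linux is suitable for your machine and that a
corresponding src/MAKE/Makefile.linux exists):

cd lib/gpu
make -f Makefile.linux      # builds libgpu.a
cd ../../src
make yes-gpu                # install the GPU package source files
make linux                  # build LAMMPS, linking against lib/gpu/libgpu.a

Analogous steps apply to the other libraries listed below; see each
library's README for the details.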
The libraries included in the LAMMPS distribution are the following: atc atomistic-to-continuum methods, USER-ATC package from Reese Jones, Jeremy Templeton, Jon Zimmerman (Sandia) awpmd antisymmetrized wave packet molecular dynamics, AWPMD package from Ilya Valuev (JIHT RAS) colvars collective variable module (Metadynamics, ABF and more) from Giacomo Fiorin and Jerome Henin (ICMS, Temple U) cuda NVIDIA GPU routines, USER-CUDA package from Christian Trott (U Tech Ilmenau) gpu general GPU routines, GPU package from Mike Brown (ORNL) +kokkos Kokkos package for GPU and many-core acceleration + from Kokkos development team (Sandia) linalg set of BLAS and LAPACK routines needed by USER-ATC package from Axel Kohlmeyer (Temple U) poems POEMS rigid-body integration package, POEMS package from Rudranarayan Mukherjee (RPI) meam modified embedded atom method (MEAM) potential, MEAM package from Greg Wagner (Sandia) qmmm quantum mechanics/molecular mechanics coupling interface from Axel Kohlmeyer (Temple U) reax ReaxFF potential, REAX package from Adri van Duin (Penn State) and Aidan Thompson (Sandia) diff --git a/lib/kokkos/Makefile.lammps b/lib/kokkos/Makefile.lammps new file mode 100644 index 000000000..f9fa37cdf --- /dev/null +++ b/lib/kokkos/Makefile.lammps @@ -0,0 +1,104 @@ +# Settings that the LAMMPS build will import when this package library is used + +OMP = yes +CUDA = no +HWLOC = no +AVX = no +MIC = no +LIBRT = no +DEBUG = no + +CUDA_PATH = /usr/local/cuda + +KOKKOS_PATH = ../../lib/kokkos +kokkos_SYSINC = -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I../ +SRC_KOKKOS = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp) + +ifeq ($(CUDA), yes) +kokkos_SYSINC += -x cu -DDEVICE=2 -DKOKKOS_HAVE_CUDA +SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp) +SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cu) +USRLIB += -L$(CUDA_PATH)/lib64 -lcudart -lcuda +ifeq ($(UVM), yes) +kokkos_SYSINC += -DKOKKOS_USE_UVM +endif +else +kokkos_SYSINC += -DDEVICE=1 +endif + +ifeq ($(CUSPARSE), yes) +kokkos_SYSINC += -DKOKKOS_USE_CUSPARSE +USRLIB += -lcusparse +endif + +ifeq ($(CUBLAS), yes) +kokkos_SYSINC += -DKOKKOS_USE_CUBLAS +USRLIB += -lcublas +endif + +ifeq ($(AVX), yes) +ifeq ($(CUDA), yes) +kokkos_SYSINC += -Xcompiler -mavx +else +kokkos_SYSINC += -mavx +endif +LINKFLAGS += -mavx +endif + +ifeq ($(MIC), yes) +kokkos_SYSINC += -mmic +LINKFLAGS += -mmic +endif + +ifeq ($(OMP),yes) +kokkos_SYSINC += -DKOKKOS_HAVE_OPENMP +SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp) +ifeq ($(CUDA), yes) +kokkos_SYSINC += -Xcompiler -fopenmp +else +kokkos_SYSINC += -fopenmp +endif +LINKFLAGS += -fopenmp +else +kokkos_SYSINC += -DKOKKOS_HAVE_PTHREAD +USRLIB += -lpthread +SRC_KOKKOS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp) +endif + +ifeq ($(HWLOC),yes) +kokkos_SYSINC += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include +USRLIB += -L$(HWLOCPATH)/lib -lhwloc +endif + +ifeq ($(RED_PREC), yes) +kokkos_SYSINC += --use_fast_math +endif + +ifeq ($(DEBUG), yes) +kokkos_SYSINC += -g -G -DKOKKOS_EXPRESSION_CHECK -DENABLE_TRACEBACK +LINKFLAGS += -g +endif + +ifeq ($(LIBRT),yes) +kokkos_SYSINC += -DKOKKOS_USE_LIBRT -DPREC_TIMER +USRLIB += -lrt +endif + +ifeq ($(CUDALDG), yes) +kokkos_SYSINC += -DKOKKOS_USE_LDG_INTRINSIC +endif + +OBJ_KOKKOS_TMP = $(SRC_KOKKOS:.cpp=.o) +OBJ_KOKKOS = $(OBJ_KOKKOS_TMP:.cu=.o) +OBJ_KOKKOS_LINK = $(notdir $(OBJ_KOKKOS)) + +override OBJ += kokkos_depend.o + +libkokkoscore.a: $(OBJ_KOKKOS) + ar cr libkokkoscore.a $(OBJ_KOKKOS_LINK) + +kokkos_depend.o: libkokkoscore.a + 
touch kokkos_depend.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c kokkos_depend.cpp + +kokkos_SYSLIB = -L./ $(LINKFLAGS) $(USRLIB) diff --git a/lib/kokkos/README b/lib/kokkos/README new file mode 100644 index 000000000..59f5685ba --- /dev/null +++ b/lib/kokkos/README @@ -0,0 +1,44 @@ +Kokkos library + +Carter Edwards, Christian Trott, Daniel Sunderland +Sandia National Labs + +29 May 2014 +http://trilinos.sandia.gov/packages/kokkos/ + +------------------------- + +This directory has source files from the Kokkos library that LAMMPS +uses when building with its KOKKOS package. The package contains +versions of pair, fix, and atom styles written with Kokkos data +structures and calls to the Kokkos library that should run efficiently +on various kinds of accelerated nodes, including GPU and many-core +chips. + +Kokkos is a C++ library that provides two key abstractions for an +application like LAMMPS. First, it allows a single implementation of +an application kernel (e.g. a pair style) to run efficiently on +different kinds of hardware (GPU, Intel Phi, many-core chip). + +Second, it provides data abstractions to adjust (at compile time) the +memory layout of basic data structures like 2d and 3d arrays and allow +the transparent utilization of special hardware load and store units. +Such data structures are used in LAMMPS to store atom coordinates or +forces or neighbor lists. The layout is chosen to optimize +performance on different platforms. Again, this operation is hidden +from the developer, and does not affect how the single implementation +of the kernel is coded. + +To build LAMMPS with Kokkos, you should not need to make any changes +to files in this directory. You can override defaults that are set +in Makefile.lammps when building LAMMPS, by defining variables as part +of the make command. Details of the build process with Kokkos are +explained in Section 2.3 of doc/Section_start.html and in Section 5.9 +of doc/Section_accelerate.html. + +The one exception is that when using Kokkos with NVIDIA GPUs, the +CUDA_PATH setting in Makefile.lammps needs to point to the +installation of the CUDA software on your machine. The normal default +location is /usr/local/cuda. If this is not correct, you need to edit +Makefile.lammps. + diff --git a/lib/kokkos/TPL/cub/block/block_discontinuity.cuh b/lib/kokkos/TPL/cub/block/block_discontinuity.cuh new file mode 100644 index 000000000..76af003e5 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_discontinuity.cuh @@ -0,0 +1,587 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.  + * \ingroup BlockModule + * + * \par Overview + * A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * + * \tparam T The data type to be flagged. + * \tparam BLOCK_THREADS The thread block size in threads. + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for 128 threads on type int + * typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>. + * The corresponding output \p head_flags in those threads will be + * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>. + * + * \par Performance Considerations + * - Zero bank conflicts for most types. 
+ * + */ +template < + typename T, + int BLOCK_THREADS> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type (last element from each thread's input) + typedef T _TempStorage[BLOCK_THREADS]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template <typename FlagOp> + struct ApplyOp<FlagOp, false> + { + // Apply flag operator + static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b); + } + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockDiscontinuity( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
+ */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item + * <tt>input<sub><em>i</em></sub></tt> when + * <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt> + * returns \p true (where <em>previous-item</em> is either the preceding item + * in the same thread or the last item in the previous thread). + * Furthermore, <tt>head_flags<sub><em>i</em></sub></tt> is always set for + * <tt>input><sub>0</sub></tt> in <em>thread</em><sub>0</sub>. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for 128 threads on type int + * typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>. + * The corresponding output \p head_flags in those threads will be + * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type) + * \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + __syncthreads(); + + // Set flag for first item + head_flags[0] = (linear_tid == 0) ? + 1 : // First thread + ApplyOp<FlagOp>::Flag( + flag_op, + temp_storage[linear_tid - 1], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++) + { + head_flags[ITEM] = ApplyOp<FlagOp>::Flag( + flag_op, + input[ITEM - 1], + input[ITEM], + (linear_tid * ITEMS_PER_THREAD) + ITEM); + } + } + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * + * The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item + * <tt>input<sub><em>i</em></sub></tt> when + * <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt> + * returns \p true (where <em>previous-item</em> is either the preceding item + * in the same thread or the last item in the previous thread). + * For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared + * against \p tile_predecessor_item. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for 128 threads on type int + * typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads( + * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>, + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be + * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type) + * \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>). + { + // Share last item + temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + __syncthreads(); + + // Set flag for first item + int predecessor = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage[linear_tid - 1]; + + head_flags[0] = ApplyOp<FlagOp>::Flag( + flag_op, + predecessor, + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for remaining items + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++) + { + head_flags[ITEM] = ApplyOp<FlagOp>::Flag( + flag_op, + input[ITEM - 1], + input[ITEM], + (linear_tid * ITEMS_PER_THREAD) + ITEM); + } + } + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item + * <tt>input<sub><em>i</em></sub></tt> when + * <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt> + * returns \p true (where <em>next-item</em> is either the next item + * in the same thread or the first item in the next thread). + * Furthermore, <tt>tail_flags<sub>ITEMS_PER_THREAD-1</sub></tt> is always + * set for <em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub>. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for 128 threads on type int + * typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>. + * The corresponding output \p tail_flags in those threads will be + * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type) + * \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage[linear_tid] = input[0]; + + __syncthreads(); + + // Set flag for last item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp<FlagOp>::Flag( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1)); + + // Set flags for remaining items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) + { + tail_flags[ITEM] = ApplyOp<FlagOp>::Flag( + flag_op, + input[ITEM], + input[ITEM + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEM); + } + } + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * + * The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item + * <tt>input<sub><em>i</em></sub></tt> when + * <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt> + * returns \p true (where <em>next-item</em> is either the next item + * in the same thread or the first item in the next thread). + * For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item + * <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared + * against \p tile_predecessor_item. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for 128 threads on type int + * typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * tail_flags, thread_data, cub::Inequality(), tile_successor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt> + * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be + * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>. 
+ * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam FlagT <b>[inferred]</b> The flag type (must be an integer type) + * \tparam FlagOp <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>). + { + // Share first item + temp_storage[linear_tid] = input[0]; + + __syncthreads(); + + // Set flag for last item + int successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1)); + + // Set flags for remaining items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) + { + tail_flags[ITEM] = ApplyOp<FlagOp>::Flag( + flag_op, + input[ITEM], + input[ITEM + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEM); + } + } + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/block/block_exchange.cuh b/lib/kokkos/TPL/cub/block/block_exchange.cuh new file mode 100644 index 000000000..b7b95343b --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_exchange.cuh @@ -0,0 +1,918 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.  + * \ingroup BlockModule + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the global memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). + * + * \par + * BlockExchange supports the following types of data exchanges: + * - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>striped</em>](index.html#sec5sec4) arrangements + * - Transposing between [<em>blocked</em>](index.html#sec5sec4) and [<em>warp-striped</em>](index.html#sec5sec4) arrangements + * - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec4) + * - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec4) + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. + * \tparam WARP_TIME_SLICING <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for 128 threads owning 4 integer items each + * typedef cub::BlockExchange<int, 128, 4> BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadStriped<LOAD_DEFAULT, 128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. + * + */ +template < + typename T, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, + + LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS, + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding if the number of items per thread is a power of two + INSERT_PADDING = ((ITEMS_PER_THREAD & (ITEMS_PER_THREAD - 1)) == 0), + PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS]; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + int warp_lane; + int warp_id; + int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement. Specialized for no timeslicing. 
+ */ + __device__ __forceinline__ void BlockedToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements. + Int2Type<false> time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement. Specialized for warp-timeslicing. + */ + __device__ __forceinline__ void BlockedToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements. + Int2Type<true> time_slicing) + { + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing + */ + __device__ __forceinline__ void BlockedToWarpStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements. + Int2Type<false> time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + /** + * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. 
Specialized for warp-timeslicing + */ + __device__ __forceinline__ void BlockedToWarpStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements. + Int2Type<true> time_slicing) + { + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + + /** + * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement. Specialized for no timeslicing. + */ + __device__ __forceinline__ void StripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. + Int2Type<false> time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement. Specialized for warp-timeslicing. + */ + __device__ __forceinline__ void StripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. + Int2Type<true> time_slicing) + { + // Warp time-slicing + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + } + } + + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. 
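+ * (The inverse of BlockedToWarpStriped above: within each warp's tile, items are
+ *  written to shared memory at warp-striped offsets and read back in blocked order.)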
Specialized for no timeslicing + */ + __device__ __forceinline__ void WarpStripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements. + Int2Type<false> time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. Specialized for warp-timeslicing + */ + __device__ __forceinline__ void WarpStripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements. + Int2Type<true> time_slicing) + { + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for no timeslicing. + */ + __device__ __forceinline__ void ScatterToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type<false> time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into <em>blocked</em> arrangement. Specialized for warp-timeslicing. 
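+ * (Ranks are absolute destination positions within the tile, e.g. as produced by a
+ *  block-wide ranking or prefix sum; the scatter is performed one warp-sized time
+ *  slice at a time, after which each warp reads its own items back in blocked order.)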
+ */ + __device__ __forceinline__ void ScatterToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type<true> time_slicing) + { + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + __syncthreads(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + } + + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for no timeslicing. + */ + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type<false> time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into <em>striped</em> arrangement. Specialized for warp-timeslicing. 
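+ * (As ScatterToBlocked above, except that items are read back in a striped pattern,
+ *  i.e. at offsets <tt>ITEM * BLOCK_THREADS + linear_tid</tt>, again one time slice at a time.)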
+ */ + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type<true> time_slicing) + { + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x), + warp_lane(linear_tid & (WARP_THREADS - 1)), + warp_id(linear_tid >> LOG_WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x), + warp_lane(linear_tid & (WARP_THREADS - 1)), + warp_id(linear_tid >> LOG_WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockExchange( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid), + warp_lane(linear_tid & (WARP_THREADS - 1)), + warp_id(linear_tid >> LOG_WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
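+ * (For example, a 2D thread block would typically supply
+ *  <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> as the linear identifier.)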
+ */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid), + warp_lane(linear_tid & (WARP_THREADS - 1)), + warp_id(linear_tid >> LOG_WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement. + * + * \smemreuse + * + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for 128 threads owning 4 integer items each + * typedef cub::BlockExchange<int, 128, 4> BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadStriped<LOAD_DEFAULT, 128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from global memory. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * + */ + __device__ __forceinline__ void StripedToBlocked( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements. + { + StripedToBlocked(items, Int2Type<WARP_TIME_SLICING>()); + } + + /** + * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement. + * + * \smemreuse + * + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for 128 threads owning 4 integer items each + * typedef cub::BlockExchange<int, 128, 4> BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
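+ *     // (e.g. obtained via cub::BlockLoad or cub::LoadBlocked; any blocked
+ *     //  arrangement of 4 items per thread will do)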
+ * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in + * preparation for storing to global memory. + * + */ + __device__ __forceinline__ void BlockedToStriped( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements. + { + BlockedToStriped(items, Int2Type<WARP_TIME_SLICING>()); + } + + + /** + * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement. + * + * \smemreuse + * + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for 128 threads owning 4 integer items each + * typedef cub::BlockExchange<int, 128, 4> BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt> + * after loading from global memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * + */ + __device__ __forceinline__ void WarpStripedToBlocked( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements. + { + WarpStripedToBlocked(items, Int2Type<WARP_TIME_SLICING>()); + } + + /** + * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. + * + * \smemreuse + * + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for 128 threads owning 4 integer items each + * typedef cub::BlockExchange<int, 128, 4> BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt> + * in preparation for storing to global memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * + */ + __device__ __forceinline__ void BlockedToWarpStriped( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements. + { + BlockedToWarpStriped(items, Int2Type<WARP_TIME_SLICING>()); + } + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement. + * + * \smemreuse + */ + __device__ __forceinline__ void ScatterToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + int ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>()); + } + + + /** + * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement. + * + * \smemreuse + */ + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + int ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>()); + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_histogram.cuh b/lib/kokkos/TPL/cub/block/block_histogram.cuh new file mode 100644 index 000000000..dd346e395 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_histogram.cuh @@ -0,0 +1,414 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.  + * \ingroup BlockModule + * + * \par Overview + * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a> + * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>). + * + * \par + * Optionally, BlockHistogram can be specialized to use different algorithms: + * -# <b>cub::BLOCK_HISTO_SORT</b>. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# <b>cub::BLOCK_HISTO_ATOMIC</b>. Use atomic addition to update byte counts directly. 
[More...](\ref cub::BlockHistogramAlgorithm) + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each + * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char data[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(data, smem_histogram); + * + * \endcode + * + * \par Performance and Usage Considerations + * - The histogram output can be constructed in shared or global memory + * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + int BINS, + BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT> +class BlockHistogram +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /** + * Ensure the template parameterization meets the requirements of the + * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used + * regardless. + */ + static const BlockHistogramAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ? + BLOCK_HISTO_SORT : + ALGORITHM; + + /// Internal specialization. 
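+ /// (BlockHistogramSort when SAFE_ALGORITHM is BLOCK_HISTO_SORT, otherwise
+ ///  BlockHistogramAtomic; the choice is resolved at compile time via the If<> metafunction.)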
+ typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), + BlockHistogramSort<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS>, + BlockHistogramAtomic<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS> >::Type InternalBlockHistogram; + + /// Shared memory storage layout type for BlockHistogram + typedef typename InternalBlockHistogram::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockHistogram} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockHistogram() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockHistogram( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. + */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + //@} end member group + /******************************************************************//** + * \name Histogram operations + *********************************************************************/ + //@{ + + + /** + * \brief Initialize the shared histogram counters to zero. + * + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. 
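+ * (InitHistogram only zeroes the bin counters; Composite, also shown below, then
+ *  accumulates the thread samples into them.)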
+ * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each + * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam HistoCounter <b>[inferred]</b> Histogram counter type + */ + template <typename HistoCounter> + __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS]) + { + // Initialize histogram bin counts to zeros + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + histogram[histo_offset + linear_tid] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + histogram[histo_offset + linear_tid] = 0; + } + } + + + /** + * \brief Constructs a block-wide histogram in shared/global memory. Each thread contributes an array of input elements. + * + * \smemreuse + * + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each + * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam HistoCounter <b>[inferred]</b> Histogram counter type + */ + template < + typename HistoCounter> + __device__ __forceinline__ void Histogram( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + // Initialize histogram bin counts to zeros + InitHistogram(histogram); + + // Composite the histogram + InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram); + } + + + + /** + * \brief Updates an existing block-wide histogram in shared/global memory. Each thread composites an array of input elements. + * + * \smemreuse + * + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each + * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam HistoCounter <b>[inferred]</b> Histogram counter type + */ + template < + typename HistoCounter> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_load.cuh b/lib/kokkos/TPL/cub/block/block_load.cuh new file mode 100644 index 000000000..e645bcdce --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_load.cuh @@ -0,0 +1,1122 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for reading linear tiles of data into the CUDA thread block. 
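+ * (Provides the direct LoadBlocked, LoadStriped, LoadWarpStriped, and
+ *  LoadBlockedVectorized helpers as well as the configurable BlockLoad collective.)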
+ */ + +#pragma once + +#include <iterator> + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_vector.cuh" +#include "../thread/thread_load.cuh" +#include "block_exchange.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup IoModule + * @{ + */ + + +/******************************************************************//** + * \name Blocked I/O + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier. + * + * \blocked + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). + */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ThreadLoad<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM); + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier, guarded by range. + * + * \blocked + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). + */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM < bounds) + { + items[ITEM] = ThreadLoad<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM); + } + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier, guarded by range, with a fall-back assignment of out-of-bound elements.. + * + * \blocked + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). + */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items +{ + int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = (ITEM < bounds) ? + ThreadLoad<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM) : + oob_default; + } +} + + + +//@} end member group +/******************************************************************//** + * \name Striped I/O + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block using the specified cache modifier. + * + * \striped + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). + */ +template < + PtxLoadModifier MODIFIER, + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ThreadLoad<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid); + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block using the specified cache modifier, guarded by range + * + * \striped + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). 
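+ * (Out-of-range items are left unmodified; use the overload below that also takes
+ *  an \p oob_default value to assign a fill value to them instead.)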
+ */ +template < + PtxLoadModifier MODIFIER, + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int bounds = valid_items - linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM * BLOCK_THREADS < bounds) + { + items[ITEM] = ThreadLoad<MODIFIER>(block_itr + linear_tid + (ITEM * BLOCK_THREADS)); + } + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block using the specified cache modifier, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \striped + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). + */ +template < + PtxLoadModifier MODIFIER, + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items +{ + int bounds = valid_items - linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = (ITEM * BLOCK_THREADS < bounds) ? + ThreadLoad<MODIFIER>(block_itr + linear_tid + (ITEM * BLOCK_THREADS)) : + oob_default; + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped I/O + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block using the specified cache modifier. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). 
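+ * (Each warp loads its own contiguous tile of <tt>WARP_THREADS * ITEMS_PER_THREAD</tt>
+ *  items; within that tile, lane <em>i</em> loads the items at offsets <em>i</em>,
+ *  <em>i</em> + WARP_THREADS, <em>i</em> + 2 * WARP_THREADS, and so on.)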
+ */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); + int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; + int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ThreadLoad<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)); + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block using the specified cache modifier, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). + */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); + int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; + int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; + int bounds = valid_items - warp_offset - tid; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * PtxArchProps::WARP_THREADS) < bounds) + { + items[ITEM] = ThreadLoad<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)); + } + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block using the specified cache modifier, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorRA <b>[inferred]</b> The random-access iterator type for input (may be a simple pointer type). 
+ */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename InputIteratorRA> +__device__ __forceinline__ void LoadWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items +{ + int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); + int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; + int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; + int bounds = valid_items - warp_offset - tid; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ((ITEM * PtxArchProps::WARP_THREADS) < bounds) ? + ThreadLoad<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)) : + oob_default; + } +} + + + +//@} end member group +/******************************************************************//** + * \name Blocked, vectorized I/O + *********************************************************************/ +//@{ + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier. + * + * \blocked + * + * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam MODIFIER cub::PtxLoadModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to load. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + */ +template < + PtxLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void LoadBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? 
+ MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename VectorHelper<T, VEC_SIZE>::Type Vector; + + // Alias local data (use raw_items array here which should get optimized away to prevent conservative PTXAS lmem spilling) + T raw_items[ITEMS_PER_THREAD]; + + // Direct-load using vector types + LoadBlocked<MODIFIER>( + linear_tid, + reinterpret_cast<Vector *>(block_ptr), + reinterpret_cast<Vector (&)[VECTORS_PER_THREAD]>(raw_items)); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = raw_items[ITEM]; + } +} + + +//@} end member group + +/** @} */ // end group IoModule + + + +//----------------------------------------------------------------------------- +// Generic BlockLoad abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ +enum BlockLoadAlgorithm +{ + /** + * \par Overview + * + * A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is read + * directly from memory. The thread block reads items in a parallel "raking" fashion: thread<sub><em>i</em></sub> + * reads the <em>i</em><sup>th</sup> segment of consecutive elements. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_DIRECT, + + /** + * \par Overview + * + * A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is read directly + * from memory using CUDA's built-in vectorized loads as a coalescing optimization. + * The thread block reads items in a parallel "raking" fashion: thread<sub><em>i</em></sub> uses vector loads to + * read the <em>i</em><sup>th</sup> segment of consecutive elements. + * + * For example, <tt>ld.global.v4.s32</tt> instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector load width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p InputIteratorRA is not a simple pointer type + * - The block input offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_LOAD_VECTORIZE, + + /** + * \par Overview + * + * A [<em>striped arrangement</em>](index.html#sec5sec4) of data is read + * directly from memory and then is locally transposed into a + * [<em>blocked arrangement</em>](index.html#sec5sec4). The thread block + * reads items in a parallel "strip-mining" fashion: + * thread<sub><em>i</em></sub> reads items having stride \p BLOCK_THREADS + * between them. cub::BlockExchange is then used to locally reorder the items + * into a [<em>blocked arrangement</em>](index.html#sec5sec4). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. 
+ * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + */ + BLOCK_LOAD_TRANSPOSE, + + + /** + * \par Overview + * + * A [<em>warp-striped arrangement</em>](index.html#sec5sec4) of data is read + * directly from memory and then is locally transposed into a + * [<em>blocked arrangement</em>](index.html#sec5sec4). Each warp reads its own + * contiguous segment in a parallel "strip-mining" fashion: lane<sub><em>i</em></sub> + * reads items having stride \p WARP_THREADS between them. cub::BlockExchange + * is then used to locally reorder the items into a + * [<em>blocked arrangement</em>](index.html#sec5sec4). + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + */ + BLOCK_LOAD_WARP_TRANSPOSE, +}; + + +/** + * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec4) across a CUDA thread block.  + * \ingroup BlockModule + * + * \par Overview + * The BlockLoad class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * + * \par + * Optionally, BlockLoad can be specialized by different data movement strategies: + * -# <b>cub::BLOCK_LOAD_DIRECT</b>. A [<em>blocked arrangement</em>](index.html#sec5sec4) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) + * -# <b>cub::BLOCK_LOAD_VECTORIZE</b>. A [<em>blocked arrangement</em>](index.html#sec5sec4) + * of data is read directly from memory using CUDA's built-in vectorized loads as a + * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) + * -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>. A [<em>striped arrangement</em>](index.html#sec5sec4) + * of data is read directly from memory and is then locally transposed into a + * [<em>blocked arrangement</em>](index.html#sec5sec4). [More...](\ref cub::BlockLoadAlgorithm) + * -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>. A [<em>warp-striped arrangement</em>](index.html#sec5sec4) + * of data is read directly from memory and is then locally transposed into a + * [<em>blocked arrangement</em>](index.html#sec5sec4). [More...](\ref cub::BlockLoadAlgorithm) + * + * \tparam InputIteratorRA The input iterator type (may be a simple pointer type). + * \tparam BLOCK_THREADS The thread block size in threads. + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * \tparam MODIFIER <b>[optional]</b> cub::PtxLoadModifier cache modifier. default: cub::LOAD_DEFAULT. 
+ * \tparam WARP_TIME_SLICING <b>[optional]</b> For transposition-based cub::BlockLoadAlgorithm parameterizations that utilize shared memory: When \p true, only use enough shared memory for a single warp's worth of data, time-slicing the block-wide exchange over multiple synchronized rounds (default: false) + * + * \par A Simple Example + * \blockcollective{BlockLoad} + * \par + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for 128 threads owning 4 integer items each + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>. + * The set of \p thread_data across the block of threads in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>. + * + */ +template < + typename InputIteratorRA, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, + PtxLoadModifier MODIFIER = LOAD_DEFAULT, + bool WARP_TIME_SLICING = false> +class BlockLoad +{ +private: + + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + // Data type of input iterator + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Load helper + template <BlockLoadAlgorithm _POLICY, int DUMMY = 0> + struct LoadInternal; + + + /** + * BLOCK_LOAD_DIRECT specialization of load helper + */ + template <int DUMMY> + struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY> + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadBlocked<MODIFIER>(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, 
guarded by range, with a fall-back assignment of out-of-bound elements + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_VECTORIZE specialization of load helper + */ + template <int DUMMY> + struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY> + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Load( + T *block_ptr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadBlockedVectorized<MODIFIER>(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + template < + typename T, + typename _InputIteratorRA> + __device__ __forceinline__ void Load( + _InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadBlocked<MODIFIER>(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range (skips vectorization) + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_TRANSPOSE specialization of load helper + */ + template <int DUMMY> + struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY> + { + // BlockExchange utility type for keys + typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange; + + /// Shared memory storage layout type + typedef typename BlockExchange::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + __device__ __forceinline__ void Load( + 
InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items); + BlockExchange(temp_storage, linear_tid).StripedToBlocked(items); + } + + /// Load a linear segment of items from memory, guarded by range + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage, linear_tid).StripedToBlocked(items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage, linear_tid).StripedToBlocked(items); + } + + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper + */ + template <int DUMMY> + struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY> + { + enum + { + WARP_THREADS = PtxArchProps::WARP_THREADS + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange; + + /// Shared memory storage layout type + typedef typename BlockExchange::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadWarpStriped<MODIFIER>(linear_tid, block_itr, items); + BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items); + } + + /// Load a linear segment of items from memory, guarded by range + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T 
(&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items); + } + }; + + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef LoadInternal<ALGORITHM> InternalLoad; + + + /// Shared memory storage layout type + typedef typename InternalLoad::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + /// \smemstorage{BlockLoad} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockLoad( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
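+     *
+     * (Illustrative sketch, not from the original documentation: it assumes the \p BlockLoad
+     * typedef, \p temp_storage, \p d_data, and \p thread_data names from the class-level
+     * example above, and shows a linear identifier suitable for a 2D thread block.)
+     * \code
+     * int linear_tid = (threadIdx.y * blockDim.x) + threadIdx.x;
+     * BlockLoad(temp_storage, linear_tid).Load(d_data, thread_data);
+     * \endcode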
+ */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \blocked + * + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for 128 threads owning 4 integer items each + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>. + * The set of \p thread_data across the block of threads in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>. + * + */ + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. + * + * \blocked + * + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for 128 threads owning 4 integer items each + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5. + * The set of \p thread_data across the block of threads in those threads will be + * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] 
}</tt>, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). + * + */ + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \blocked + * + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for 128 threads owning 4 integer items each + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>, + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. + * The set of \p thread_data across the block of threads in those threads will be + * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + __device__ __forceinline__ void Load( + InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_radix_rank.cuh b/lib/kokkos/TPL/cub/block/block_radix_rank.cuh new file mode 100644 index 000000000..149a62c65 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_radix_rank.cuh @@ -0,0 +1,479 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_scan.cuh" +#include "../block/block_scan.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock. + * \ingroup BlockModule + * + * \par Overview + * Blah... + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam RADIX_BITS <b>[optional]</b> The number of radix bits per digit place (default: 5 bits) + * \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * + * \par Usage Considerations + * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). + * - Assumes a [<em>blocked arrangement</em>](index.html#sec5sec4) of elements across threads + * - \smemreuse{BlockRadixRank::TempStorage} + * + * \par Performance Considerations + * + * \par Algorithm + * These parallel radix ranking variants have <em>O</em>(<em>n</em>) work complexity and are implemented in XXX phases: + * -# blah + * -# blah + * + * \par Examples + * \par + * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys + * \code + * #include <cub/cub.cuh> + * + * template <int BLOCK_THREADS> + * __global__ void ExampleKernel(...) + * { + * + * \endcode + */ +template < + int BLOCK_THREADS, + int RADIX_BITS, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
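+    // (memoization trades higher register pressure for fewer shared memory reads, which is
+    //  generally profitable on sm_35 and newer; hence the architecture-dependent default)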
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte> +class BlockRadixRank +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + // Integer type for digit counters (to be packed into words of type PackedCounters) + typedef unsigned short DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), + unsigned long long, + unsigned int>::Type PackedCounter; + + enum + { + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE, + + LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // The number of packed counters per thread (plus one for padding) + RAKING_SEGMENT = COUNTER_LANES + 1, + + LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS, + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + }; + + + /// BlockScan type + typedef BlockScan<PackedCounter, BLOCK_THREADS, INNER_SCAN_ALGORITHM> BlockScan; + + + /// Shared memory storage layout type for BlockRadixRank + struct _TempStorage + { + // Storage for scanning local ranks + typename BlockScan::TempStorage block_scan; + + union + { + DigitCounter digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; + }; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Copy of raking segment, promoted to registers + PackedCounter cached_segment[RAKING_SEGMENT]; + + + /****************************************************************************** + * Templated iteration + ******************************************************************************/ + + // General template iteration + template <int COUNT, int MAX> + struct Iterate + { + /** + * Decode keys. Decodes the radix digit from the current digit place + * and increments the thread's corresponding counter in shared + * memory for that digit. + * + * Saves both (1) the prior value of that counter (the key's + * thread-local exclusive prefix sum for that digit), and (2) the shared + * memory offset of the counter (for later use). 
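+         *
+         * (Worked example, assuming the default RADIX_BITS = 5 and the default four-byte
+         * bank configuration, so that PACKING_RATIO = 2, LOG_PACKING_RATIO = 1, and
+         * LOG_COUNTER_LANES = 4: a key whose 5-bit digit at \p current_bit is 0b10110
+         * yields row_offset = 0b0110 = 6 and sub_counter = 1, i.e. the counter at
+         * digit_counters[6][linear_tid][1] is read and incremented.)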
+ */ + template <typename UnsignedBits, int KEYS_PER_THREAD> + static __device__ __forceinline__ void DecodeKeys( + BlockRadixRank &cta, // BlockRadixRank instance + UnsignedBits (&keys)[KEYS_PER_THREAD], // Key to decode + DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value (out parameter) + DigitCounter* (&digit_counters)[KEYS_PER_THREAD], // Counter smem offset (out parameter) + int current_bit) // The least-significant bit position of the current digit to extract + { + // Add in sub-counter offset + UnsignedBits sub_counter = BFE(keys[COUNT], current_bit + LOG_COUNTER_LANES, LOG_PACKING_RATIO); + + // Add in row offset + UnsignedBits row_offset = BFE(keys[COUNT], current_bit, LOG_COUNTER_LANES); + + // Pointer to smem digit counter + digit_counters[COUNT] = &cta.temp_storage.digit_counters[row_offset][cta.linear_tid][sub_counter]; + + // Load thread-exclusive prefix + thread_prefixes[COUNT] = *digit_counters[COUNT]; + + // Store inclusive prefix + *digit_counters[COUNT] = thread_prefixes[COUNT] + 1; + + // Iterate next key + Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit); + } + + + // Termination + template <int KEYS_PER_THREAD> + static __device__ __forceinline__ void UpdateRanks( + int (&ranks)[KEYS_PER_THREAD], // Local ranks (out parameter) + DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value + DigitCounter* (&digit_counters)[KEYS_PER_THREAD]) // Counter smem offset + { + // Add in threadblock exclusive prefix + ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT]; + + // Iterate next key + Iterate<COUNT + 1, MAX>::UpdateRanks(ranks, thread_prefixes, digit_counters); + } + }; + + + // Termination + template <int MAX> + struct Iterate<MAX, MAX> + { + // DecodeKeys + template <typename UnsignedBits, int KEYS_PER_THREAD> + static __device__ __forceinline__ void DecodeKeys( + BlockRadixRank &cta, + UnsignedBits (&keys)[KEYS_PER_THREAD], + DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], + DigitCounter* (&digit_counters)[KEYS_PER_THREAD], + int current_bit) {} + + + // UpdateRanks + template <int KEYS_PER_THREAD> + static __device__ __forceinline__ void UpdateRanks( + int (&ranks)[KEYS_PER_THREAD], + DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], + DigitCounter *(&digit_counters)[KEYS_PER_THREAD]) {} + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal storage allocator + */ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Performs upsweep raking reduction, returning the aggregate + */ + __device__ __forceinline__ PackedCounter Upsweep() + { + PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid]; + PackedCounter *raking_ptr; + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + raking_ptr = smem_raking_ptr; + } + + return ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum()); + } + + + /// Performs exclusive downsweep raking scan + __device__ __forceinline__ void ExclusiveDownsweep( + PackedCounter raking_partial) + { + PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? 
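+        // (rake the register-resident copy when memoizing; otherwise rake shared memory in place)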
+ cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++) + { + *((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Scan shared memory digit counters. + */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute inclusive sum + PackedCounter inclusive_partial; + PackedCounter packed_aggregate; + BlockScan(temp_storage.block_scan, linear_tid).InclusiveSum(raking_partial, inclusive_partial, packed_aggregate); + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + inclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + // Downsweep scan with exclusive partial + PackedCounter exclusive_partial = inclusive_partial - raking_partial; + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockRadixRank( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
+ */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit) ///< [in] The least-significant bit position of the current digit to extract + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + // Decode keys and update digit counters + Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit); + + __syncthreads(); + + // Scan shared memory counters + ScanCounters(); + + __syncthreads(); + + // Extract the local ranks of each key + Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters); + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int &inclusive_digit_prefix) ///< [out] The incluisve prefix sum for the digit threadIdx.x + { + // Rank keys + RankKeys(keys, ranks, current_bit); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS)) + { + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) + int counter_lane = (linear_tid & (COUNTER_LANES - 1)); + int sub_counter = linear_tid >> (LOG_COUNTER_LANES); + inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter]; + } + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/block/block_radix_sort.cuh b/lib/kokkos/TPL/cub/block/block_radix_sort.cuh new file mode 100644 index 000000000..873d40126 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_radix_sort.cuh @@ -0,0 +1,608 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  + * \ingroup BlockModule + * + * \par Overview + * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * <tt>unsigned char</tt>, \p int, \p double, etc. Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. 
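+ *
+ * \par
+ * (Illustrative note, not from the original documentation: the usual transformation of this
+ * kind flips the sign bit of signed integers so that negative keys order before positive
+ * ones, while IEEE floating-point keys have all of their bits inverted when negative and
+ * only the sign bit flipped otherwise.  A sketch for a 32-bit float key \p f:)
+ * \code
+ * unsigned int u = __float_as_uint(f);
+ * u = (u & 0x80000000u) ? ~u : (u | 0x80000000u);   // unsigned order now matches float order
+ * \endcode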
+ * + * \tparam Key Key type + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam Value <b>[optional]</b> Value type (default: cub::NullType) + * \tparam RADIX_BITS <b>[optional]</b> The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The + * corresponding output \p thread_keys in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * + */ +template < + typename Key, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + typename Value = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
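+    // (this default, together with RADIX_BITS, INNER_SCAN_ALGORITHM, and SMEM_CONFIG,
+    //  is forwarded unchanged to the underlying cub::BlockRadixRank)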
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + // Key traits and unsigned bits type + typedef NumericTraits<Key> KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// BlockRadixRank utility type + typedef BlockRadixRank<BLOCK_THREADS, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeValues; + + /// Shared memory storage layout type + struct _TempStorage + { + union + { + typename BlockRadixRank::TempStorage ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + }; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockRadixSort( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
+ */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs a block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys. + * + * \smemreuse + * + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. + * The corresponding output \p thread_keys in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + */ + __device__ __forceinline__ void Sort( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); + begin_bit += RADIX_BITS; + + __syncthreads(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); + + // Quit if done + if (begin_bit >= end_bit) break; + + __syncthreads(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + + + /** + * \brief Performs a block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys and values. + * + * BlockRadixSort can only accommodate one associated tile of values. 
To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * + * \smemreuse + * + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The + * corresponding output \p thread_keys in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>. + * + */ + __device__ __forceinline__ void Sort( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); + begin_bit += RADIX_BITS; + + __syncthreads(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); + + __syncthreads(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks); + + // Quit if done + if (begin_bit >= end_bit) break; + + __syncthreads(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs a radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec4). 
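+     *
+     * (A striped result is typically chosen when the sorted tile is to be written straight
+     * back to global memory, since striped accesses coalesce well.)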
+ * + * \smemreuse + * + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The + * corresponding output \p thread_keys in those threads will be + * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); + begin_bit += RADIX_BITS; + + __syncthreads(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); + + __syncthreads(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + + + /** + * \brief Performs a radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec4) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec4). + * + * BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. 
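+     *
+     * \par
+     * (Illustrative sketch of that pattern, not from the original documentation; it reuses
+     * the 128-thread, 4-item <tt>cub::BlockRadixSort<int, 128, 4, int></tt> configuration
+     * and the \p temp_storage name from the example below.)
+     * \code
+     * int thread_keys[4];
+     * int thread_idx[4];
+     * // ... load thread_keys ...
+     * #pragma unroll
+     * for (int i = 0; i < 4; ++i)
+     *     thread_idx[i] = (threadIdx.x * 4) + i;   // enumerate the original key indices
+     *
+     * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_idx);
+     *
+     * // thread_idx[i] now names the original position of the key held in slot i and can
+     * // serve as a gather-vector for any other per-item data associated with the keys.
+     * \endcode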
+ * + * \smemreuse + * + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>. The + * corresponding output \p thread_keys in those threads will be + * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); + begin_bit += RADIX_BITS; + + __syncthreads(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks); + + __syncthreads(); + + // Last pass exchanges through shared memory in striped arrangement + BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToStriped(values, ranks); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); + + __syncthreads(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks); + + __syncthreads(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + + + //@} end member group + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_raking_layout.cuh b/lib/kokkos/TPL/cub/block/block_raking_layout.cuh new file mode 100644 index 
000000000..878a786cd --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_raking_layout.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for raking across thread block data.  + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. 
+ * \tparam BLOCK_STRIPS When strip-mining, the number of threadblock-strips per tile + */ +template < + typename T, + int BLOCK_THREADS, + int BLOCK_STRIPS = 1> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and typedefs + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = + BLOCK_THREADS * BLOCK_STRIPS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = + CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = + (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = + (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Pad each segment length with one element if it evenly divides the number of banks + SEGMENT_PADDING = + (PtxArchProps::SMEM_BANKS % SEGMENT_LENGTH == 0) ? 1 : 0, + + /// Total number of elements in the raking grid + GRID_ELEMENTS = + RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the warp size) + UNGUARDED = + (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + typedef T TempStorage[BlockRakingLayout::GRID_ELEMENTS]; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + int linear_tid, + int block_strip = 0) + { + // Offset for partial + unsigned int offset = (block_strip * BLOCK_THREADS) + linear_tid; + + // Add in one padding element for every segment + if (SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + int linear_tid) + { + return temp_storage + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_reduce.cuh b/lib/kokkos/TPL/cub/block/block_reduce.cuh new file mode 100644 index 000000000..ffdff7377 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_reduce.cuh @@ -0,0 +1,563 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA threadblock. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm. Execution is comprised of + * three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div> + * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_REDUCE_RAKING, + + + /** + * \par Overview + * A quick "tiled warp-reductions" reduction algorithm. Execution is + * comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. 
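+ *
+ * \par
+ * Either variant is requested through the optional \p ALGORITHM template
+ * parameter of cub::BlockReduce. A minimal sketch (the type alias below is
+ * hypothetical):
+ * \code
+ * // Favor low turnaround latency on under-occupied launches
+ * typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
+ * \endcode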
+ * + * \par + * \image html block_scan_warpscans.png + * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div> + * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warp-reductions, it + * can often provide lower turnaround latencies when the GPU is + * under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.  + * \ingroup BlockModule + * + * \par Overview + * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * + * \par + * Optionally, BlockReduce can be specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# <b>cub::BLOCK_REDUCE_RAKING</b>. An efficient "raking" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm) + * -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>. A quick "tiled warp-reductions" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm) + * + * \tparam T Data type being reduced + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam ALGORITHM <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_RAKING) + * + * \par Performance Considerations + * - Very efficient (only one synchronization barrier). + * - Zero bank conflicts for most types. + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (<b><em>vs.</em></b> generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_THREADS, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_RAKING> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Internal specialization. + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + BlockReduceWarpReductions<T, BLOCK_THREADS>, + BlockReduceRaking<T, BLOCK_THREADS> >::Type InternalBlockReduce; + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockReduce( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
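+ *
+ * \par
+ * A minimal sketch (the 16x16 block shape is an assumption for illustration):
+ * \code
+ * // 2D thread block of 16x16 = 256 threads
+ * typedef cub::BlockReduce<int, 256> BlockReduce;
+ * __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ * int linear_tid = (threadIdx.y * blockDim.x) + threadIdx.x;
+ * BlockReduce block_reduce(temp_storage, linear_tid);
+ * \endcode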
+ */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes one input element. + * + * The return value is undefined in threads other than thread<sub>0</sub>. + * + * Supports non-commutative reduction operators. + * + * \smemreuse + * + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * The return value is undefined in threads other than thread<sub>0</sub>. + * + * Supports non-commutative reduction operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Reduce partials + T partial = ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * The return value is undefined in threads other than thread<sub>0</sub>. + * + * Supports non-commutative reduction operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * if (threadIdx.x < num_valid) thread_data = ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); + * + * \endcode + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage, linear_tid).template Reduce<true>(input, num_valid, reduction_op); + } + else + { + return InternalBlockReduce(temp_storage, linear_tid).template Reduce<false>(input, num_valid, reduction_op); + } + } + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes one input element. + * + * The return value is undefined in threads other than thread<sub>0</sub>. + * + * \smemreuse + * + * The code snippet below illustrates a sum reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, BLOCK_THREADS); + } + + /** + * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * + * The return value is undefined in threads other than thread<sub>0</sub>. + * + * \smemreuse + * + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + */ + template <int ITEMS_PER_THREAD> + __device__ __forceinline__ T Sum( + T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + { + // Reduce partials + T partial = ThreadReduce(inputs, cub::Sum()); + return Sum(partial); + } + + + /** + * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * + * The return value is undefined in threads other than thread<sub>0</sub>. + * + * \smemreuse + * + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for 128 threads on type int + * typedef cub::BlockReduce<int, 128> BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item (up to num_items) + * int thread_data; + * if (threadIdx.x < num_valid) + * thread_data = ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage, linear_tid).template Sum<true>(input, num_valid); + } + else + { + return InternalBlockReduce(temp_storage, linear_tid).template Sum<false>(input, num_valid); + } + } + + + //@} end member group +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_scan.cuh b/lib/kokkos/TPL/cub/block/block_scan.cuh new file mode 100644 index 000000000..1c1a2dac8 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_scan.cuh @@ -0,0 +1,2233 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. + * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_raking.png + * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div> + * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_SCAN_RAKING, + + + /** + * \par Overview + * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at + * the expense of higher register pressure. Raking threads preserve their + * "upsweep" segment of values in registers while performing warp-synchronous + * scan, allowing the "downsweep" not to re-read them from shared memory. + */ + BLOCK_SCAN_RAKING_MEMOIZE, + + + /** + * \par Overview + * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. + * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_warpscans.png + * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div> + * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warpscans, it can + * often provide lower turnaround latencies when the GPU is under-occupied. 
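+ *
+ * \par
+ * This (or any other) variant is requested through the optional \p ALGORITHM
+ * template parameter of cub::BlockScan. A minimal sketch (the type alias is
+ * hypothetical):
+ * \code
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT;
+ * \endcode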
+ */ + BLOCK_SCAN_WARP_SCANS, +}; + + +/****************************************************************************** + * Block scan + ******************************************************************************/ + +/** + * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.  + * \ingroup BlockModule + * + * \par Overview + * Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. <em>Prefix sum</em> + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input. + * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into + * the <em>i</em><sup>th</sup> output reduction. + * + * \par + * Optionally, BlockScan can be specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# <b>cub::BLOCK_SCAN_RAKING</b>. An efficient "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>. A quick "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * + * \tparam T Data type being scanned + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam ALGORITHM <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * + * \par A Simple Example + * \blockcollective{BlockScan} + * \par + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>. 
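+ *
+ * \par
+ * A small worked illustration of the inclusive/exclusive distinction described
+ * above (input values chosen arbitrarily):
+ * \code
+ * // input:          1  2  3  4
+ * // inclusive sum:  1  3  6 10    (i-th output includes the i-th input)
+ * // exclusive sum:  0  1  3  6    (i-th output excludes the i-th input)
+ * \endcode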
+ * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Uses only one or two block-wide synchronization barriers (depending on + * algorithm selection) + * - Zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Prefix sum variants (<b><em>vs.</em></b> generic scan) + * - Exclusive variants (<b><em>vs.</em></b> inclusive) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_THREADS, + BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING> +class BlockScan +{ +private: + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /** + * Ensure the template parameterization meets the requirements of the + * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy + * cannot be used with threadblock sizes not a multiple of the + * architectural warp size. + */ + static const BlockScanAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % PtxArchProps::WARP_THREADS != 0)) ? + BLOCK_SCAN_RAKING : + ALGORITHM; + + /// Internal specialization. + typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), + BlockScanWarpScans<T, BLOCK_THREADS>, + BlockScanRaking<T, BLOCK_THREADS, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE)> >::Type InternalBlockScan; + + + /// Shared memory storage layout type for BlockScan + typedef typename InternalBlockScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. 
+ */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockScan( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. + */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>. The + * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T block_aggregate; + InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>. The + * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>. + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockScan for 128 threads + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>. + * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>. + * The output for the second segment will be <tt>128, 129, ..., 255</tt>. Furthermore, + * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. + * + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template <typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate, block_prefix_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>. The + * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>. 
+ * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + */ + template <int ITEMS_PER_THREAD> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>. The + * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + */ + template <int ITEMS_PER_THREAD> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial, block_aggregate); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ * + * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>. + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad; + * typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore; + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * __syncthreads(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * __syncthreads(); + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>. + * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>. + * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>. Furthermore, + * the value \p 512 will be stored in \p block_aggregate for all threads after each scan. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. The + * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. 
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. The
+ * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ const T &identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an exclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ?
block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for 128 threads + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>. + * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>. Furthermore, + * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second + * scan, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan operator + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, identity, scan_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>. The + * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ const T &identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an exclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad; + * typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore; + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * __syncthreads(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * __syncthreads(); + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>. + * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>. Furthermore, + * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second + * scan, etc. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_op);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial);
+ }
+
+
+ //@} end member group
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+ /******************************************************************//**
+ * \name Exclusive prefix scan operations (identityless, single datum per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ T block_aggregate;
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ * + * \blocked + * + * \smemreuse + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (identityless, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ // Reduce consecutive thread items in registers
+ T thread_partial = ThreadReduce(input, scan_op);
+
+ // Exclusive threadblock-scan
+ ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
+
+ // Exclusive scan in registers with prefix
+ ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + //@} end member group + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>. The + * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T block_aggregate; + InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>. The + * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
+ * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>. + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for 128 threads + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>. + * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>. + * The output for the second segment will be <tt>129, 130, ..., 256</tt>. 
Furthermore, + * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. + * + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template <typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate, block_prefix_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>. The + * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + */ + template <int ITEMS_PER_THREAD> + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>. The + * corresponding output \p thread_data in those threads will be + * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <int ITEMS_PER_THREAD> + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial, block_aggregate); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>. + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. 
Each tile consists
+ * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4)
+ * across 128 threads where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += block_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ * typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * __syncthreads();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * int block_aggregate;
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, block_aggregate, prefix_op);
+ * __syncthreads();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * __syncthreads();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+ * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
+ * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>. Furthermore,
+ * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
+ *
+ * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value)
+ BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate, block_prefix_op); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial); + } + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. The + * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, output, scan_op, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. 
The
+ * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+ * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the first warp of threads in the block, however only the return value from
+ * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \blocked
+ *
+ * \smemreuse
+ *
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the first warp of threads in the block.
+ * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ * __device__ int operator()(int block_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * { + * // Specialize BlockScan for 128 threads + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>. + * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>. Furthermore, + * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second + * scan, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>. The + * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec4) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for 128 threads on type int + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>. + * The corresponding output \p thread_data in those threads will be + * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * The \p block_prefix_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>. + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the block-wide prefix. Can be stateful. + * + * Supports non-commutative scan operators. + * + * \blocked + * + * \smemreuse + * + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread + * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad; + * typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore; + * typedef cub::BlockScan<int, 128> BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * __syncthreads(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * __syncthreads(); + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>. + * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>. Furthermore, + * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second + * scan, etc. + * + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam BlockPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt> + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial); + } + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/block_store.cuh b/lib/kokkos/TPL/cub/block/block_store.cuh new file mode 100644 index 000000000..fb990de1c --- /dev/null +++ b/lib/kokkos/TPL/cub/block/block_store.cuh @@ -0,0 +1,926 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include <iterator> + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_vector.cuh" +#include "../thread/thread_store.cuh" +#include "block_exchange.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup IoModule + * @{ + */ + + +/******************************************************************//** + * \name Blocked I/O + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier. + * + * \blocked + * + * \tparam MODIFIER cub::PtxStoreModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to store. 
+ * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type). + */ +template < + PtxStoreModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorRA> +__device__ __forceinline__ void StoreBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + ThreadStore<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]); + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier, guarded by range + * + * \blocked + * + * \tparam MODIFIER cub::PtxStoreModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to store. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type). + */ +template < + PtxStoreModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorRA> +__device__ __forceinline__ void StoreBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + ThreadStore<MODIFIER>(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]); + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Striped I/O + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier. + * + * \striped + * + * \tparam MODIFIER cub::PtxStoreModifier cache modifier. + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T <b>[inferred]</b> The data type to store. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type). 
+ */ +template < + PtxStoreModifier MODIFIER, + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorRA> +__device__ __forceinline__ void StoreStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + ThreadStore<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]); + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range + * + * \striped + * + * \tparam MODIFIER cub::PtxStoreModifier cache modifier. + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T <b>[inferred]</b> The data type to store. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type). + */ +template < + PtxStoreModifier MODIFIER, + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorRA> +__device__ __forceinline__ void StoreStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + ThreadStore<MODIFIER>(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]); + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped I/O + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam MODIFIER cub::PtxStoreModifier cache modifier. + * \tparam T <b>[inferred]</b> The data type to store. + * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorRA <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type). 
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    int tid         = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+    int wid         = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+    int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        ThreadStore<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
+    }
+}
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorRA     <b>[inferred]</b> The random-access iterator type for output (may be a simple pointer type).
+ */
+template <
+    PtxStoreModifier    MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorRA>
+__device__ __forceinline__ void StoreWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
+    OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    int tid         = linear_tid & (PtxArchProps::WARP_THREADS - 1);
+    int wid         = linear_tid >> PtxArchProps::LOG_WARP_THREADS;
+    int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS) < valid_items)
+        {
+            ThreadStore<MODIFIER>(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]);
+        }
+    }
+}
+
+
+
+//@} end member group
+/******************************************************************//**
+ * \name Blocked, vectorized I/O
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier.
+ *
+ * \blocked
+ *
+ * The output pointer \p block_ptr must be quad-item aligned,
+ * which is the default alignment returned by \p cudaMalloc().
+ *
+ * \par
+ * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam MODIFIER             cub::PtxStoreModifier cache modifier.
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD <b>[inferred]</b> The number of consecutive items partitioned onto each thread. + * + */ +template < + PtxStoreModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? + MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename VectorHelper<T, VEC_SIZE>::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast<Vector *>(block_ptr); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast<T*>(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreBlocked<MODIFIER>(linear_tid, block_ptr_vectors, raw_vector); +} + + +//@} end member group + + +/** @} */ // end group IoModule + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written + * directly to memory. The thread block writes items in a parallel "raking" fashion: + * thread<sub><em>i</em></sub> writes the <em>i</em><sup>th</sup> segment of consecutive elements. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * The thread block writes items in a parallel "raking" fashion: thread<sub><em>i</em></sub> uses vector stores to + * write the <em>i</em><sup>th</sup> segment of consecutive elements. + * + * For example, <tt>st.global.v4.s32</tt> instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). 
+ *   - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
+ *     - \p ITEMS_PER_THREAD is odd
+ *     - The \p OutputIteratorRA is not a simple pointer type
+ *     - The block output offset is not quadword-aligned
+ *     - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ */
+    BLOCK_STORE_VECTORIZE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec4) is locally
+     * transposed into a [<em>striped arrangement</em>](index.html#sec5sec4)
+     * which is then written to memory.  More specifically, cub::BlockExchange is
+     * used to locally reorder the items into a
+     * [<em>striped arrangement</em>](index.html#sec5sec4), after which the
+     * thread block writes items in a parallel "strip-mining" fashion: consecutive
+     * items owned by thread<sub><em>i</em></sub> are written to memory with
+     * stride \p BLOCK_THREADS between them.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly higher latency and lower throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec4) is locally
+     * transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec4)
+     * which is then written to memory.  More specifically, cub::BlockExchange is used
+     * to locally reorder the items into a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec4), after which
+     * each warp writes its own contiguous segment in a parallel "strip-mining" fashion:
+     * consecutive items owned by lane<sub><em>i</em></sub> are written to memory
+     * with stride \p WARP_THREADS between them.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly higher latency and lower throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE,
+};
+
+
+
+/**
+ * \addtogroup BlockModule
+ * @{
+ */
+
+
+/**
+ * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec4) of items partitioned across a CUDA thread block to a linear segment of memory.
+ *
+ * \par Overview
+ * The BlockStore class provides a single data movement abstraction that can be specialized
+ * to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
+ * performance policies for different architectures, data types, granularity sizes, etc.
+ *
+ * \par Optionally, BlockStore can be specialized by different data movement strategies:
+ * -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4) of data is written
+ *    directly to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ * -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4)
+ *    of data is written directly to memory using CUDA's built-in vectorized stores as a
+ *    coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
+ * -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4)
+ *    is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec4) which is
+ *    then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ * -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec4)
+ *    is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec4) which is
+ *    then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ *
+ * \tparam OutputIteratorRA    The output iterator type (may be a simple pointer type).
+ * \tparam BLOCK_THREADS       The thread block size in threads.
+ * \tparam ITEMS_PER_THREAD    The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM           <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
+ * \tparam MODIFIER            <b>[optional]</b> cub::PtxStoreModifier cache modifier.  default: cub::STORE_DEFAULT.
+ * \tparam WARP_TIME_SLICING   <b>[optional]</b> For transposition-based cub::BlockStoreAlgorithm parameterizations that utilize shared memory: When \p true, only use enough shared memory for a single warp's worth of data, time-slicing the block-wide exchange over multiple synchronized rounds (default: false)
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockStore}
+ * \par
+ * The code snippet below illustrates the storing of a "blocked" arrangement
+ * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+ * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+ * meaning items are locally reordered among threads so that memory references will be
+ * efficiently coalesced using a warp-striped access pattern.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockStore for 128 threads owning 4 integer items each
+ *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+ *
+ *     // Allocate shared memory for BlockStore
+ *     __shared__ typename BlockStore::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Store items to linear memory
+ *     BlockStore(temp_storage).Store(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of \p thread_data across the block of threads is
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
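[Editorial note: the snippet above is device-side only. The sketch below shows, under stated assumptions, how such a kernel might be launched from the host; ExampleKernel and d_data are placeholders taken from the example, one block of 128 threads writing 128 * 4 = 512 ints. This is an illustration, not part of the CUB headers.]

// Hypothetical host-side launch for the 128-thread / 4-items-per-thread example above.
#include <cuda_runtime.h>

int main()
{
    int *d_data;
    cudaMalloc(&d_data, 512 * sizeof(int));    // one block consumes 512 ints
    ExampleKernel<<<1, 128>>>(d_data);         // grid of 1 block, 128 threads per block
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}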
+ * + */ +template < + typename OutputIteratorRA, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + PtxStoreModifier MODIFIER = STORE_DEFAULT, + bool WARP_TIME_SLICING = false> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + // Data type of input iterator + typedef typename std::iterator_traits<OutputIteratorRA>::value_type T; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template <BlockStoreAlgorithm _POLICY, int DUMMY = 0> + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template <int DUMMY> + struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY> + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreBlocked<MODIFIER>(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreBlocked<MODIFIER>(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template <int DUMMY> + struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY> + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreBlockedVectorized<MODIFIER>(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template <typename _OutputIteratorRA> + __device__ __forceinline__ void Store( + _OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreBlocked<MODIFIER>(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreBlocked<MODIFIER>(linear_tid, block_itr, items, 
valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template <int DUMMY> + struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY> + { + // BlockExchange utility type for keys + typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange; + + /// Shared memory storage layout type + typedef typename BlockExchange::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreStriped<MODIFIER, BLOCK_THREADS>(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template <int DUMMY> + struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY> + { + enum + { + WARP_THREADS = PtxArchProps::WARP_THREADS + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange<T, BLOCK_THREADS, ITEMS_PER_THREAD, WARP_TIME_SLICING> BlockExchange; + + /// Shared memory storage layout type + typedef typename BlockExchange::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreWarpStriped<MODIFIER>(linear_tid, block_itr, items, valid_items); + } + }; + + 
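[Editorial note: the StoreInternal specializations above are selected purely at compile time through the ALGORITHM template parameter of BlockStore. A minimal sketch of two parameterizations that resolve to different specializations follows; the kernel name StoreSketch and the output pointer d_out are hypothetical, not part of CUB.]

// Sketch only: both typedefs use 128 threads and 4 items per thread; the last
// template argument picks the StoreInternal specialization defined above.
#include <cub/cub.cuh>

typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_DIRECT>         DirectStore;     // no shared memory (shown for contrast)
typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> TransposedStore; // BlockExchange + warp-striped writes

__global__ void StoreSketch(int *d_out)   // d_out must hold at least 512 ints per block
{
    __shared__ typename TransposedStore::TempStorage temp_storage;

    // Each thread owns four consecutive items in a blocked arrangement
    int items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = threadIdx.x * 4 + i;

    // Blocked -> warp-striped reordering in shared memory, then coalesced stores
    TransposedStore(temp_storage).Store(d_out, items);
}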
/****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal<ALGORITHM> InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(threadIdx.x) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier + */ + __device__ __forceinline__ BlockStore( + int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(PrivateStorage()), + linear_tid(linear_tid) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int linear_tid) ///< [in] <b>[optional]</b> A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. 
+     *
+     * \blocked
+     *
+     * The code snippet below illustrates the storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     *
+     */
+    __device__ __forceinline__ void Store(
+        OutputIteratorRA    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \blocked
+     *
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
+     * only the first two threads being unmasked to store portions of valid data.
+ * + */ + __device__ __forceinline__ void Store( + OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + +/** @} */ // end group BlockModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh b/lib/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 000000000..ecc980098 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,85 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
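[Editorial note: a rough usage sketch of the atomic histogram specialization declared below, assuming a hypothetical kernel with 128 threads, 4 samples per thread, and 256 bins kept in shared memory; the kernel and buffer names are placeholders and the sketch is not taken from CUB.]

// Hypothetical kernel: one atomicAdd per sample into a shared-memory histogram.
#include <cub/cub.cuh>

__global__ void HistogramSketch(unsigned char *d_samples, unsigned int *d_histogram)
{
    // 128 threads, 4 samples per thread, 256 bins
    typedef cub::BlockHistogramAtomic<unsigned char, 128, 4, 256> BlockHisto;
    __shared__ typename BlockHisto::TempStorage temp_storage;   // empty for the atomic variant
    __shared__ unsigned int bins[256];

    // Zero the shared histogram (each thread clears two bins)
    bins[threadIdx.x]       = 0;
    bins[threadIdx.x + 128] = 0;
    __syncthreads();

    // Each thread reads four consecutive samples
    unsigned char items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_samples[threadIdx.x * 4 + i];

    // Composite the samples into the shared histogram via atomicAdd
    BlockHisto(temp_storage, threadIdx.x).Composite(items, bins);
    __syncthreads();

    // Write the block-wide result back to global memory
    d_histogram[threadIdx.x]       = bins[threadIdx.x];
    d_histogram[threadIdx.x + 128] = bins[threadIdx.x + 128];
}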
+ */ +template < + typename T, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + int BINS> +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage, + int linear_tid) + {} + + + /// Composite data onto an existing histogram + template < + typename HistoCounter> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh b/lib/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh new file mode 100644 index 000000000..e81edec6c --- /dev/null +++ b/lib/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,197 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_THREADS, + int ITEMS_PER_THREAD, + int BINS> +struct BlockHistogramSort +{ + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort<T, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT; + + // Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename HistoCounter> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort, linear_tid).Sort(items); + + __syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + __syncthreads(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag, linear_tid).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 
+ histogram[thread_offset] += count; + } + // Finish up with guarded composition if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh b/lib/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 000000000..434d25a87 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,214 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock + */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock + */ +template < + typename T, ///< Data type being reduced + int BLOCK_THREADS> ///< The thread block size in threads +struct BlockReduceRaking +{ + /// Layout type for padded threadblock raking grid + typedef BlockRakingLayout<T, BLOCK_THREADS, 1> BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce<T, 1, BlockRakingLayout::RAKING_THREADS>::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = ((RAKING_THREADS & (RAKING_THREADS - 1)) == 0), + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template <bool FULL_TILE> + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum<FULL_TILE, SEGMENT_LENGTH>( + partial, + num_valid); + } + else + { + // Place partial into shared memory grid. 
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + #pragma unroll + for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++) + { + // Update partial if addend is in range + if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid)) + { + partial = reduction_op(partial, raking_segment[ITEM]); + } + } + + partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>( + partial, + num_valid); + } + } + + return partial; + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce<FULL_TILE, SEGMENT_LENGTH>( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + #pragma unroll + for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++) + { + // Update partial if addend is in range + if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid)) + { + partial = reduction_op(partial, raking_segment[ITEM]); + } + } + + partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>( + partial, + num_valid, + reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh b/lib/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 000000000..0e316dd17 --- /dev/null +++ b/lib/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,198 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock + */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock + */ +template < + typename T, ///< Data type being reduced + int BLOCK_THREADS> ///< The thread block size in threads +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// Number of active warps + WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the threadblock size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce<T, WARPS, LOGICAL_WARP_SIZE>::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire threadblock + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid), + warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? + 0 : + linear_tid / PtxArchProps::WARP_THREADS), + lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? + linear_tid : + linear_tid % PtxArchProps::WARP_THREADS) + {} + + + /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>. 
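The warp-reductions specialization has each warp reduce its own 32-thread segment and then combines the per-warp aggregates contributed by the lane-0 threads. Through the public interface this corresponds to the generic Reduce() call; a sketch with a partially full tile of num_valid items, assuming the standard cub::BlockReduce API (kernel and buffer names are illustrative):

    #include <cub/cub.cuh>

    __global__ void BlockMaxKernel(const int *d_in, int *d_out, int num_valid)
    {
        typedef cub::BlockReduce<int, 128> BlockReduce;
        __shared__ typename BlockReduce::TempStorage temp_storage;

        // num_valid is assumed to be <= 128; threads beyond num_valid still
        // call the collective, but their inputs are ignored by the guarded path.
        int thread_data = (threadIdx.x < num_valid) ? d_in[threadIdx.x] : 0;

        // Block-wide maximum over the first num_valid inputs (valid in thread 0 only).
        int block_max = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);

        if (threadIdx.x == 0)
            d_out[blockIdx.x] = block_max;
    }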
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub>s only]</b> Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + __syncthreads(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + #pragma unroll + for (int SUCCESSOR_WARP = 1; SUCCESSOR_WARP < WARPS; SUCCESSOR_WARP++) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[SUCCESSOR_WARP]); + } + } + } + + return warp_aggregate; + } + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template <bool FULL_TILE> + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread<sub>0</sub>. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE); + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? 
+ num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh b/lib/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 000000000..75e15d95c --- /dev/null +++ b/lib/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,761 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. 
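User code reaches this raking-scan specialization through the public cub::BlockScan collective. A minimal exclusive prefix-sum sketch that also returns the block-wide aggregate, assuming the standard cub::BlockScan API (kernel and pointer names are illustrative):

    #include <cub/cub.cuh>

    __global__ void BlockPrefixSumKernel(const int *d_in, int *d_out)
    {
        typedef cub::BlockScan<int, 128> BlockScan;
        __shared__ typename BlockScan::TempStorage temp_storage;

        int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
        int block_aggregate;

        // Exclusive prefix sum across the block; every thread also
        // receives the total of all 128 inputs in block_aggregate.
        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);

        d_out[blockIdx.x * 128 + threadIdx.x] = thread_data;
    }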
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_THREADS, ///< The thread block size in threads + bool MEMOIZE> ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure +struct BlockScanRaking +{ + /// Layout type for padded threadblock raking grid + typedef BlockRakingLayout<T, BLOCK_THREADS> BlockRakingLayout; + + /// Constants + enum + { + /// Number of active warps + WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, + + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan<T, 1, RAKING_THREADS> WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Performs upsweep raking reduction, returning the aggregate + template <typename ScanOp> + __device__ __forceinline__ T Upsweep( + ScanOp scan_op) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + T *raking_ptr; + + if (MEMOIZE) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < SEGMENT_LENGTH; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + raking_ptr = smem_raking_ptr; + } + + T raking_partial = raking_ptr[0]; + + #pragma unroll + for (int i = 1; i < SEGMENT_LENGTH; i++) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + i) < BLOCK_THREADS)) + { + raking_partial = scan_op(raking_partial, raking_ptr[i]); + } + } + + return raking_partial; + } + + + /// Performs exclusive downsweep raking scan + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + T *raking_ptr = (MEMOIZE) ? + cached_segment : + smem_raking_ptr; + + ThreadScanExclusive<SEGMENT_LENGTH>(raking_ptr, raking_ptr, scan_op, raking_partial, apply_prefix); + + if (MEMOIZE) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < SEGMENT_LENGTH; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /// Performs inclusive downsweep raking scan + template <typename ScanOp> + __device__ __forceinline__ void InclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + T *raking_ptr = (MEMOIZE) ? 
+ cached_segment : + smem_raking_ptr; + + ThreadScanInclusive<SEGMENT_LENGTH>(raking_ptr, raking_ptr, scan_op, raking_partial, apply_prefix); + + if (MEMOIZE) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < SEGMENT_LENGTH; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + input, + output, + identity, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + raking_partial, + raking_partial, + identity, + scan_op, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + input, + output, + identity, + scan_op, + block_aggregate, + block_prefix_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + raking_partial, + raking_partial, + identity, + scan_op, + temp_storage.block_aggregate, + block_prefix_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined. + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + input, + output, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
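The block_prefix_op callback referenced in these overloads is an ordinary functor: the first warp invokes it with the block-wide aggregate, and the value returned by lane 0 seeds the scan. A sketch of such a functor carrying a running total across tiles (the struct and member names are illustrative and not part of CUB):

    // Stateful callback functor: returns the current running total as the
    // prefix for this tile and then folds the tile's aggregate into it.
    struct RunningPrefixOp
    {
        int running_total;

        __device__ RunningPrefixOp(int initial) : running_total(initial) {}

        __device__ int operator()(int block_aggregate)
        {
            int old_prefix = running_total;
            running_total += block_aggregate;
            return old_prefix;   // becomes the seed for this tile's scan
        }
    };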
+ template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + input, + output, + scan_op, + block_aggregate, + block_prefix_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate, + block_prefix_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( + input, + output, + block_aggregate); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. 
Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( + input, + output, + block_aggregate, + block_prefix_op); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate, + block_prefix_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
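An inclusive scan with a user-supplied operator follows the same pattern as the exclusive forms; a sketch of a block-wide running maximum, assuming the standard cub::BlockScan API (kernel and pointer names are illustrative):

    #include <cub/cub.cuh>

    __global__ void BlockRunningMaxKernel(const int *d_in, int *d_out)
    {
        typedef cub::BlockScan<int, 128> BlockScan;
        __shared__ typename BlockScan::TempStorage temp_storage;

        int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
        int block_aggregate;

        // Inclusive running maximum; block_aggregate holds the overall maximum.
        BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);

        d_out[blockIdx.x * 128 + threadIdx.x] = thread_data;
    }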
+ template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan( + input, + output, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan( + input, + output, + scan_op, + block_aggregate, + block_prefix_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate, + block_prefix_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum( + input, + output, + block_aggregate); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
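Combined with a running-prefix functor such as the RunningPrefixOp sketched earlier, these prefix-seeded overloads let a single thread block scan an input larger than one tile. A sketch of that loop, assuming the standard cub::BlockScan API and the illustrative functor above:

    #include <cub/cub.cuh>

    // Scans num_items integers with a single 128-thread block, one
    // 128-item tile at a time, carrying the running total between tiles.
    __global__ void MultiTileExclusiveSum(int *d_data, int num_items)
    {
        typedef cub::BlockScan<int, 128> BlockScan;
        __shared__ typename BlockScan::TempStorage temp_storage;

        RunningPrefixOp prefix_op(0);   // illustrative functor, sketched above

        for (int tile_base = 0; tile_base < num_items; tile_base += 128)
        {
            int idx  = tile_base + threadIdx.x;
            int item = (idx < num_items) ? d_data[idx] : 0;
            int block_aggregate;

            // The first warp invokes prefix_op with the tile aggregate;
            // lane 0's return value seeds this tile's exclusive scan.
            BlockScan(temp_storage).ExclusiveSum(item, item, block_aggregate, prefix_op);

            if (idx < num_items)
                d_data[idx] = item;

            __syncthreads();   // temp_storage is reused in the next iteration
        }
    }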
+ template <typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum( + input, + output, + block_aggregate, + block_prefix_op); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Warp synchronous scan + WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate, + block_prefix_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh b/lib/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 000000000..f7af3613d --- /dev/null +++ b/lib/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,342 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + */ +template < + typename T, + int BLOCK_THREADS> +struct BlockScanWarpScans +{ + /// Constants + enum + { + /// Number of active warps + WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan<T, WARPS, PtxArchProps::WARP_THREADS> WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire threadblock + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid), + warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? + 0 : + linear_tid / PtxArchProps::WARP_THREADS), + lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? + linear_tid : + linear_tid % PtxArchProps::WARP_THREADS) + {} + + + /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also returns block-wide aggregate in <em>thread</em><sub>0</sub>. + template <typename ScanOp> + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] <b>[<em>lane</em><sub>0</sub>s only]</b> Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid = true) ///< [in] Whether or not the partial belonging to the current thread is valid + { + // Share lane aggregates + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + __syncthreads(); + + block_aggregate = temp_storage.warp_aggregates[0]; + + #pragma unroll + for (int WARP = 1; WARP < WARPS; WARP++) + { + if (warp_id == WARP) + { + partial = (lane_valid) ? 
+ scan_op(block_aggregate, partial) : // fold it in our valid partial + block_aggregate; // replace our invalid partial with the aggregate + } + + block_aggregate = scan_op(block_aggregate, temp_storage.warp_aggregates[WARP]); + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T warp_aggregate; + WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate); + + // Update outputs and block_aggregate with warp-wide aggregates + ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveScan(input, output, identity, scan_op, block_aggregate); + + // Compute and share threadblock prefix + if (warp_id == 0) + { + temp_storage.block_prefix = block_prefix_op(block_aggregate); + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + output = scan_op(temp_storage.block_prefix, output); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined. 
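The warp-scans specialization composes results in two levels: each warp scans its own inputs, and ApplyWarpAggregates then folds the preceding warps' totals into every thread's partial while accumulating the block-wide aggregate. A small host-side sketch of that sequential fold for four warp aggregates (purely illustrative, not library code):

    #include <cstdio>

    int main()
    {
        // Per-warp totals as produced by the warp-level scans (example values).
        int warp_aggregates[4] = {10, 7, 3, 5};

        // A thread in warp W has the running total of warps 0..W-1 folded
        // into its partial, mirroring the loop in ApplyWarpAggregates.
        int block_aggregate = warp_aggregates[0];
        for (int warp = 1; warp < 4; ++warp)
        {
            printf("warp %d receives prefix %d\n", warp, block_aggregate);
            block_aggregate += warp_aggregates[warp];
        }
        printf("block aggregate = %d\n", block_aggregate);   // prints 25
        return 0;
    }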
+ template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T warp_aggregate; + WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate); + + // Update outputs and block_aggregate with warp-wide aggregates + ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate, (lane_id > 0)); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveScan(input, output, scan_op, block_aggregate); + + // Compute and share threadblock prefix + if (warp_id == 0) + { + temp_storage.block_prefix = block_prefix_op(block_aggregate); + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + output = (linear_tid == 0) ? + temp_storage.block_prefix : + scan_op(temp_storage.block_prefix, output); + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T warp_aggregate; + WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveSum(input, output, warp_aggregate); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate); + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template <typename BlockPrefixOp> + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveSum(input, output, block_aggregate); + + // Compute and share threadblock prefix + if (warp_id == 0) + { + temp_storage.block_prefix = block_prefix_op(block_aggregate); + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + Sum scan_op; + output = scan_op(temp_storage.block_prefix, output); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T warp_aggregate; + WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate); + + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + InclusiveScan(input, output, scan_op, block_aggregate); + + // Compute and share threadblock prefix + if (warp_id == 0) + { + temp_storage.block_prefix = block_prefix_op(block_aggregate); + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + output = scan_op(temp_storage.block_prefix, output); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
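Which scan specialization is used is normally chosen through the algorithm template parameter of the public collective. A sketch assuming cub::BlockScan accepts a BlockScanAlgorithm parameter with the BLOCK_SCAN_RAKING and BLOCK_SCAN_WARP_SCANS enumerators (their exact availability in this vendored snapshot is an assumption):

    #include <cub/cub.cuh>

    __global__ void WarpScansPrefixSum(const int *d_in, int *d_out)
    {
        // Latency-oriented variant backed by BlockScanWarpScans; switching the
        // last template argument to cub::BLOCK_SCAN_RAKING selects the raking path.
        typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
        __shared__ typename BlockScan::TempStorage temp_storage;

        int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);

        d_out[blockIdx.x * 128 + threadIdx.x] = thread_data;
    }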
+ __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T warp_aggregate; + WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template <typename BlockPrefixOp> + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) + BlockPrefixOp &block_prefix_op) ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + InclusiveSum(input, output, block_aggregate); + + // Compute and share threadblock prefix + if (warp_id == 0) + { + temp_storage.block_prefix = block_prefix_op(block_aggregate); + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + Sum scan_op; + output = scan_op(temp_storage.block_prefix, output); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/cub.cuh b/lib/kokkos/TPL/cub/cub.cuh new file mode 100644 index 000000000..dbb77da22 --- /dev/null +++ b/lib/kokkos/TPL/cub/cub.cuh @@ -0,0 +1,84 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_scan.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Host +#include "host/spinlock.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Util +#include "util_allocator.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" +#include "util_iterator.cuh" +#include "util_vector.cuh" + diff --git a/lib/kokkos/TPL/cub/device/block/block_histo_tiles.cuh b/lib/kokkos/TPL/cub/device/block/block_histo_tiles.cuh new file mode 100644 index 000000000..e1165d60c --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_histo_tiles.cuh @@ -0,0 +1,322 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockHistogramTiles implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. + */ + +#pragma once + +#include <iterator> + +#include "specializations/block_histo_tiles_gatomic.cuh" +#include "specializations/block_histo_tiles_satomic.cuh" +#include "specializations/block_histo_tiles_sort.cuh" +#include "../../util_type.cuh" +#include "../../grid/grid_mapping.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + +/** + * \brief BlockHistogramTilesAlgorithm enumerates alternative algorithms for BlockHistogramTiles. + */ +enum BlockHistogramTilesAlgorithm +{ + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT). + * -# A single thread block in the second kernel reduces them into the output histogram(s). + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + GRID_HISTO_SORT, + + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using shared-memory \p atomicAdd(). + * -# A single thread block in the second kernel reduces them into the + * output histogram(s). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + GRID_HISTO_SHARED_ATOMIC, + + + /** + * \par Overview + * A single-kernel approach in which thread blocks update the output histogram(s) directly + * using global-memory \p atomicAdd(). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. 
+ * + * Performance is not significantly impacted when computing histograms having large + * numbers of bins (e.g., thousands). + */ + GRID_HISTO_GLOBAL_ATOMIC, + +}; + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Tuning policy for BlockHistogramTiles + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + BlockHistogramTilesAlgorithm _GRID_ALGORITHM, + GridMappingStrategy _GRID_MAPPING, + int _SM_OCCUPANCY> +struct BlockHistogramTilesPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + SM_OCCUPANCY = _SM_OCCUPANCY, + }; + + static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = _GRID_ALGORITHM; + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * Implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics + */ +template < + typename BlockHistogramTilesPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1] + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct BlockHistogramTiles +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Histogram grid algorithm + static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = BlockHistogramTilesPolicy::GRID_ALGORITHM; + + // Alternative internal implementation types + typedef BlockHistogramTilesSort< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSortT; + typedef BlockHistogramTilesSharedAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSharedAtomicT; + typedef BlockHistogramTilesGlobalAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesGlobalAtomicT; + + // Internal block sweep histogram type + typedef typename If<(GRID_ALGORITHM == GRID_HISTO_SORT), + BlockHistogramTilesSortT, + typename If<(GRID_ALGORITHM == GRID_HISTO_SHARED_ATOMIC), + BlockHistogramTilesSharedAtomicT, + BlockHistogramTilesGlobalAtomicT>::Type>::Type InternalBlockDelegate; + + enum + { + TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS, + }; + + + // Temporary storage type + typedef typename InternalBlockDelegate::TempStorage TempStorage; + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Internal block delegate + InternalBlockDelegate internal_delegate; + + + 
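To make the GRID_HISTO_GLOBAL_ATOMIC variant described in the enum above concrete, the idea can be sketched as a plain CUDA kernel that updates the output histogram directly with atomicAdd(); this is only a sketch of the strategy, not the CUB implementation, and the kernel and parameter names are hypothetical.

// Sketch of the global-atomic histogram strategy only; names are hypothetical.
#include <cuda_runtime.h>

__global__ void GlobalAtomicHistogram(
    const unsigned char *d_samples,   // input samples, each in [0..BINS-1]
    unsigned int        *d_histo,     // pre-zeroed histogram of BINS counters
    int                  num_samples)
{
    // Grid-stride loop: each thread walks a strided range of the input
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < num_samples;
         i += gridDim.x * blockDim.x)
    {
        // Every update goes straight to the global bin counter; throughput is
        // governed by how well the hardware handles contended atomic additions
        atomicAdd(&d_histo[d_samples[i]], 1u);
    }
}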
//--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockHistogramTiles( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + internal_delegate(temp_storage, d_in, d_out_histograms) + {} + + + /** + * \brief Reduce a consecutive segment of input tiles + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob) ///< [in] Threadblock end offset (exclusive) + { + // Consume subsequent full tiles of input + while (block_offset + TILE_ITEMS <= block_oob) + { + internal_delegate.ConsumeTile<true>(block_offset); + block_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (block_offset < block_oob) + { + int valid_items = block_oob - block_offset; + internal_delegate.ConsumeTile<false>(block_offset, valid_items); + } + + // Aggregate output + internal_delegate.AggregateOutput(); + } + + + /** + * Reduce a consecutive segment of input tiles + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor + GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor + Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping + { + even_share.BlockInit(); + ConsumeTiles(even_share.block_offset, even_share.block_oob); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeTiles( + int num_items, ///< Total number of input items + GridQueue<SizeT> queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Shared block offset + __shared__ SizeT shared_block_offset; + + // We give each thread block at least one tile of input. 
+ SizeT block_offset = blockIdx.x * TILE_ITEMS; + SizeT even_share_base = gridDim.x * TILE_ITEMS; + + // Process full tiles of input + while (block_offset + TILE_ITEMS <= num_items) + { + internal_delegate.ConsumeTile<true>(block_offset); + + // Dequeue up to TILE_ITEMS + if (threadIdx.x == 0) + shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base; + + __syncthreads(); + + block_offset = shared_block_offset; + + __syncthreads(); + } + + // Consume a partially-full tile + if (block_offset < num_items) + { + int valid_items = num_items - block_offset; + internal_delegate.ConsumeTile<false>(block_offset, valid_items); + } + + // Aggregate output + internal_delegate.AggregateOutput(); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor + GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor + Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping + { + ConsumeTiles(num_items, queue); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/block_partition_tiles.cuh b/lib/kokkos/TPL/cub/device/block/block_partition_tiles.cuh new file mode 100644 index 000000000..4597773af --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_partition_tiles.cuh @@ -0,0 +1,381 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning. 
+ */ + +#pragma once + +#include <iterator> + +#include "scan_tiles_types.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_store.cuh" +#include "../../block/block_scan.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_vector.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Tuning policy for BlockPartitionTiles + */ +template < + int _PARTITIONS, + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + PtxLoadModifier _LOAD_MODIFIER, + BlockScanAlgorithm _SCAN_ALGORITHM> +struct BlockPartitionTilesPolicy +{ + enum + { + PARTITIONS = _PARTITIONS, + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; + + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; + + + +/** + * Tuple type for scanning partition membership flags + */ +template < + typename SizeT, + int PARTITIONS> +struct PartitionScanTuple; + + +/** + * Tuple type for scanning partition membership flags (specialized for 1 output partition) + */ +template <typename SizeT> +struct PartitionScanTuple<SizeT, 1> : VectorHelper<SizeT, 1>::Type +{ + __device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other) + { + PartitionScanTuple retval; + retval.x = x + other.x; + return retval; + } + + template <typename PredicateOp, typename T> + __device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val) + { + this->x = pred_op(val); + } + + template <typename PredicateOp, typename T, typename OutputIteratorRA, SizeT num_items> + __device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items) + { + if (pred_op(val)) + d_out[this->x - 1] = val; + } + +}; + + +/** + * Tuple type for scanning partition membership flags (specialized for 2 output partitions) + */ +template <typename SizeT> +struct PartitionScanTuple<SizeT, 2> : VectorHelper<SizeT, 2>::Type +{ + __device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other) + { + PartitionScanTuple retval; + retval.x = x + other.x; + retval.y = y + other.y; + return retval; + } + + template <typename PredicateOp, typename T> + __device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val) + { + bool pred = pred_op(val); + this->x = pred; + this->y = !pred; + } + + template <typename PredicateOp, typename T, typename OutputIteratorRA, SizeT num_items> + __device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items) + { + SizeT scatter_offset = (pred_op(val)) ? + this->x - 1 : + num_items - this->y; + + d_out[scatter_offset] = val; + } +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning. + * + * Implements a single-pass "domino" strategy with adaptive prefix lookback. 
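The Scatter() members of the two-partition tuple above compact selected items to the front of the output and rejected items to the back, using the inclusive flag counts as offsets. A sequential host-side sketch of the same offset arithmetic (function and variable names hypothetical) may help:

// Sequential sketch of the two-way partition scatter rule; names are hypothetical.
#include <vector>
#include <cstdio>

template <typename T, typename Pred>
void PartitionSequential(const std::vector<T> &in, std::vector<T> &out, Pred pred)
{
    out.resize(in.size());
    size_t x = 0;  // running (inclusive) count of selected items
    size_t y = 0;  // running (inclusive) count of rejected items
    for (size_t i = 0; i < in.size(); ++i)
    {
        if (pred(in[i])) { ++x; out[x - 1] = in[i]; }            // pack to the front
        else             { ++y; out[in.size() - y] = in[i]; }    // pack from the back
    }
}

int main()
{
    std::vector<int> in = {3, 8, 1, 9, 4}, out;
    PartitionSequential(in, out, [](int v) { return v < 5; });
    for (int v : out) std::printf("%d ", v);   // 3 1 4 9 8 (rejects land in reverse order)
    return 0;
}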
+ */ +template < + typename BlockPartitionTilesPolicy, ///< Tuning policy + typename InputIteratorRA, ///< Input iterator type + typename OutputIteratorRA, ///< Output iterator type + typename PredicateOp, ///< Partition predicate functor type + typename SizeT> ///< Offset integer type +struct BlockPartitionTiles +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Constants + enum + { + PARTITIONS = BlockPartitionTilesPolicy::PARTITIONS, + BLOCK_THREADS = BlockPartitionTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockPartitionTilesPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Load modifier + static const PtxLoadModifier LOAD_MODIFIER = BlockPartitionTilesPolicy::LOAD_MODIFIER; + + // Data type of input iterator + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Tuple type for scanning partition membership flags + typedef PartitionScanTuple<SizeT, PARTITIONS> PartitionScanTuple; + + // Tile status descriptor type + typedef ScanTileDescriptor<PartitionScanTuple> ScanTileDescriptorT; + + // Block scan type for scanning membership flag scan_tuples + typedef BlockScan< + PartitionScanTuple, + BlockPartitionTilesPolicy::BLOCK_THREADS, + BlockPartitionTilesPolicy::SCAN_ALGORITHM> BlockScanT; + + // Callback type for obtaining inter-tile prefix during block scan + typedef DeviceScanBlockPrefixOp<PartitionScanTuple, Sum> InterblockPrefixOp; + + // Shared memory type for this threadblock + struct TempStorage + { + typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + SizeT tile_idx; // Shared tile index + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + TempStorage &temp_storage; ///< Reference to temp_storage + InputIteratorRA d_in; ///< Input data + OutputIteratorRA d_out; ///< Output data + ScanTileDescriptorT *d_tile_status; ///< Global list of tile status + PredicateOp pred_op; ///< Unary predicate operator indicating membership in the first partition + SizeT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockPartitionTiles( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data + OutputIteratorRA d_out, ///< Output data + ScanTileDescriptorT *d_tile_status, ///< Global list of tile status + PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition + SizeT num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + d_tile_status(d_tile_status), + pred_op(pred_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Domino scan + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + int tile_idx, ///< Tile index + SizeT block_offset, ///< Tile offset + PartitionScanTuple &partition_ends) ///< Running total 
+ { + T items[ITEMS_PER_THREAD]; + PartitionScanTuple scan_tuples[ITEMS_PER_THREAD]; + + // Load items + int valid_items = num_items - block_offset; + if (FULL_TILE) + LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items); + else + LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items); + + // Prevent hoisting +// __syncthreads(); +// __threadfence_block(); + + // Set partition membership flags in scan scan_tuples + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_tuples[ITEM].SetFlags(pred_op, items[ITEM]); + } + + // Perform inclusive scan over scan scan_tuples + PartitionScanTuple block_aggregate; + if (tile_idx == 0) + { + BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate); + partition_ends = block_aggregate; + + // Update tile status if there are successor tiles + if (FULL_TILE && (threadIdx.x == 0)) + ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate); + } + else + { + InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, Sum(), tile_idx); + BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate, prefix_op); + partition_ends = prefix_op.inclusive_prefix; + } + + // Scatter items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Scatter if not out-of-bounds + if (FULL_TILE || (threadIdx.x + (ITEM * BLOCK_THREADS) < valid_items)) + { + scan_tuples[ITEM].Scatter(pred_op, items[ITEM], d_out, num_items); + } + } + } + + + /** + * Dequeue and scan tiles of items as part of a domino scan + */ + __device__ __forceinline__ void ConsumeTiles( + GridQueue<int> queue, ///< [in] Queue descriptor for assigning tiles of work to thread blocks + SizeT num_tiles, ///< [in] Total number of input tiles + PartitionScanTuple &partition_ends, ///< [out] Running partition end offsets + bool &is_last_tile) ///< [out] Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input) + { +#if CUB_PTX_ARCH < 200 + + // No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks) + int tile_idx = blockIdx.x; + SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; + + if (block_offset + TILE_ITEMS <= num_items) + { + ConsumeTile<true>(tile_idx, block_offset, partition_ends); + } + else if (block_offset < num_items) + { + ConsumeTile<false>(tile_idx, block_offset, partition_ends); + } + is_last_tile = (tile_idx == num_tiles - 1); + +#else + + // Get first tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; + + while (block_offset + TILE_ITEMS <= num_items) + { + // Consume full tile + ConsumeTile<true>(tile_idx, block_offset, partition_ends); + is_last_tile = (tile_idx == num_tiles - 1); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = SizeT(TILE_ITEMS) * tile_idx; + } + + // Consume a partially-full tile + if (block_offset < num_items) + { + ConsumeTile<false>(tile_idx, block_offset, partition_ends); + is_last_tile = (tile_idx == num_tiles - 1); + } +#endif + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh 
b/lib/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh new file mode 100644 index 000000000..91d628e00 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh @@ -0,0 +1,713 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRadixSortDownsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep. 
+ */ + + +#pragma once + +#include "../../thread/thread_load.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_store.cuh" +#include "../../block/block_radix_rank.cuh" +#include "../../block/block_exchange.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Types of scattering strategies + */ +enum RadixSortScatterAlgorithm +{ + RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins + RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins +}; + + +/** + * Tuning policy for BlockRadixSortDownsweepTiles + */ +template < + int _BLOCK_THREADS, ///< The number of threads per CTA + int _ITEMS_PER_THREAD, ///< The number of consecutive downsweep keys to process per thread + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + PtxLoadModifier _LOAD_MODIFIER, ///< The PTX cache-modifier to use for loads + bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The cub::BlockScanAlgorithm algorithm to use + RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use + cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRadixSortDownsweepTilesPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, + RADIX_BITS = _RADIX_BITS, + MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; + static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; + + typedef BlockRadixSortDownsweepTilesPolicy< + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + LOAD_MODIFIER, + EXCHANGE_TIME_SLICING, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SCATTER_ALGORITHM, + SMEM_CONFIG, + CUB_MAX(1, RADIX_BITS - 1)> AltPolicy; +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * CTA-wide "downsweep" abstraction for distributing keys from + * a range of input tiles. 
+ */ +template < + typename BlockRadixSortDownsweepTilesPolicy, + typename Key, + typename Value, + typename SizeT> +struct BlockRadixSortDownsweepTiles +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of Key + typedef typename Traits<Key>::UnsignedBits UnsignedBits; + + static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY; + static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::LOAD_ALGORITHM; + static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortDownsweepTilesPolicy::LOAD_MODIFIER; + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::INNER_SCAN_ALGORITHM; + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::SCATTER_ALGORITHM; + static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortDownsweepTilesPolicy::SMEM_CONFIG; + + enum + { + BLOCK_THREADS = BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRadixSortDownsweepTilesPolicy::ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING = BlockRadixSortDownsweepTilesPolicy::EXCHANGE_TIME_SLICING, + RADIX_BITS = BlockRadixSortDownsweepTilesPolicy::RADIX_BITS, + MEMOIZE_OUTER_SCAN = BlockRadixSortDownsweepTilesPolicy::MEMOIZE_OUTER_SCAN, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals<Value, NullType>::VALUE, + + WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_SIZET = sizeof(SizeT), + LOG_BYTES_PER_SIZET = Log2<BYTES_PER_SIZET>::VALUE, + + LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS, + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS, + SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS, + + LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS, + STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS, + }; + + // BlockRadixRank type + typedef BlockRadixRank< + BLOCK_THREADS, + RADIX_BITS, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG> BlockRadixRank; + + // BlockLoad type (keys) + typedef BlockLoad< + UnsignedBits*, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + LOAD_MODIFIER, + EXCHANGE_TIME_SLICING> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + Value*, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + LOAD_MODIFIER, + EXCHANGE_TIME_SLICING> BlockLoadValues; + + // BlockExchange type (keys) + typedef BlockExchange< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeKeys; + + // BlockExchange type (values) + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeValues; + + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + SizeT relative_bin_offsets[RADIX_DIGITS + 1]; + bool short_circuit; + + union + { + typename BlockRadixRank::TempStorage ranking; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + 
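Throughout the downsweep below, the "current digit" of a key is obtained by extracting RADIX_BITS bits starting at current_bit (BFE on the device), which is equivalent to the shift-and-mask shown in this small host-side sketch (names hypothetical):

// Shift-and-mask digit extraction; equivalent to BFE(key, current_bit, radix_bits).
#include <cstdio>

static unsigned int ExtractDigit(unsigned int key, int current_bit, int radix_bits)
{
    return (key >> current_bit) & ((1u << radix_bits) - 1u);
}

int main()
{
    unsigned int key = 0xBEEFu;
    for (int bit = 0; bit < 16; bit += 4)              // 4-bit digits, least-significant first
        std::printf("digit at bit %2d = 0x%X\n", bit, ExtractDigit(key, bit, 4));
    // prints 0xF, 0xE, 0xE, 0xB
    return 0;
}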
//--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + UnsignedBits *d_keys_in; + UnsignedBits *d_keys_out; + Value *d_values_in; + Value *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + SizeT bin_offset; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Whether to short-ciruit + bool short_circuit; + + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decodes given keys to lookup digit offsets in shared memory + */ + __device__ __forceinline__ void DecodeRelativeBinOffsets( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, RADIX_BITS); + + // Lookup base digit offset from shared memory + relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit]; + } + } + + + /** + * Scatter ranked items to global memory + */ + template <bool FULL_TILE, typename T> + __device__ __forceinline__ void ScatterItems( + T (&items)[ITEMS_PER_THREAD], + int (&local_ranks)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + T *d_out, + SizeT valid_items) + { + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Scatter if not out-of-bounds + if (FULL_TILE || (local_ranks[ITEM] < valid_items)) + { + d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter ranked keys directly to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + SizeT valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm) + { + // Compute scatter offsets + DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets); + + // Untwiddle keys before outputting + UnsignedBits keys[ITEMS_PER_THREAD]; + + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]); + } + + // Scatter to global + ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items); + } + + + /** + * Scatter ranked keys through shared memory, then to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + SizeT valid_items, + Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm) + { + // Exchange keys through shared memory + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterKeys<FULL_TILE>( + twiddled_keys, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT>()); + } + + + /** + * Scatter ranked values 
directly to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + SizeT valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm) + { + // Scatter to global + ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items); + } + + + /** + * Scatter ranked values through shared memory, then to global memory + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + SizeT valid_items, + Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm) + { + __syncthreads(); + + // Exchange keys through shared memory + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterValues<FULL_TILE>( + values, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type<RADIX_SORT_SCATTER_DIRECT>()); + } + + + /** + * Load a tile of items (specialized for full tile) + */ + template <typename BlockLoadT, typename T> + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + T *d_in, + SizeT valid_items, + Int2Type<true> is_full_tile) + { + block_loader.Load(d_in, items); + } + + + /** + * Load a tile of items (specialized for partial tile) + */ + template <typename BlockLoadT, typename T> + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + T *d_in, + SizeT valid_items, + Int2Type<false> is_full_tile) + { + block_loader.Load(d_in, items, valid_items); + } + + + /** + * Truck along associated values + */ + template <bool FULL_TILE, typename _Value> + __device__ __forceinline__ void GatherScatterValues( + _Value (&values)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + SizeT block_offset, + SizeT valid_items) + { + BlockLoadValues loader(temp_storage.load_values); + LoadItems( + loader, + values, + d_values_in + block_offset, + valid_items, + Int2Type<FULL_TILE>()); + + ScatterValues<FULL_TILE>( + values, + relative_bin_offsets, + ranks, + valid_items, + Int2Type<SCATTER_ALGORITHM>()); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template <bool FULL_TILE> + __device__ __forceinline__ void GatherScatterValues( + NullType (&values)[ITEMS_PER_THREAD], + SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + SizeT block_offset, + SizeT valid_items) + {} + + + /** + * Process tile + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ProcessTile( + SizeT block_offset, + const SizeT &valid_items = TILE_ITEMS) + { + // Per-thread tile data + UnsignedBits keys[ITEMS_PER_THREAD]; // Keys + UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys + int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA + SizeT relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit + + if (LOAD_ALGORITHM != BLOCK_LOAD_DIRECT) __syncthreads(); + + // Assign max-key to all keys + #pragma unroll + for (int ITEM = 0; 
ITEM < ITEMS_PER_THREAD; ++ITEM) + { + keys[ITEM] = MAX_KEY; + } + + // Load tile of keys + BlockLoadKeys loader(temp_storage.load_keys); + LoadItems( + loader, + keys, + d_keys_in + block_offset, + valid_items, + Int2Type<FULL_TILE>()); + + __syncthreads(); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int inclusive_digit_prefix; + BlockRadixRank(temp_storage.ranking).RankKeys( + twiddled_keys, + ranks, + current_bit, + inclusive_digit_prefix); + + // Update global scatter base offsets for each digit + if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS)) + { + int exclusive_digit_prefix; + + // Get exclusive digit prefix from inclusive prefix +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1); + if (threadIdx.x == 0) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets); + exchange[threadIdx.x] = 0; + exchange[threadIdx.x + 1] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x]; +#endif + + bin_offset -= exclusive_digit_prefix; + temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset; + bin_offset += inclusive_digit_prefix; + } + + __syncthreads(); + + // Scatter keys + ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>()); + + // Gather/scatter values + Value values[ITEMS_PER_THREAD]; + GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items); + } + + + /** + * Copy tiles within the range of input + */ + template <typename T> + __device__ __forceinline__ void Copy( + T *d_in, + T *d_out, + SizeT block_offset, + SizeT block_oob) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_oob) + { + T items[ITEMS_PER_THREAD]; + + LoadStriped<LOAD_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items); + __syncthreads(); + StoreStriped<STORE_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_oob) + { + SizeT valid_items = block_oob - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadStriped<LOAD_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items); + __syncthreads(); + StoreStriped<STORE_DEFAULT, BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + __device__ __forceinline__ void Copy( + NullType *d_in, + NullType *d_out, + SizeT block_offset, + SizeT block_oob) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRadixSortDownsweepTiles( + TempStorage &temp_storage, + SizeT bin_offset, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit) + : + temp_storage(temp_storage.Alias()), + bin_offset(bin_offset), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + short_circuit(false) + {} + + + /** + * Constructor + */ + __device__ 
__forceinline__ BlockRadixSortDownsweepTiles( + TempStorage &temp_storage, + SizeT num_items, + SizeT *d_spine, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit) + { + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if (threadIdx.x < RADIX_DIGITS) + { + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + SizeT first_block_bin_offset = d_spine[gridDim.x * threadIdx.x]; + int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + this->temp_storage.short_circuit = WarpAll(predicate); + + // Load my block's bin offset for my bin + bin_offset = d_spine[(gridDim.x * threadIdx.x) + blockIdx.x]; + } + + __syncthreads(); + + short_circuit = this->temp_storage.short_circuit; + } + + + /** + * Distribute keys from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessTiles( + SizeT block_offset, + const SizeT &block_oob) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_oob); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_oob); + } + else + { + // Process full tiles of tile_items + while (block_offset + TILE_ITEMS <= block_oob) + { + ProcessTile<true>(block_offset); + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_oob) + { + ProcessTile<false>(block_offset, block_oob - block_offset); + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh b/lib/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh new file mode 100644 index 000000000..22f8c9c75 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh @@ -0,0 +1,464 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep. + */ + +#pragma once + +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_load.cuh" +#include "../../block/block_load.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Tuning policy for BlockRadixSortUpsweepTiles + */ +template < + int _BLOCK_THREADS, ///< The number of threads per CTA + int _ITEMS_PER_THREAD, ///< The number of items to load per thread per tile + PtxLoadModifier _LOAD_MODIFIER, ///< Load cache-modifier + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRadixSortUpsweepTilesPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + RADIX_BITS = _RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + + typedef BlockRadixSortUpsweepTilesPolicy< + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_MODIFIER, + CUB_MAX(1, RADIX_BITS - 1)> AltPolicy; +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep. + * + * Computes radix digit histograms over a range of input tiles. 
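As a rough sketch of the upsweep idea (each thread block histograms the current radix digit over its range of keys and writes one count per digit to a per-block spine), a simplified CUDA kernel using plain 32-bit shared-memory counters might look as follows. It assumes the digit-major spine layout consumed by the downsweep constructor and does not use the packed 8-bit counter scheme implemented below; kernel and parameter names are hypothetical.

// Simplified upsweep sketch with 32-bit shared counters; names are hypothetical.
#include <cuda_runtime.h>

template <int BLOCK_THREADS, int RADIX_BITS>
__global__ void UpsweepDigitHistogram(
    const unsigned int *d_keys,       // keys for the current pass
    unsigned int       *d_spine,      // gridDim.x counts per digit, digit-major layout
    int                 num_items,
    int                 current_bit)  // least-significant bit of the digit to count
{
    enum { RADIX_DIGITS = 1 << RADIX_BITS };

    __shared__ unsigned int smem_counts[RADIX_DIGITS];

    // Zero the shared digit counters
    for (int d = threadIdx.x; d < RADIX_DIGITS; d += BLOCK_THREADS)
        smem_counts[d] = 0;
    __syncthreads();

    // Count digits over this block's strided slice of the input
    for (int i = blockIdx.x * BLOCK_THREADS + threadIdx.x;
         i < num_items;
         i += gridDim.x * BLOCK_THREADS)
    {
        unsigned int digit = (d_keys[i] >> current_bit) & (RADIX_DIGITS - 1);
        atomicAdd(&smem_counts[digit], 1u);
    }
    __syncthreads();

    // Publish this block's counts: spine entry [digit * gridDim.x + blockIdx.x]
    for (int d = threadIdx.x; d < RADIX_DIGITS; d += BLOCK_THREADS)
        d_spine[d * gridDim.x + blockIdx.x] = smem_counts[d];
}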
+ */ +template < + typename BlockRadixSortUpsweepTilesPolicy, + typename Key, + typename SizeT> +struct BlockRadixSortUpsweepTiles +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits<Key>::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepTilesPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = BlockRadixSortUpsweepTilesPolicy::RADIX_BITS, + BLOCK_THREADS = BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = BlockRadixSortUpsweepTilesPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. + + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + union + { + DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS]; + SizeT digit_partials[RADIX_DIGITS][WARP_THREADS + 1]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + SizeT local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + UnsignedBits *d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template <int COUNT, int MAX> + struct Iterate + { + enum { + HALF = (MAX / 2), + }; + + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + BlockRadixSortUpsweepTiles &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys); + } + + // ProcessTiles + static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT 
block_offset) + { + // Next + Iterate<1, HALF>::ProcessTiles(cta, block_offset); + Iterate<1, MAX - HALF>::ProcessTiles(cta, block_offset + (HALF * TILE_ITEMS)); + } + }; + + // Terminate + template <int MAX> + struct Iterate<MAX, MAX> + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweepTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {} + + // ProcessTiles + static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset) + { + cta.ProcessFullTile(block_offset); + } + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits<Key>::TwiddleIn(key); + + // Add in sub-counter offset + UnsignedBits sub_counter = BFE(converted_key, current_bit, LOG_PACKING_RATIO); + + // Add in row offset + UnsignedBits row_offset = BFE(converted_key, current_bit + LOG_PACKING_RATIO, LOG_COUNTER_LANES); + + // Increment counter + temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++; + + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + SizeT counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Places unpacked counters into smem for final digit reduction + */ + __device__ __forceinline__ void ReduceUnpackedCounts(SizeT &bin_count) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + __syncthreads(); + + // 
Rake-reduce bin_count reductions + if (threadIdx.x < RADIX_DIGITS) + { + bin_count = ThreadReduce<WARP_THREADS>( + temp_storage.digit_partials[threadIdx.x], + Sum()); + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(SizeT block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting +// __threadfence_block(); +// __syncthreads(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + SizeT block_offset, + const SizeT &block_oob) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_oob) + { + // Load and bucket key + UnsignedBits key = ThreadLoad<LOAD_MODIFIER>(d_keys_in + block_offset); + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRadixSortUpsweepTiles( + TempStorage &temp_storage, + Key *d_keys_in, + int current_bit) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)), + current_bit(current_bit) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessTiles( + SizeT block_offset, + const SizeT &block_oob, + SizeT &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_oob) + { + Iterate<0, UNROLL_COUNT>::ProcessTiles(*this, block_offset); + block_offset += UNROLLED_ELEMENTS; + + __syncthreads(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + __syncthreads(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_oob) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_oob); + + __syncthreads(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + + __syncthreads(); + + // Final raking reduction of counts by bin + ReduceUnpackedCounts(bin_count); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh b/lib/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh new file mode 100644 index 000000000..99e1980b6 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh @@ -0,0 +1,399 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceByKeyiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include <iterator> + +#include "scan_tiles_types.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../block/block_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Utility data types + ******************************************************************************/ + +/// Scan tuple data type for reduce-value-by-key +template <typename Value, typename SizeT> +struct ReduceByKeyuple +{ + Value value; // Initially set as value, contains segment aggregate after prefix scan + SizeT flag; // Initially set as a tail flag, contains scatter offset after prefix scan +}; + + +/// Binary reduce-by-key scan operator +template <typename ReductionOp> +struct ReduceByKeyScanOp +{ + /// Reduction functor + ReductionOp reduction_op; + + /// Constructor + ReduceByKeyScanOp(ReductionOp reduction_op) : reduction_op(reduction_op) + {} + + /// Binary scan operator + template <typename ReduceByKeyuple> + __device__ __forceinline__ ReduceByKeyuple operator()( + const ReduceByKeyuple &first, + const ReduceByKeyuple &second) + { + ReduceByKeyuple retval; + retval.val = (second.flag) ? 
second.val : reduction_op(first.val, second.val); + retval.flag = first.flag + second.flag; + return retval; + } +}; + + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Tuning policy for BlockReduceByKeyiles + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + BlockLoadAlgorithm _LOAD_ALGORITHM, + bool _LOAD_WARP_TIME_SLICING, + PtxLoadModifier _LOAD_MODIFIER, + BlockScanAlgorithm _SCAN_ALGORITHM> +struct BlockReduceByKeyilesPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockReduceByKeyiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan. + */ +template < + typename BlockReduceByKeyilesPolicy, ///< Tuning policy + typename KeyInputIteratorRA, ///< Random-access input iterator type for keys + typename KeyOutputIteratorRA, ///< Random-access output iterator type for keys + typename ValueInputIteratorRA, ///< Random-access input iterator type for values + typename ValueOutputIteratorRA, ///< Random-access output iterator type for values + typename ReductionOp, ///< Reduction functor type + typename SizeT> ///< Offset integer type +struct BlockReduceByKeyiles +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data types of input iterators + typedef typename std::iterator_traits<KeyInputIteratorRA>::value_type Key; // Key data type + typedef typename std::iterator_traits<ValueInputIteratorRA>::value_type Value; // Value data type + + // Constants + enum + { + BLOCK_THREADS = BlockReduceByKeyilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + STATUS_PADDING = PtxArchProps::WARP_THREADS, + }; + + // Block load type for keys + typedef BlockLoad< + KeyInputIteratorRA, + BlockReduceByKeyilesPolicy::BLOCK_THREADS, + BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD, + BlockReduceByKeyilesPolicy::LOAD_ALGORITHM, + BlockReduceByKeyilesPolicy::LOAD_MODIFIER, + BlockReduceByKeyilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadKeys; + + // Block load type for values + typedef BlockLoad< + ValueInputIteratorRA, + BlockReduceByKeyilesPolicy::BLOCK_THREADS, + BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD, + BlockReduceByKeyilesPolicy::LOAD_ALGORITHM, + BlockReduceByKeyilesPolicy::LOAD_MODIFIER, + BlockReduceByKeyilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadValues; + + // Block discontinuity type for setting tail flags + typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys; + + // Scan tuple type + typedef ReduceByKeyuple<Value, SizeT> ScanTuple; + + // Tile status descriptor type + typedef ScanTileDescriptor<ScanTuple> ScanTileDescriptorT; + + // Block scan functor type + typedef ReduceByKeyScanOp<ReductionOp> ScanOp; + + // Block scan prefix callback 
type + typedef DeviceScanBlockPrefixOp<ScanTuple, ScanOp> PrefixCallback; + + // Block scan type + typedef BlockScan< + ScanTuple, + BlockReduceByKeyilesPolicy::BLOCK_THREADS, + BlockReduceByKeyilesPolicy::SCAN_ALGORITHM> BlockScanT; + + /// Shared memory type for this threadblock + struct _TempStorage + { + union + { + typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading tiles of keys + typename BlockLoadValues::TempStorage load_values; // Smem needed for loading tiles of values + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename PrefixCallback::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + }; + + typename BlockDiscontinuityKeys::TempStorage flagging; // Smem needed for tile scanning + SizeT tile_idx; // Shared tile index + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + KeyInputIteratorRA d_keys_in; ///< Key input data + KeyOutputIteratorRA d_keys_out; ///< Key output data + ValueInputIteratorRA d_values_in; ///< Value input data + ValueOutputIteratorRA d_values_out; ///< Value output data + ScanTileDescriptorT *d_tile_status; ///< Global list of tile status + ScanOp scan_op; ///< Binary scan operator + int num_tiles; ///< Total number of input tiles for the entire problem + SizeT num_items; ///< Total number of scan items for the entire problem + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockReduceByKeyiles( + TempStorage &temp_storage, ///< Reference to temp_storage + KeyInputIteratorRA d_keys_in, ///< Key input data + KeyOutputIteratorRA d_keys_out, ///< Key output data + ValueInputIteratorRA d_values_in, ///< Value input data + ValueOutputIteratorRA d_values_out, ///< Value output data + ScanTileDescriptorT *d_tile_status, ///< Global list of tile status + ReductionOp reduction_op, ///< Binary scan operator + int num_tiles, ///< Total number of input tiles for the entire problem + SizeT num_items) ///< Total number of scan items for the entire problem + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_keys_out(d_keys_out), + d_values_in(d_values_in), + d_values_out(d_values_out), + d_tile_status(d_tile_status), + scan_op(reduction_op), + num_tiles(num_tiles), + num_items(num_items) + {} + + + /** + * Process a tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + int tile_idx, ///< Tile index + SizeT block_offset, ///< Tile offset + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + Key keys[ITEMS_PER_THREAD]; + Value values[ITEMS_PER_THREAD]; + int tail_flags[ITEMS_PER_THREAD]; + ScanTuple scan_tuples[ITEMS_PER_THREAD]; + + // Load keys + if (FULL_TILE) + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + else + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items); + + // Set tail flags + if (tile_idx == num_tiles - 1) + { + // Last tile + BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality()); + } + else + { + // Preceding tiles require the 
first element of the next tile + Key tile_suffix_item; + if (threadIdx.x == 0) + tile_suffix_item = d_keys_in[block_offset + TILE_ITEMS]; + + BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality(), tile_suffix_item); + } + + __syncthreads(); + + // Load values + if (FULL_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); + + // Assemble scan tuples + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_tuples[ITEM].value = values[ITEM]; + scan_tuples[ITEM].flag = tail_flags[ITEM]; + } + + __syncthreads(); + + // Perform inclusive prefix scan + ScanTuple block_aggregate; + if (tile_idx == 0) + { + // Without prefix callback + BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate); + + // Update tile status + if (threadIdx.x == 0) + ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate); + } + else + { + // With prefix callback + PrefixCallback prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate, prefix_op); + } + + // Scatter flagged keys and values to output + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int tile_item = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + + // Set the head flag on the last item in a partially-full tile + if (!FULL_TILE && (tile_item == valid_items - 1)) + tail_flags[ITEM] = 1; + + // Decrement scatter offset + scan_tuples[ITEM].flag--; + + // Scatter key and aggregate value if flagged and in range + if ((FULL_TILE || (tile_item < valid_items)) && (tail_flags[ITEM])) + { + d_keys_out[scan_tuples[ITEM].flag] = keys[ITEM]; + d_values_out[scan_tuples[ITEM].flag] = scan_tuples[ITEM].value; + } + } + } + + + + /** + * Dequeue and scan tiles of elements + */ + __device__ __forceinline__ void ProcessTiles(GridQueue<int> queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // We give each thread block at least one tile of input + int tile_idx = blockIdx.x; + + // Consume full tiles of input + SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; + while (block_offset + TILE_ITEMS <= num_items) + { + ConsumeTile<true>(tile_idx, block_offset); + + // Get next tile +#if CUB_PTX_ARCH < 200 + // No concurrent kernels allowed, so just stripe tiles + tile_idx += gridDim.x; +#else + // Concurrent kernels are allowed, so we must only use active blocks to dequeue tile indices + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1) + gridDim.x; + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; +#endif + block_offset = SizeT(TILE_ITEMS) * tile_idx; + } + + // Consume a partially-full tile + if (block_offset < num_items) + { + // Consume a partially-full tile + int valid_items = num_items - block_offset; + ConsumeTile<false>(tile_idx, block_offset, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh b/lib/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh new file mode 100644 index 000000000..a83c098ae --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction. 
+ */ + +#pragma once + +#include <iterator> + +#include "../../block/block_load.cuh" +#include "../../block/block_reduce.cuh" +#include "../../grid/grid_mapping.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_vector.cuh" +#include "../../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Tuning policy for BlockReduceTiles + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread per tile of input + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + PtxLoadModifier _LOAD_MODIFIER, ///< PTX load modifier + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockReduceTilesPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction. + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. 
+ */ +template < + typename BlockReduceTilesPolicy, + typename InputIteratorRA, + typename SizeT, + typename ReductionOp> +struct BlockReduceTiles +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; // Type of input iterator + typedef VectorHelper<T, BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH> VecHelper; // Helper type for vectorizing loads of T + typedef typename VecHelper::Type VectorT; // Vector of T + + // Constants + enum + { + BLOCK_THREADS = BlockReduceTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockReduceTilesPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH, + + // Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive + CAN_VECTORIZE = (BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH > 1) && + (IsPointer<InputIteratorRA>::VALUE) && + (VecHelper::BUILT_IN), + + }; + + static const PtxLoadModifier LOAD_MODIFIER = BlockReduceTilesPolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockReduceTilesPolicy::BLOCK_ALGORITHM; + + // Parameterized BlockReduce primitive + typedef BlockReduce<T, BLOCK_THREADS, BlockReduceTilesPolicy::BLOCK_ALGORITHM> BlockReduceT; + + /// Shared memory type required by this thread block + typedef typename BlockReduceT::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + T thread_aggregate; ///< Each thread's partial reduction + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIteratorRA d_in; ///< Input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + int first_tile_size; ///< Size of first tile consumed + bool input_aligned; ///< Whether or not input is vector-aligned + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockReduceTiles( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + reduction_op(reduction_op), + first_tile_size(0), + input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)) + {} + + + /** + * Process a single tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + SizeT block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + T stripe_partial; + + // Load full tile + if (input_aligned) + { + // Alias items as an array of VectorT and load it in striped fashion + enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; + + VectorT vec_items[WORDS]; + + // Load striped into vec items + VectorT* alias_ptr = reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)); + + #pragma unroll + for (int i = 0; i < WORDS; ++i) + vec_items[i] = alias_ptr[BLOCK_THREADS * 
i]; + + // Reduce items within each thread stripe + stripe_partial = ThreadReduce<ITEMS_PER_THREAD>( + reinterpret_cast<T*>(vec_items), + reduction_op); + } + else + { + T items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadStriped<LOAD_MODIFIER, BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items); + + // Reduce items within each thread stripe + stripe_partial = ThreadReduce(items, reduction_op); + } + + // Update running thread aggregate + thread_aggregate = (first_tile_size) ? + reduction_op(thread_aggregate, stripe_partial) : // Update + stripe_partial; // Assign + } + else + { + + // Partial tile + int thread_offset = threadIdx.x; + + if (!first_tile_size && (thread_offset < valid_items)) + { + // Assign thread_aggregate + thread_aggregate = ThreadLoad<LOAD_MODIFIER>(d_in + block_offset + thread_offset); + thread_offset += BLOCK_THREADS; + } + + while (thread_offset < valid_items) + { + // Update thread aggregate + T item = ThreadLoad<LOAD_MODIFIER>(d_in + block_offset + thread_offset); + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + // Set first tile size if necessary + if (!first_tile_size) + first_tile_size = valid_items; + } + + + //--------------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob, ///< [in] Threadblock end offset (exclusive) + T &block_aggregate) ///< [out] Running total + { + // Consume subsequent full tiles of input + while (block_offset + TILE_ITEMS <= block_oob) + { + ConsumeTile<true>(block_offset); + block_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (block_offset < block_oob) + { + int valid_items = block_oob - block_offset; + ConsumeTile<false>(block_offset, valid_items); + } + + // Compute block-wide reduction + block_aggregate = (first_tile_size < TILE_ITEMS) ? 
+ BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) : + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor + GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor + T &block_aggregate, ///< [out] Running total + Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping + { + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + // Consume input tiles + ConsumeTiles(even_share.block_offset, even_share.block_oob, block_aggregate); + } + + + //--------------------------------------------------------------------- + // Dynamically consume tiles + //--------------------------------------------------------------------- + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeTiles( + int num_items, ///< Total number of input items + GridQueue<SizeT> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + T &block_aggregate) ///< [out] Running total + { + // Shared dequeue offset + __shared__ SizeT dequeue_offset; + + // We give each thread block at least one tile of input. + SizeT block_offset = blockIdx.x * TILE_ITEMS; + SizeT even_share_base = gridDim.x * TILE_ITEMS; + + if (block_offset + TILE_ITEMS <= num_items) + { + // Consume full tile of input + ConsumeTile<true>(block_offset); + + // Dequeue more tiles + while (true) + { + // Dequeue a tile of items + if (threadIdx.x == 0) + dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base; + + __syncthreads(); + + // Grab tile offset and check if we're done with full tiles + block_offset = dequeue_offset; + + __syncthreads(); + + if (block_offset + TILE_ITEMS > num_items) + break; + + // Consume a full tile + ConsumeTile<true>(block_offset); + } + } + + if (block_offset < num_items) + { + int valid_items = num_items - block_offset; + ConsumeTile<false>(block_offset, valid_items); + } + + // Compute block-wide reduction + block_aggregate = (first_tile_size < TILE_ITEMS) ? + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) : + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT num_items, ///< [in] Total number of global input items + GridEvenShare<SizeT> &even_share, ///< [in] GridEvenShare descriptor + GridQueue<SizeT> &queue, ///< [in,out] GridQueue descriptor + T &block_aggregate, ///< [out] Running total + Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping + { + ConsumeTiles(num_items, queue, block_aggregate); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/block_scan_tiles.cuh b/lib/kokkos/TPL/cub/device/block/block_scan_tiles.cuh new file mode 100644 index 000000000..980220480 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/block_scan_tiles.cuh @@ -0,0 +1,509 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan. + */ + +#pragma once + +#include <iterator> + +#include "scan_tiles_types.cuh" +#include "../../block/block_load.cuh" +#include "../../block/block_store.cuh" +#include "../../block/block_scan.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Tuning policy for BlockScanTiles + */ +template < + int _BLOCK_THREADS, + int _ITEMS_PER_THREAD, + BlockLoadAlgorithm _LOAD_ALGORITHM, + bool _LOAD_WARP_TIME_SLICING, + PtxLoadModifier _LOAD_MODIFIER, + BlockStoreAlgorithm _STORE_ALGORITHM, + bool _STORE_WARP_TIME_SLICING, + BlockScanAlgorithm _SCAN_ALGORITHM> +struct BlockScanTilesPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan. + * + * Implements a single-pass "domino" strategy with adaptive prefix lookback. 
+ */ +template < + typename BlockScanTilesPolicy, ///< Tuning policy + typename InputIteratorRA, ///< Input iterator type + typename OutputIteratorRA, ///< Output iterator type + typename ScanOp, ///< Scan functor type + typename Identity, ///< Identity element type (cub::NullType for inclusive scan) + typename SizeT> ///< Offset integer type +struct BlockScanTiles +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Constants + enum + { + INCLUSIVE = Equals<Identity, NullType>::VALUE, // Inclusive scan if no identity type is provided + BLOCK_THREADS = BlockScanTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockScanTilesPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Block load type + typedef BlockLoad< + InputIteratorRA, + BlockScanTilesPolicy::BLOCK_THREADS, + BlockScanTilesPolicy::ITEMS_PER_THREAD, + BlockScanTilesPolicy::LOAD_ALGORITHM, + BlockScanTilesPolicy::LOAD_MODIFIER, + BlockScanTilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadT; + + // Block store type + typedef BlockStore< + OutputIteratorRA, + BlockScanTilesPolicy::BLOCK_THREADS, + BlockScanTilesPolicy::ITEMS_PER_THREAD, + BlockScanTilesPolicy::STORE_ALGORITHM, + STORE_DEFAULT, + BlockScanTilesPolicy::STORE_WARP_TIME_SLICING> BlockStoreT; + + // Tile status descriptor type + typedef ScanTileDescriptor<T> ScanTileDescriptorT; + + // Block scan type + typedef BlockScan< + T, + BlockScanTilesPolicy::BLOCK_THREADS, + BlockScanTilesPolicy::SCAN_ALGORITHM> BlockScanT; + + // Callback type for obtaining inter-tile prefix during block scan + typedef DeviceScanBlockPrefixOp<T, ScanOp> InterblockPrefixOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + struct + { + typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + SizeT tile_idx; // Shared tile index + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + InputIteratorRA d_in; ///< Input data + OutputIteratorRA d_out; ///< Output data + ScanOp scan_op; ///< Binary scan operator + Identity identity; ///< Identity element + + + + //--------------------------------------------------------------------- + // Block scan utility methods (first tile) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization + */ + template <typename _ScanOp, typename _Identity> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate); + } + + /** + * Exclusive sum specialization + */ + template <typename _Identity> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, 
_Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate); + } + + /** + * Inclusive scan specialization + */ + template <typename _ScanOp> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + /** + * Inclusive sum specialization + */ + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate); + } + + //--------------------------------------------------------------------- + // Block scan utility methods (subsequent tiles) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (with prefix from predecessors) + */ + template <typename _ScanOp, typename _Identity, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Exclusive sum specialization (with prefix from predecessors) + */ + template <typename _Identity, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op); + } + + /** + * Inclusive scan specialization (with prefix from predecessors) + */ + template <typename _ScanOp, typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op); + } + + /** + * Inclusive sum specialization (with prefix from predecessors) + */ + template <typename PrefixCallback> + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op); + } + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockScanTiles( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data + OutputIteratorRA d_out, ///< Output data + ScanOp scan_op, ///< Binary scan operator + Identity identity) ///< Identity element + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + identity(identity) + {} + + + //--------------------------------------------------------------------- + // Domino scan + //--------------------------------------------------------------------- + + /** + * Process a tile of input (domino scan) + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + SizeT num_items, ///< Total number of input items + int tile_idx, ///< Tile index + SizeT block_offset, ///< Tile offset + ScanTileDescriptorT *d_tile_status) ///< Global list of tile status + { + // 
Load items + T items[ITEMS_PER_THREAD]; + + if (FULL_TILE) + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + else + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_items - block_offset); + + __syncthreads(); + + T block_aggregate; + if (tile_idx == 0) + { + ScanBlock(items, scan_op, identity, block_aggregate); + + // Update tile status if there are successor tiles + if (FULL_TILE && (threadIdx.x == 0)) + ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate); + } + else + { + InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx); + ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); + } + + __syncthreads(); + + // Store items + if (FULL_TILE) + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); + else + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_items - block_offset); + } + + /** + * Dequeue and scan tiles of items as part of a domino scan + */ + __device__ __forceinline__ void ConsumeTiles( + int num_items, ///< Total number of input items + GridQueue<int> queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileDescriptorT *d_tile_status) ///< Global list of tile status + { +#if CUB_PTX_ARCH < 200 + + // No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks) + int tile_idx = blockIdx.x; + SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; + + if (block_offset + TILE_ITEMS <= num_items) + ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status); + else if (block_offset < num_items) + ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status); + +#else + + // Get first tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; + + while (block_offset + TILE_ITEMS <= num_items) + { + // Consume full tile + ConsumeTile<true>(num_items, tile_idx, block_offset, d_tile_status); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = SizeT(TILE_ITEMS) * tile_idx; + } + + // Consume a partially-full tile + if (block_offset < num_items) + { + ConsumeTile<false>(num_items, tile_idx, block_offset, d_tile_status); + } +#endif + + } + + + //--------------------------------------------------------------------- + // Even-share scan + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template < + bool FULL_TILE, + bool FIRST_TILE> + __device__ __forceinline__ void ConsumeTile( + SizeT block_offset, ///< Tile offset + RunningBlockPrefixOp<T> &prefix_op, ///< Running prefix operator + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + // Load items + T items[ITEMS_PER_THREAD]; + + if (FULL_TILE) + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + else + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items); + + __syncthreads(); + + // Block scan + T block_aggregate; + if (FIRST_TILE) + { + ScanBlock(items, scan_op, identity, block_aggregate); + prefix_op.running_total = block_aggregate; + } + else + { + ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); + } + + __syncthreads(); + + // Store items + if (FULL_TILE) + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); + else 
+ BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items); + } + + + /** + * Scan a consecutive share of input tiles + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob) ///< [in] Threadblock end offset (exclusive) + { + RunningBlockPrefixOp<T> prefix_op; + + if (block_offset + TILE_ITEMS <= block_oob) + { + // Consume first tile of input (full) + ConsumeTile<true, true>(block_offset, prefix_op); + block_offset += TILE_ITEMS; + + // Consume subsequent full tiles of input + while (block_offset + TILE_ITEMS <= block_oob) + { + ConsumeTile<true, false>(block_offset, prefix_op); + block_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (block_offset < block_oob) + { + int valid_items = block_oob - block_offset; + ConsumeTile<false, false>(block_offset, prefix_op, valid_items); + } + } + else + { + // Consume the first tile of input (partially-full) + int valid_items = block_oob - block_offset; + ConsumeTile<false, true>(block_offset, prefix_op, valid_items); + } + } + + + /** + * Scan a consecutive share of input tiles, seeded with the specified prefix value + */ + __device__ __forceinline__ void ConsumeTiles( + SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) + SizeT block_oob, ///< [in] Threadblock end offset (exclusive) + T prefix) ///< [in] The prefix to apply to the scan segment + { + RunningBlockPrefixOp<T> prefix_op; + prefix_op.running_total = prefix; + + // Consume full tiles of input + while (block_offset + TILE_ITEMS <= block_oob) + { + ConsumeTile<true, false>(block_offset, prefix_op); + block_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (block_offset < block_oob) + { + int valid_items = block_oob - block_offset; + ConsumeTile<false, false>(block_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/scan_tiles_types.cuh b/lib/kokkos/TPL/cub/device/block/scan_tiles_types.cuh new file mode 100644 index 000000000..2b933d0af --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/scan_tiles_types.cuh @@ -0,0 +1,318 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Utility types for device-wide scan + */ + +#pragma once + +#include <iterator> + +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_PREFIX, // Inclusive tile prefix is available +}; + + +/** + * Data type of tile status descriptor. + * + * Specialized for scan status and value types that can be combined into the same + * machine word that can be read/written coherently in a single access. + */ +template < + typename T, + bool SINGLE_WORD = (PowerOfTwo<sizeof(T)>::VALUE && (sizeof(T) <= 8))> +struct ScanTileDescriptor +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Vector word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + short>::Type>::Type>::Type VectorWord; + + T value; + StatusWord status; + + static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix) + { + ScanTileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PREFIX; + tile_descriptor.value = prefix; + + VectorWord alias; + *reinterpret_cast<ScanTileDescriptor*>(&alias) = tile_descriptor; + ThreadStore<STORE_CG>(reinterpret_cast<VectorWord*>(ptr), alias); + } + + static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial) + { + ScanTileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = partial; + + VectorWord alias; + *reinterpret_cast<ScanTileDescriptor*>(&alias) = tile_descriptor; + ThreadStore<STORE_CG>(reinterpret_cast<VectorWord*>(ptr), alias); + } + + static __device__ __forceinline__ void WaitForValid( + ScanTileDescriptor *ptr, + int &status, + T &value) + { + ScanTileDescriptor tile_descriptor; + while (true) + { + VectorWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<VectorWord*>(ptr)); + + tile_descriptor = *reinterpret_cast<ScanTileDescriptor*>(&alias); + if (tile_descriptor.status != SCAN_TILE_INVALID) break; + + __threadfence_block(); + } + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + +/** + * Data type of tile status descriptor. + * + * Specialized for scan status and value types that cannot fused into + * the same machine word. 
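+ *
+ * In this case the value and its status cannot be read or written as one
+ * machine word, so writers store the value first, fence, and only then update
+ * the status word, while readers spin on the status word before loading the
+ * corresponding value (see SetPartial/SetPrefix/WaitForValid below).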
+ */ +template <typename T> +struct ScanTileDescriptor<T, false> +{ + T prefix_value; + T partial_value; + + /// Workaround for the fact that win32 doesn't guarantee 16B alignment for 16B values of T + union + { + int status; + Uninitialized<T> padding; + }; + + static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix) + { + ThreadStore<STORE_CG>(&ptr->prefix_value, prefix); + __threadfence_block(); +// __threadfence(); // __threadfence_block seems sufficient on current architectures to prevent reordering + ThreadStore<STORE_CG>(&ptr->status, (int) SCAN_TILE_PREFIX); + + } + + static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial) + { + ThreadStore<STORE_CG>(&ptr->partial_value, partial); + __threadfence_block(); +// __threadfence(); // __threadfence_block seems sufficient on current architectures to prevent reordering + ThreadStore<STORE_CG>(&ptr->status, (int) SCAN_TILE_PARTIAL); + } + + static __device__ __forceinline__ void WaitForValid( + ScanTileDescriptor *ptr, + int &status, + T &value) + { + while (true) + { + status = ThreadLoad<LOAD_CG>(&ptr->status); + if (status != SCAN_TILE_INVALID) break; + + __threadfence_block(); + } + + value = (status == SCAN_TILE_PARTIAL) ? + ThreadLoad<LOAD_CG>(&ptr->partial_value) : + ThreadLoad<LOAD_CG>(&ptr->prefix_value); + } +}; + + +/** + * Stateful prefix functor that provides the running prefix for + * the current tile by using the callback warp to wait on + * aggregates/prefixes from predecessor tiles to become available + */ +template < + typename T, + typename ScanOp> +struct DeviceScanBlockPrefixOp +{ + // Parameterized warp reduce + typedef WarpReduce<T> WarpReduceT; + + // Storage type + typedef typename WarpReduceT::TempStorage _TempStorage; + + // Alias wrapper allowing storage to be unioned + typedef Uninitialized<_TempStorage> TempStorage; + + // Tile status descriptor type + typedef ScanTileDescriptor<T> ScanTileDescriptorT; + + // Fields + ScanTileDescriptorT *d_tile_status; ///< Pointer to array of tile status + _TempStorage &temp_storage; ///< Reference to a warp-reduction instance + ScanOp scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructor + __device__ __forceinline__ + DeviceScanBlockPrefixOp( + ScanTileDescriptorT *d_tile_status, + TempStorage &temp_storage, + ScanOp scan_op, + int tile_idx) : + d_tile_status(d_tile_status), + temp_storage(temp_storage.Alias()), + scan_op(scan_op), + tile_idx(tile_idx) {} + + + // Block until all predecessors within the specified window have non-invalid status + __device__ __forceinline__ + void ProcessWindow( + int predecessor_idx, + int &predecessor_status, + T &window_aggregate) + { + T value; + ScanTileDescriptorT::WaitForValid(d_tile_status + predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window + int flag = (predecessor_status != SCAN_TILE_PARTIAL); + window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(value, flag, scan_op); + } + + + // Prefix functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + ScanTileDescriptorT::SetPartial(d_tile_status + tile_idx, block_aggregate); + } + + // Wait for the window of predecessor tiles to become valid + int predecessor_idx = tile_idx - threadIdx.x - 1; + int predecessor_status; + T
window_aggregate; + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + T exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WarpAll(predecessor_status != SCAN_TILE_PREFIX)) + { + predecessor_idx -= PtxArchProps::WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + ScanTileDescriptorT::SetPrefix( + d_tile_status + tile_idx, + inclusive_prefix); + } + + // Return exclusive_prefix + return exclusive_prefix; + } +}; + + +// Running scan prefix callback type for single-block scans. +// Maintains a running prefix that can be applied to consecutive +// scan operations. +template <typename T> +struct RunningBlockPrefixOp +{ + // Running prefix + T running_total; + + // Callback operator. + __device__ T operator()(T block_aggregate) + { + T old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh new file mode 100644 index 000000000..5896dbcf6 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. + */ + +#pragma once + +#include <iterator> + +#include "../../../util_type.cuh" +#include "../../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics + */ +template < + typename BlockHistogramTilesPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1] + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct BlockHistogramTilesGlobalAtomic +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Sample type + typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT; + + // Constants + enum + { + BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + // Shared memory type required by this thread block + typedef NullType TempStorage; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockHistogramTilesGlobalAtomic( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + d_in(d_in), + d_out_histograms(d_out_histograms) + {} + + + /** + * Process a single tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + SizeT block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + 
#pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(d_out_histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + {} +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh new file mode 100644 index 000000000..c55d78953 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh @@ -0,0 +1,237 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics + */ + +#pragma once + +#include <iterator> + +#include "../../../util_type.cuh" +#include "../../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics + */ +template < + typename BlockHistogramTilesPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1] + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct BlockHistogramTilesSharedAtomic +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Sample type + typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT; + + // Constants + enum + { + BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1]; // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockHistogramTilesSharedAtomic( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } 
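+            // (At this point each thread has zeroed bins threadIdx.x, threadIdx.x + BLOCK_THREADS, ...,
+            // covering every full BLOCK_THREADS-wide slice of the histogram.)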
+ // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + } + } + + + /** + * Process a single tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + SizeT block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + + __threadfence_block(); + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(temp_storage.histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + { + // Barrier to ensure shared memory histograms are coherent + __syncthreads(); + + // Copy shared memory histograms to output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * BINS); + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh new file mode 100644 index 000000000..0f821309c --- /dev/null +++ b/lib/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh @@ -0,0 +1,364 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting + */ + +#pragma once + +#include <iterator> + +#include "../../../block/block_radix_sort.cuh" +#include "../../../block/block_discontinuity.cuh" +#include "../../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting + */ +template < + typename BlockHistogramTilesPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that can be cast as an integer in the range [0..BINS-1] + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type for offsets +struct BlockHistogramTilesSort +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Sample type + typedef typename std::iterator_traits<InputIteratorRA>::value_type SampleT; + + // Constants + enum + { + BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + + STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT; + + /// Shared memory type required by this thread block + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Histogram counters striped across threads + HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD]; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIteratorRA d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockHistogramTilesSort( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorRA d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram counters striped across threads + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; 
++CHANNEL) + { + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + thread_counters[CHANNEL][COUNTER] = 0; + } + } + } + + + /** + * Composite a tile of input items + */ + __device__ __forceinline__ void Composite( + SampleT (&items)[ITEMS_PER_THREAD], ///< Tile of samples + HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads + { + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + __syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + } + + __syncthreads(); + + // Note the begin/end run offsets of bin runs in the sorted tile + int flags[ITEMS_PER_THREAD]; // unused + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + // Initialize the shared memory's run_begin and run_end for each bin + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin]; + + thread_counters[COUNTER] += run_length; + } + } + + + /** + * Process one channel within a tile. + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTileChannel( + int channel, + SizeT block_offset, + int valid_items) + { + // Load items in striped fashion + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD]; + + // Unguarded loads + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + } + else + { + // Only a partially-full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD]; + + // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later + int bounds = (valid_items - (threadIdx.x * CHANNELS)); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ? + d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : + 0; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + + __syncthreads(); + + // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items + if (threadIdx.x == 0) + { + int extra = (TILE_ITEMS - valid_items) / CHANNELS; + thread_counters[channel][0] -= extra; + } + } + } + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step. + */ + template <bool FULL_TILE, int CHANNEL, int END> + struct IterateChannels + { + /** + * Process one channel within a tile. 
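+         * (Each inductive step synchronizes the thread block, consumes one channel,
+         * and then recurses into the next channel; the terminating specialization
+         * below ends the compile-time unrolled chain.)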
+ */ + static __device__ __forceinline__ void ConsumeTileChannel( + BlockHistogramTilesSort *cta, + SizeT block_offset, + int valid_items) + { + __syncthreads(); + + cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items); + + IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items); + } + }; + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step. + */ + template <bool FULL_TILE, int END> + struct IterateChannels<FULL_TILE, END, END> + { + static __device__ __forceinline__ void ConsumeTileChannel(BlockHistogramTilesSort *cta, SizeT block_offset, int valid_items) {} + }; + + + /** + * Process a single tile of input + */ + template <bool FULL_TILE> + __device__ __forceinline__ void ConsumeTile( + SizeT block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + // First channel + ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items); + + // Iterate through remaining channels + IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items); + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + { + // Copy counters striped across threads into the histogram output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * BINS); + + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + + if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS)) + { + d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER]; + } + } + } + } +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/device/device_histogram.cuh b/lib/kokkos/TPL/cub/device/device_histogram.cuh new file mode 100644 index 000000000..6f5a74d1f --- /dev/null +++ b/lib/kokkos/TPL/cub/device/device_histogram.cuh @@ -0,0 +1,1062 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from samples data residing within global memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "block/block_histo_tiles.cuh" +#include "../grid/grid_even_share.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_debug.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Initialization pass kernel entry point (multi-block). Prepares queue descriptors zeroes global counters. + */ +template < + int BINS, ///< Number of histogram bins per channel + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SizeT, ///< Integer type used for global array indexing + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin +__launch_bounds__ (BINS, 1) +__global__ void InitHistoKernel( + GridQueue<SizeT> grid_queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt> + SizeT num_samples) ///< [in] Total number of samples \p d_samples for all channels +{ + d_out_histograms.array[blockIdx.x][threadIdx.x] = 0; + if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples); +} + + +/** + * Histogram pass kernel entry point (multi-block). Computes privatized histograms, one per thread block. + */ +template < + typename BlockHistogramTilesPolicy, ///< Tuning policy for cub::BlockHistogramTiles abstraction + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to <tt>unsigned char</tt> + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockHistogramTilesPolicy::BLOCK_THREADS), BlockHistogramTilesPolicy::SM_OCCUPANCY) +__global__ void MultiBlockHistogramKernel( + InputIteratorRA d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). 
+ ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt> + SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels + GridEvenShare<SizeT> even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks + GridQueue<SizeT> queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks +{ + // Constants + enum + { + BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, + TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Thread block type for compositing input tiles + typedef BlockHistogramTiles<BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesT; + + // Shared memory for BlockHistogramTiles + __shared__ typename BlockHistogramTilesT::TempStorage temp_storage; + + // Consume input tiles + BlockHistogramTilesT(temp_storage, d_samples, d_out_histograms.array).ConsumeTiles( + num_samples, + even_share, + queue, + Int2Type<BlockHistogramTilesPolicy::GRID_MAPPING>()); +} + + +/** + * Block-aggregation pass kernel entry point (single-block). Aggregates privatized threadblock histograms from a previous multi-block histogram pass. + */ +template < + int BINS, ///< Number of histogram bins per channel + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin +__launch_bounds__ (BINS, 1) +__global__ void AggregateHistoKernel( + HistoCounter* d_block_histograms, ///< [in] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]</tt> + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_out_histograms, ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt> + int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms +{ + // Accumulate threadblock-histograms from the channel + HistoCounter bin_aggregate = 0; + + int block_offset = blockIdx.x * (num_threadblocks * BINS); + int block_oob = block_offset + (num_threadblocks * BINS); + +#if CUB_PTX_ARCH >= 200 + #pragma unroll 32 +#endif + while (block_offset < block_oob) + { + bin_aggregate += d_block_histograms[block_offset + threadIdx.x]; + block_offset += BINS; + } + + // Output + d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * DeviceHistogram + *****************************************************************************/ + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from samples data residing within global memory.  + * \ingroup DeviceModule + * + * \par Overview + * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a> + * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>). 
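+ *
+ * \par
+ * Every entry point below follows the same two-pass convention: a first call with
+ * \p d_temp_storage == NULL only computes \p temp_storage_bytes, and a second call
+ * with the allocated storage performs the actual histogram.  A minimal sketch of
+ * this calling sequence is shown below (illustrative only; \p d_samples and
+ * \p d_histogram are assumed to be device allocations managed by the caller).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ * ...
+ *
+ * // Device data (assumed to be allocated and populated elsewhere)
+ * unsigned char *d_samples;      // num_items samples, each in [0..255]
+ * unsigned int  *d_histogram;    // 256 output bin counters
+ * int num_items = ...
+ *
+ * // First call: query the required temporary storage size
+ * void   *d_temp_storage     = NULL;
+ * size_t  temp_storage_bytes = 0;
+ * cub::DeviceHistogram::SingleChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_items);
+ *
+ * // Second call: allocate the storage and compute the histogram
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ * cub::DeviceHistogram::SingleChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_items);
+ * \endcode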
+ * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + * \par Performance + * + * \image html histo_perf.png + * + */ +struct DeviceHistogram +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockHistogramTilesPolicy. + struct KernelDispachParams + { + // Policy fields + int block_threads; + int items_per_thread; + BlockHistogramTilesAlgorithm block_algorithm; + GridMappingStrategy grid_mapping; + int subscription_factor; + + // Derived fields + int channel_tile_size; + + template <typename BlockHistogramTilesPolicy> + __host__ __device__ __forceinline__ + void Init(int subscription_factor = 1) + { + block_threads = BlockHistogramTilesPolicy::BLOCK_THREADS; + items_per_thread = BlockHistogramTilesPolicy::ITEMS_PER_THREAD; + block_algorithm = BlockHistogramTilesPolicy::GRID_ALGORITHM; + grid_mapping = BlockHistogramTilesPolicy::GRID_MAPPING; + this->subscription_factor = subscription_factor; + + channel_tile_size = block_threads * items_per_thread; + } + + __host__ __device__ __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + block_algorithm, + grid_mapping, + subscription_factor); + } + + }; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// Specializations of tuned policy types for different PTX architectures + template < + int CHANNELS, + int ACTIVE_CHANNELS, + BlockHistogramTilesAlgorithm GRID_ALGORITHM, + int ARCH> + struct TunedPolicies; + + /// SM35 tune + template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM> + struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 350> + { + typedef BlockHistogramTilesPolicy< + (GRID_ALGORITHM == GRID_HISTO_SORT) ? 128 : 256, + (GRID_ALGORITHM == GRID_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + (GRID_ALGORITHM == GRID_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, + (GRID_ALGORITHM == GRID_HISTO_SORT) ? 8 : 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 7 }; + }; + + /// SM30 tune + template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM> + struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 300> + { + typedef BlockHistogramTilesPolicy< + 128, + (GRID_ALGORITHM == GRID_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + (GRID_ALGORITHM == GRID_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + + /// SM20 tune + template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM> + struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 200> + { + typedef BlockHistogramTilesPolicy< + 128, + (GRID_ALGORITHM == GRID_HISTO_SORT) ? 
21 : (23 / ACTIVE_CHANNELS), + GRID_ALGORITHM, + GRID_MAPPING_DYNAMIC, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + + /// SM10 tune + template <int CHANNELS, int ACTIVE_CHANNELS, BlockHistogramTilesAlgorithm GRID_ALGORITHM> + struct TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 100> + { + typedef BlockHistogramTilesPolicy< + 128, + 7, + GRID_HISTO_SORT, // (use sort regardless because atomics are perf-useless) + GRID_MAPPING_EVEN_SHARE, + 1> MultiBlockPolicy; + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + + + /// Tuning policy for the PTX architecture that DeviceHistogram operations will get dispatched to + template < + int CHANNELS, + int ACTIVE_CHANNELS, + BlockHistogramTilesAlgorithm GRID_ALGORITHM> + struct PtxDefaultPolicies + { + static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? + 350 : + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? + 200 : + 100; + + // Tuned policy set for the current PTX compiler pass + typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, PTX_TUNE_ARCH> PtxTunedPolicies; + + // Subscription factor for the current PTX compiler pass + static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR; + + // MultiBlockPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {}; + + /** + * Initialize dispatch params with the policies corresponding to the PTX assembly we will use + */ + static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params) + { + if (ptx_version >= 350) + { + typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 350> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 300> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else if (ptx_version >= 200) + { + typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 200> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else + { + typedef TunedPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM, 100> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + } + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal dispatch routine for invoking device-wide, multi-channel, histogram + */ + template < + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InitHistoKernelPtr, ///< Function type of cub::InitHistoKernel + typename MultiBlockHistogramKernelPtr, ///< Function type of cub::MultiBlockHistogramKernel + typename AggregateHistoKernelPtr, ///< Function type of cub::AggregateHistoKernel + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that is assignable to <tt>unsigned char</tt> + typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::InitHistoKernel + MultiBlockHistogramKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockHistogramKernel + AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::AggregateHistoKernel + KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for + InputIteratorRA d_samples, ///< [in] Input samples to histogram + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. + SizeT num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture + int multi_block_sm_occupancy = CUB_MIN( + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS, + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / multi_block_dispatch_params.block_threads); + +#ifndef __CUDA_ARCH__ + // We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties + Device device_props; + if (CubDebug(error = device_props.Init(device_ordinal))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + multi_block_sm_occupancy, + multi_block_kernel, + multi_block_dispatch_params.block_threads))) break; +#endif + + // Get device occupancy for multi_block_kernel + int multi_block_occupancy = multi_block_sm_occupancy * sm_count; + + // Even-share work distribution + GridEvenShare<SizeT> even_share; + + // Get tile size for multi_block_kernel + int multi_block_tile_size = multi_block_dispatch_params.channel_tile_size * CHANNELS; + + // Get grid size for multi_block_kernel + int multi_block_grid_size; + switch (multi_block_dispatch_params.grid_mapping) + { + case GRID_MAPPING_EVEN_SHARE: + + // Work is distributed evenly + even_share.GridInit( + num_samples, + multi_block_occupancy * multi_block_dispatch_params.subscription_factor, + multi_block_tile_size); + multi_block_grid_size = even_share.grid_size; + break; + + case 
GRID_MAPPING_DYNAMIC: + + // Work is distributed dynamically + int num_tiles = (num_samples + multi_block_tile_size - 1) / multi_block_tile_size; + multi_block_grid_size = (num_tiles < multi_block_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + multi_block_occupancy; // Fill the device with threadblocks + break; + }; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + ACTIVE_CHANNELS * multi_block_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Privatized per-block reductions + HistoCounter *d_block_histograms = (HistoCounter*) allocations[0]; + + // Grid queue descriptor + GridQueue<SizeT> queue(allocations[1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; + + // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_temp_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * multi_block_grid_size * BINS); + + // Log init_kernel configuration + if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream); + + // Invoke init_kernel to initialize counters and queue descriptor + init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Whether we need privatized histograms (i.e., non-global atomics and multi-block) + bool privatized_temporaries = (multi_block_grid_size > 1) && (multi_block_dispatch_params.block_algorithm != GRID_HISTO_GLOBAL_ATOMIC); + + // Log multi_block_kernel configuration + if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_block_sm_occupancy); + + // Invoke multi_block_kernel + multi_block_kernel<<<multi_block_grid_size, multi_block_dispatch_params.block_threads, 0, stream>>>( + d_samples, + (privatized_temporaries) ? 
+ d_temp_histo_wrapper : + d_histo_wrapper, + num_samples, + even_share, + queue); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Aggregate privatized block histograms if necessary + if (privatized_temporaries) + { + // Log aggregate_kernel configuration + if (stream_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n", + ACTIVE_CHANNELS, BINS, (long long) stream); + + // Invoke aggregate_kernel + aggregate_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>( + d_block_histograms, + d_histo_wrapper, + multi_block_grid_size); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * \brief Computes a device-wide histogram + * + * \tparam GRID_ALGORITHM cub::BlockHistogramTilesAlgorithm enumerator specifying the underlying algorithm to use + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that is assignable to <tt>unsigned char</tt> + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + BlockHistogramTilesAlgorithm GRID_ALGORITHM, + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to <tt>unsigned char</tt> + typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples to histogram + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
+ { + // Type used for array indexing + typedef int SizeT; + + // Tuning polices for the PTX architecture that will get dispatched to + typedef PtxDefaultPolicies<CHANNELS, ACTIVE_CHANNELS, GRID_ALGORITHM> PtxDefaultPolicies; + typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy; + + cudaError error = cudaSuccess; + do + { + // Declare dispatch parameters + KernelDispachParams multi_block_dispatch_params; + + #ifdef __CUDA_ARCH__ + + // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly + multi_block_dispatch_params.Init<MultiBlockPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); + + #else + + // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params); + + #endif + + Dispatch<BINS, CHANNELS, ACTIVE_CHANNELS>( + d_temp_storage, + temp_storage_bytes, + InitHistoKernel<BINS, ACTIVE_CHANNELS, SizeT, HistoCounter>, + MultiBlockHistogramKernel<MultiBlockPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT>, + AggregateHistoKernel<BINS, ACTIVE_CHANNELS, HistoCounter>, + multi_block_dispatch_params, + d_samples, + d_histograms, + num_samples, + stream, + stream_synchronous); + + if (CubDebug(error)) break; + } + while (0); + + return error; + } + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + + /******************************************************************//** + * \name Single-channel samples + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide histogram. Uses fast block-sorting to compute the histogram. Delivers consistent throughput regardless of sample diversity, but occupancy may be limited by histogram bin count. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the computation of a 256-bin histogram of + * single-channel <tt>unsigned char</tt> samples. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input samples and 256-bin output histogram + * unsigned char *d_samples; + * unsigned int *d_histogram; + * int num_items = ... + * ... 
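+     * // (Both pointers are assumed to reference device memory: num_items input
+     * //  samples and 256 output counters, respectively.)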
+ * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexIteratorRA<unsigned int> d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements for histogram computation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); + * + * // Allocate temporary storage for histogram computation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histogram + * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + int BINS, + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t SingleChannelSorting( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples + HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { + return Dispatch<GRID_HISTO_SORT, BINS, 1, 1>( + d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous); + } + + + /** + * \brief Computes a device-wide histogram. Uses shared-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded, and occupancy may be limited by histogram bin count. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the computation of a 256-bin histogram of + * single-channel <tt>unsigned char</tt> samples. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input samples and 256-bin output histogram + * unsigned char *d_samples; + * unsigned int *d_histogram; + * int num_items = ... + * ... 
+ * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexIteratorRA<unsigned int> d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements for histogram computation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); + * + * // Allocate temporary storage for histogram computation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histogram + * cub::DeviceHistogram::SingleChannelSharedAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + int BINS, + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t SingleChannelSharedAtomic( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples + HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch<GRID_HISTO_SHARED_ATOMIC, BINS, 1, 1>( + d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous); + } + + + /** + * \brief Computes a device-wide histogram. Uses global-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded. + * + * Performance is not significantly impacted when computing histograms having large + * numbers of bins (e.g., thousands). + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the computation of a 256-bin histogram of + * single-channel <tt>unsigned char</tt> samples. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input samples and 256-bin output histogram + * unsigned char *d_samples; + * unsigned int *d_histogram; + * int num_items = ... + * ... 
+ * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexIteratorRA<unsigned int> d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements for histogram computation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); + * + * // Allocate temporary storage for histogram computation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histogram + * cub::DeviceHistogram::SingleChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + int BINS, + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t SingleChannelGlobalAtomic( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples + HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch<GRID_HISTO_GLOBAL_ATOMIC, BINS, 1, 1>( + d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Interleaved multi-channel samples + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide histogram from multi-channel data. Uses fast block-sorting to compute the histogram. Delivers consistent throughput regardless of sample diversity, but occupancy may be limited by histogram bin count. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + * + * The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the computation of three 256-bin histograms from + * interleaved quad-channel <tt>unsigned char</tt> samples (e.g., RGB histograms from RGBA samples). + * \par + * \code + * #include <cub/cub.cuh> + * ... 
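+     * // (The input is assumed to be interleaved quad-channel RGBA data; only the
+     * //  first three channels (RGB) are histogrammed, one per entry of d_histograms.)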
+ * + * // Declare and initialize device pointers for input samples and + * // three 256-bin output histograms + * unsigned char *d_samples; + * unsigned int *d_histograms[3]; + * int num_items = ... + * ... + * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexIteratorRA<unsigned int> d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements for histogram computation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); + * + * // Allocate temporary storage for histogram computation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + int BINS, + int CHANNELS, + int ACTIVE_CHANNELS, + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t MultiChannelSorting( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch<GRID_HISTO_SORT, BINS, CHANNELS, ACTIVE_CHANNELS>( + d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous); + } + + + /** + * \brief Computes a device-wide histogram from multi-channel data. Uses shared-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded, and occupancy may be limited by histogram bin count. 
+ * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + * + * The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the computation of three 256-bin histograms from + * interleaved quad-channel <tt>unsigned char</tt> samples (e.g., RGB histograms from RGBA samples). + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input samples and + * // three 256-bin output histograms + * unsigned char *d_samples; + * unsigned int *d_histograms[3]; + * int num_items = ... + * ... + * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexIteratorRA<unsigned int> d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements for histogram computation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiChannelSharedAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); + * + * // Allocate temporary storage for histogram computation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiChannelSharedAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + int BINS, + int CHANNELS, + int ACTIVE_CHANNELS, + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t MultiChannelSharedAtomic( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. 
May cause significant slowdown. Default is \p false. + { + return Dispatch<GRID_HISTO_SHARED_ATOMIC, BINS, CHANNELS, ACTIVE_CHANNELS>( + d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous); + } + + + /** + * \brief Computes a device-wide histogram from multi-channel data. Uses global-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded. + * + * Performance is not significantly impacted when computing histograms having large + * numbers of bins (e.g., thousands). + * + * The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * Performance is often improved when referencing input samples through a texture-caching iterator, e.g., cub::TexIteratorRA or cub::TexTransformIteratorRA. + * + * \par + * The code snippet below illustrates the computation of three 256-bin histograms from + * interleaved quad-channel <tt>unsigned char</tt> samples (e.g., RGB histograms from RGBA samples). + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input samples and + * // three 256-bin output histograms + * unsigned char *d_samples; + * unsigned int *d_histograms[3]; + * int num_items = ... + * ... + * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexIteratorRA<unsigned int> d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements for histogram computation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); + * + * // Allocate temporary storage for histogram computation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS <b>[inferred]</b> Number of channels actively being histogrammed + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] + * \tparam HistoCounter <b>[inferred]</b> Integral type for counting sample occurrences per histogram bin + */ + template < + int BINS, + int CHANNELS, + int ACTIVE_CHANNELS, + typename InputIteratorRA, + typename HistoCounter> + __host__ __device__ __forceinline__ + static cudaError_t MultiChannelGlobalAtomic( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). 
+        HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter.
+        int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels
+        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+    {
+        return Dispatch<GRID_HISTO_GLOBAL_ATOMIC, BINS, CHANNELS, ACTIVE_CHANNELS>(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous);
+    }
+
+    //@} end member group
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/lib/kokkos/TPL/cub/device/device_radix_sort.cuh b/lib/kokkos/TPL/cub/device/device_radix_sort.cuh
new file mode 100644
index 000000000..087d546bc
--- /dev/null
+++ b/lib/kokkos/TPL/cub/device/device_radix_sort.cuh
@@ -0,0 +1,890 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory.
+ */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "block/block_radix_sort_upsweep_tiles.cuh" +#include "block/block_radix_sort_downsweep_tiles.cuh" +#include "block/block_scan_tiles.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_debug.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block. + */ +template < + typename BlockRadixSortUpsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction + typename Key, ///< Key type + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortUpsweepKernel( + Key *d_keys, ///< [in] Input keys buffer + SizeT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + SizeT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity) + bool first_pass, ///< [in] Whether this is the first digit pass + GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks +{ + + // Alternate policy for when fewer bits remain + typedef typename BlockRadixSortUpsweepTilesPolicy::AltPolicy AltPolicy; + + // Parameterize two versions of BlockRadixSortUpsweepTiles type for the current configuration + typedef BlockRadixSortUpsweepTiles<BlockRadixSortUpsweepTilesPolicy, Key, SizeT> BlockRadixSortUpsweepTilesT; // Primary + typedef BlockRadixSortUpsweepTiles<AltPolicy, Key, SizeT> AltBlockRadixSortUpsweepTilesT; // Alternate (smaller bit granularity) + + // Shared memory storage + __shared__ union + { + typename BlockRadixSortUpsweepTilesT::TempStorage pass_storage; + typename AltBlockRadixSortUpsweepTilesT::TempStorage alt_pass_storage; + } temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + // Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit) + if (use_primary_bit_granularity) + { + // Primary granularity + SizeT bin_count; + BlockRadixSortUpsweepTilesT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles( + even_share.block_offset, + even_share.block_oob, + bin_count); + + // Write out digit counts (striped) + if (threadIdx.x < BlockRadixSortUpsweepTilesT::RADIX_DIGITS) + { + d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count; + } + } + else + { + // Alternate granularity + // Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit) + SizeT bin_count; + AltBlockRadixSortUpsweepTilesT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles( + even_share.block_offset, + even_share.block_oob, + bin_count); + + // Write out digit counts (striped) + if (threadIdx.x < AltBlockRadixSortUpsweepTilesT::RADIX_DIGITS) + { + d_spine[(gridDim.x * 
threadIdx.x) + blockIdx.x] = bin_count; + } + } +} + + +/** + * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + */ +template < + typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortScanKernel( + SizeT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + int num_counts) ///< [in] Total number of bin-counts +{ + // Parameterize the BlockScanTiles type for the current configuration + typedef BlockScanTiles<BlockScanTilesPolicy, SizeT*, SizeT*, cub::Sum, SizeT, SizeT> BlockScanTilesT; + + // Shared memory storage + __shared__ typename BlockScanTilesT::TempStorage temp_storage; + + // Block scan instance + BlockScanTilesT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), SizeT(0)) ; + + // Process full input tiles + int block_offset = 0; + RunningBlockPrefixOp<SizeT> prefix_op; + prefix_op.running_total = 0; + while (block_offset < num_counts) + { + block_scan.ConsumeTile<true, false>(block_offset, prefix_op); + block_offset += BlockScanTilesT::TILE_ITEMS; + } +} + + +/** + * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. + */ +template < + typename BlockRadixSortDownsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction + typename Key, ///< Key type + typename Value, ///< Value type + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS)) +__global__ void RadixSortDownsweepKernel( + Key *d_keys_in, ///< [in] Input keys ping buffer + Key *d_keys_out, ///< [in] Output keys pong buffer + Value *d_values_in, ///< [in] Input values ping buffer + Value *d_values_out, ///< [in] Output values pong buffer + SizeT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
+ SizeT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity) + bool first_pass, ///< [in] Whether this is the first digit pass + bool last_pass, ///< [in] Whether this is the last digit pass + GridEvenShare<SizeT> even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks +{ + + // Alternate policy for when fewer bits remain + typedef typename BlockRadixSortDownsweepTilesPolicy::AltPolicy AltPolicy; + + // Parameterize two versions of BlockRadixSortDownsweepTiles type for the current configuration + typedef BlockRadixSortDownsweepTiles<BlockRadixSortDownsweepTilesPolicy, Key, Value, SizeT> BlockRadixSortDownsweepTilesT; + typedef BlockRadixSortDownsweepTiles<AltPolicy, Key, Value, SizeT> AltBlockRadixSortDownsweepTilesT; + + // Shared memory storage + __shared__ union + { + typename BlockRadixSortDownsweepTilesT::TempStorage pass_storage; + typename AltBlockRadixSortDownsweepTilesT::TempStorage alt_pass_storage; + + } temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + if (use_primary_bit_granularity) + { + // Process input tiles + BlockRadixSortDownsweepTilesT(temp_storage.pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles( + even_share.block_offset, + even_share.block_oob); + } + else + { + // Process input tiles + AltBlockRadixSortDownsweepTilesT(temp_storage.alt_pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles( + even_share.block_offset, + even_share.block_oob); + } +} + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + +/****************************************************************************** + * DeviceRadixSort + *****************************************************************************/ + +/** + * \brief DeviceRadixSort provides operations for computing a device-wide, parallel radix sort across data items residing within global memory.  + * \ingroup DeviceModule + * + * \par Overview + * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * <tt>unsigned char</tt>, \p int, \p double, etc. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. 
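[Editorial sketch, not part of the CUB sources or this patch: as a concrete illustration of the "simple bit-wise transformations" mentioned above, the snippet below shows the classic order-preserving key conversions for 32-bit signed integers and IEEE-754 floats, assuming two's-complement and IEEE-754 representations and ignoring NaNs. CUB performs equivalent conversions internally through its key-traits machinery; the function names here are illustrative only.]

    #include <cstdint>
    #include <cstring>

    // Signed 32-bit int -> unsigned key with the same ordering:
    // flipping the sign bit moves negative values below non-negative ones.
    static inline std::uint32_t OrderPreservingBits(std::int32_t key)
    {
        return static_cast<std::uint32_t>(key) ^ 0x80000000u;
    }

    // 32-bit float -> unsigned key with the same ordering:
    // flip the sign bit of non-negative values; flip all bits of negative
    // values (which also reverses the order of the negatives).
    static inline std::uint32_t OrderPreservingBits(float key)
    {
        std::uint32_t bits;
        std::memcpy(&bits, &key, sizeof(bits));
        std::uint32_t mask = (bits & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
        return bits ^ mask;
    }

[Sorting the transformed keys as plain unsigned integers and inverting the transformation afterwards reproduces the numeric order of the original keys.]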
+ * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * + * \image html lsd_sort_perf.png + * + */ +struct DeviceRadixSort +{ + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Generic structure for encapsulating dispatch properties codified in block policy. + struct KernelDispachParams + { + int block_threads; + int items_per_thread; + cudaSharedMemConfig smem_config; + int radix_bits; + int alt_radix_bits; + int subscription_factor; + int tile_size; + + template <typename SortBlockPolicy> + __host__ __device__ __forceinline__ + void InitUpsweepPolicy(int subscription_factor = 1) + { + block_threads = SortBlockPolicy::BLOCK_THREADS; + items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD; + radix_bits = SortBlockPolicy::RADIX_BITS; + alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS; + smem_config = cudaSharedMemBankSizeFourByte; + this->subscription_factor = subscription_factor; + tile_size = block_threads * items_per_thread; + } + + template <typename ScanBlockPolicy> + __host__ __device__ __forceinline__ + void InitScanPolicy() + { + block_threads = ScanBlockPolicy::BLOCK_THREADS; + items_per_thread = ScanBlockPolicy::ITEMS_PER_THREAD; + radix_bits = 0; + alt_radix_bits = 0; + smem_config = cudaSharedMemBankSizeFourByte; + subscription_factor = 0; + tile_size = block_threads * items_per_thread; + } + + template <typename SortBlockPolicy> + __host__ __device__ __forceinline__ + void InitDownsweepPolicy(int subscription_factor = 1) + { + block_threads = SortBlockPolicy::BLOCK_THREADS; + items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD; + radix_bits = SortBlockPolicy::RADIX_BITS; + alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS; + smem_config = SortBlockPolicy::SMEM_CONFIG; + this->subscription_factor = subscription_factor; + tile_size = block_threads * items_per_thread; + } + }; + + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// Specializations of tuned policy types for different PTX architectures + template <typename Key, typename Value, typename SizeT, int ARCH> + struct TunedPolicies; + + /// SM35 tune + template <typename Key, typename Value, typename SizeT> + struct TunedPolicies<Key, Value, SizeT, 350> + { + enum { + KEYS_ONLY = (Equals<Value, NullType>::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy; +/* + // 4bit + typedef BlockRadixSortUpsweepTilesPolicy <128, 15, LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRadixSortUpsweepTilesPolicy <256, 13, LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; +*/ + // ScanPolicy + typedef BlockScanTilesPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), 
BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy; + +/* + // 4bit + typedef BlockRadixSortDownsweepTilesPolicy <128, 15, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRadixSortDownsweepTilesPolicy <256, 13, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; +*/ + enum { SUBSCRIPTION_FACTOR = 7 }; + }; + + + /// SM20 tune + template <typename Key, typename Value, typename SizeT> + struct TunedPolicies<Key, Value, SizeT, 200> + { + enum { + KEYS_ONLY = (Equals<Value, NullType>::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy; + + // ScanPolicy + typedef BlockScanTilesPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy; + + enum { SUBSCRIPTION_FACTOR = 3 }; + }; + + + /// SM10 tune + template <typename Key, typename Value, typename SizeT> + struct TunedPolicies<Key, Value, SizeT, 100> + { + enum { + RADIX_BITS = 4, + }; + + // UpsweepPolicy + typedef BlockRadixSortUpsweepTilesPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy; + + // ScanPolicy + typedef BlockScanTilesPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRadixSortDownsweepTilesPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy; + + enum { SUBSCRIPTION_FACTOR = 3 }; + }; + + + + /****************************************************************************** + * Default policy initializer + ******************************************************************************/ + + /// Tuning policy for the PTX architecture that DeviceRadixSort operations will get dispatched to + template <typename Key, typename Value, 
typename SizeT> + struct PtxDefaultPolicies + { + + static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? + 350 : + (CUB_PTX_ARCH >= 200) ? + 200 : + 100; + + // Tuned policy set for the current PTX compiler pass + typedef TunedPolicies<Key, Value, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies; + + // UpsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct UpsweepPolicy : PtxTunedPolicies::UpsweepPolicy {}; + + // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct ScanPolicy : PtxTunedPolicies::ScanPolicy {}; + + // DownsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct DownsweepPolicy : PtxTunedPolicies::DownsweepPolicy {}; + + // Subscription factor for the current PTX compiler pass + enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR }; + + + /** + * Initialize dispatch params with the policies corresponding to the PTX assembly we will use + */ + static void InitDispatchParams( + int ptx_version, + KernelDispachParams &upsweep_dispatch_params, + KernelDispachParams &scan_dispatch_params, + KernelDispachParams &downsweep_dispatch_params) + { + if (ptx_version >= 350) + { + typedef TunedPolicies<Key, Value, SizeT, 350> TunedPolicies; + upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>(); + downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else if (ptx_version >= 200) + { + typedef TunedPolicies<Key, Value, SizeT, 200> TunedPolicies; + upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>(); + downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + else + { + typedef TunedPolicies<Key, Value, SizeT, 100> TunedPolicies; + upsweep_dispatch_params.InitUpsweepPolicy<typename TunedPolicies::UpsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + scan_dispatch_params.InitScanPolicy<typename TunedPolicies::ScanPolicy>(); + downsweep_dispatch_params.InitDownsweepPolicy<typename TunedPolicies::DownsweepPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + } + } + }; + + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations. + */ + template < + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename SpineKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename Key, ///< Key type + typename Value, ///< Value type + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
+ UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + SpineKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + KernelDispachParams &upsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for + KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + KernelDispachParams &downsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for + DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + SizeT num_items, ///< [in] Number of items to reduce + int begin_bit = 0, ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture + int downsweep_sm_occupancy = CUB_MIN( + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS, + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / downsweep_dispatch_params.block_threads); + int upsweep_sm_occupancy = downsweep_sm_occupancy; + +#ifndef __CUDA_ARCH__ + // We're on the host, so come up with more accurate estimates of SM occupancy from actual device properties + Device device_props; + if (CubDebug(error = device_props.Init(device_ordinal))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + downsweep_sm_occupancy, + downsweep_kernel, + downsweep_dispatch_params.block_threads))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + upsweep_sm_occupancy, + upsweep_kernel, + upsweep_dispatch_params.block_threads))) break; +#endif + // Get device occupancies + int downsweep_occupancy = downsweep_sm_occupancy * sm_count; + + // Get even-share work distribution descriptor + GridEvenShare<SizeT> even_share; + int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor; + int downsweep_grid_size; + even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size); + downsweep_grid_size = even_share.grid_size; + + // Get number of spine elements (round up to 
nearest spine scan kernel tile size) + int bins = 1 << downsweep_dispatch_params.radix_bits; + int spine_size = downsweep_grid_size * bins; + int spine_tiles = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; + spine_size = spine_tiles * scan_dispatch_params.tile_size; + + int alt_bins = 1 << downsweep_dispatch_params.alt_radix_bits; + int alt_spine_size = downsweep_grid_size * alt_bins; + int alt_spine_tiles = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; + alt_spine_size = alt_spine_tiles * scan_dispatch_params.tile_size; + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + spine_size * sizeof(SizeT), // bytes needed for privatized block digit histograms + }; + + // Alias temporaries (or set the necessary size of the storage allocation) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Privatized per-block digit histograms + SizeT *d_spine = (SizeT*) allocations[0]; + +#ifndef __CUDA_ARCH__ + // Get current smem bank configuration + cudaSharedMemConfig original_smem_config; + if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; + cudaSharedMemConfig current_smem_config = original_smem_config; +#endif + // Iterate over digit places + int current_bit = begin_bit; + while (current_bit < end_bit) + { + // Use primary bit granularity if bits remaining is a whole multiple of bit primary granularity + int bits_remaining = end_bit - current_bit; + bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0); + int radix_bits = (use_primary_bit_granularity) ? + downsweep_dispatch_params.radix_bits : + downsweep_dispatch_params.alt_radix_bits; + +#ifndef __CUDA_ARCH__ + // Update smem config if necessary + if (current_smem_config != upsweep_dispatch_params.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config))) break; + current_smem_config = upsweep_dispatch_params.smem_config; + } +#endif + + // Log upsweep_kernel configuration + if (stream_synchronous) + CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n", + downsweep_grid_size, upsweep_dispatch_params.block_threads, (long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.selector, current_bit, radix_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + upsweep_kernel<<<downsweep_grid_size, upsweep_dispatch_params.block_threads, 0, stream>>>( + d_keys.d_buffers[d_keys.selector], + d_spine, + num_items, + current_bit, + use_primary_bit_granularity, + (current_bit == begin_bit), + even_share); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread); + + // Invoke scan_kernel + scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>( + d_spine, + (use_primary_bit_granularity) ? 
spine_size : alt_spine_size); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + +#ifndef __CUDA_ARCH__ + // Update smem config if necessary + if (current_smem_config != downsweep_dispatch_params.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config))) break; + current_smem_config = downsweep_dispatch_params.smem_config; + } +#endif + + // Log downsweep_kernel configuration + if (stream_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n", + downsweep_grid_size, downsweep_dispatch_params.block_threads, (long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy); + + // Invoke downsweep_kernel + downsweep_kernel<<<downsweep_grid_size, downsweep_dispatch_params.block_threads, 0, stream>>>( + d_keys.d_buffers[d_keys.selector], + d_keys.d_buffers[d_keys.selector ^ 1], + d_values.d_buffers[d_values.selector], + d_values.d_buffers[d_values.selector ^ 1], + d_spine, + num_items, + current_bit, + use_primary_bit_granularity, + (current_bit == begin_bit), + (current_bit + downsweep_dispatch_params.radix_bits >= end_bit), + even_share); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Invert selectors + d_keys.selector ^= 1; + d_values.selector ^= 1; + + // Update current bit position + current_bit += radix_bits; + } + +#ifndef __CUDA_ARCH__ + // Reset smem config if necessary + if (current_smem_config != original_smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; + } +#endif + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + /****************************************************************************** + * Interface + ******************************************************************************/ + + + /** + * \brief Sorts key-value pairs. + * + * \par + * The sorting operation requires a pair of key buffers and a pair of value + * buffers. Each pair is wrapped in a DoubleBuffer structure whose member + * DoubleBuffer::Current() references the active buffer. The currently-active + * buffer may be changed by the sorting operation. + * + * \devicestorage + * + * \cdp + * + * \par + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers for + * // sorting data (keys, values, and equivalently-sized alternate buffers) + * int num_items = ... 
+ *     cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ *     cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+ *
+ *     // Determine temporary device storage requirements for sorting operation
+ *     void *d_temp_storage = NULL;
+ *     size_t temp_storage_bytes = 0;
+ *     cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+ *
+ *     // Allocate temporary storage for sorting operation
+ *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ *     // Run sorting operation
+ *     cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+ *
+ *     // Sorted keys and values are referenced by d_keys.Current() and d_values.Current()
+ *
+ * \endcode
+ *
+ * \tparam Key <b>[inferred]</b> Key type
+ * \tparam Value <b>[inferred]</b> Value type
+ */
+    template <
+        typename Key,
+        typename Value>
+    __host__ __device__ __forceinline__
+    static cudaError_t SortPairs(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int num_items, ///< [in] Number of items to sort
+        int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
+        int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
+ { + // Type used for array indexing + typedef int SizeT; + + // Tuning polices + typedef PtxDefaultPolicies<Key, Value, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies + typedef typename PtxDefaultPolicies::UpsweepPolicy UpsweepPolicy; // Upsweep kernel policy + typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy + typedef typename PtxDefaultPolicies::DownsweepPolicy DownsweepPolicy; // Downsweep kernel policy + + cudaError error = cudaSuccess; + do + { + // Declare dispatch parameters + KernelDispachParams upsweep_dispatch_params; + KernelDispachParams scan_dispatch_params; + KernelDispachParams downsweep_dispatch_params; + +#ifdef __CUDA_ARCH__ + // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly + upsweep_dispatch_params.InitUpsweepPolicy<UpsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); + scan_dispatch_params.InitScanPolicy<ScanPolicy>(); + downsweep_dispatch_params.InitDownsweepPolicy<DownsweepPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); +#else + // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + PtxDefaultPolicies::InitDispatchParams( + ptx_version, + upsweep_dispatch_params, + scan_dispatch_params, + downsweep_dispatch_params); +#endif + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + RadixSortUpsweepKernel<UpsweepPolicy, Key, SizeT>, + RadixSortScanKernel<ScanPolicy, SizeT>, + RadixSortDownsweepKernel<DownsweepPolicy, Key, Value, SizeT>, + upsweep_dispatch_params, + scan_dispatch_params, + downsweep_dispatch_params, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + stream, + stream_synchronous))) break; + } + while (0); + + return error; + } + + + /** + * \brief Sorts keys + * + * \par + * The sorting operation requires a pair of key buffers. The pair is + * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current() + * references the active buffer. The currently-active buffer may be changed + * by the sorting operation. + * + * \devicestorage + * + * \cdp + * + * \par + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers for + * // sorting data (keys and equivalently-sized alternate buffer) + * int num_items = ... + * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements for sorting operation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage for sorting operation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Sorted keys are referenced by d_keys.Current() + * + * \endcode + * + * \tparam Key <b>[inferred]</b> Key type + */ + template <typename Key> + __host__ __device__ __forceinline__ + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + DoubleBuffer<Key> &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to reduce + int begin_bit = 0, ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8, ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { + DoubleBuffer<NullType> d_values; + return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, stream_synchronous); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/device/device_reduce.cuh b/lib/kokkos/TPL/cub/device/device_reduce.cuh new file mode 100644 index 000000000..069af8c1f --- /dev/null +++ b/lib/kokkos/TPL/cub/device/device_reduce.cuh @@ -0,0 +1,775 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory. 
+ */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "block/block_reduce_tiles.cuh" +#include "../thread/thread_operators.cuh" +#include "../grid/grid_even_share.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_debug.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + + + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduction pass kernel entry point (multi-block). Computes privatized reductions, one per thread block. + */ +template < + typename BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename SizeT, ///< Integer type used for global array indexing + typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> +__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1) +__global__ void ReducePrivatizedKernel( + InputIteratorRA d_in, ///< [in] Input data to reduce + OutputIteratorRA d_out, ///< [out] Output location for result + SizeT num_items, ///< [in] Total number of input data items + GridEvenShare<SizeT> even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks + GridQueue<SizeT> queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Thread block type for reducing input tiles + typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> BlockReduceTilesT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename BlockReduceTilesT::TempStorage temp_storage; + + // Consume input tiles + BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles( + num_items, + even_share, + queue, + block_aggregate, + Int2Type<BlockReduceTilesPolicy::GRID_MAPPING>()); + + // Output result + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = block_aggregate; + } +} + + +/** + * Reduction pass kernel entry point (single-block). Aggregates privatized threadblock reductions from a previous multi-block reduction pass. 
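[Editorial sketch, not part of the CUB sources or this patch: the division of labor between ReducePrivatizedKernel and ReduceSingleKernel is the usual two-pass reduction. The serial sketch below, with illustrative names only, shows the same structure on the host: pass one reduces disjoint tiles to one partial per "block", pass two reduces the partials to the final aggregate. It assumes a non-empty input, num_blocks >= 1, and an associative operator.]

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Two-pass reduction over 'in' using binary operator 'op': one partial
    // result per tile ("privatized" pass), then one combining pass ("single").
    template <typename T, typename ReductionOp>
    T TwoPassReduce(const std::vector<T> &in, std::size_t num_blocks, ReductionOp op)
    {
        std::size_t tile = (in.size() + num_blocks - 1) / num_blocks;
        std::vector<T> partials;
        for (std::size_t start = 0; start < in.size(); start += tile)   // "privatized" pass
        {
            std::size_t end = std::min(start + tile, in.size());
            T acc = in[start];
            for (std::size_t i = start + 1; i < end; ++i)
                acc = op(acc, in[i]);
            partials.push_back(acc);
        }
        T result = partials[0];                                          // "single" pass
        for (std::size_t i = 1; i < partials.size(); ++i)
            result = op(result, partials[i]);
        return result;
    }

[The GPU version differs only in that the per-tile loops of pass one run in parallel across thread blocks, and pass two runs within a single thread block.]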
+ */ +template < + typename BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename SizeT, ///< Integer type used for global array indexing + typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> +__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1) +__global__ void ReduceSingleKernel( + InputIteratorRA d_in, ///< [in] Input data to reduce + OutputIteratorRA d_out, ///< [out] Output location for result + SizeT num_items, ///< [in] Total number of input data items + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Thread block type for reducing input tiles + typedef BlockReduceTiles<BlockReduceTilesPolicy, InputIteratorRA, SizeT, ReductionOp> BlockReduceTilesT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename BlockReduceTilesT::TempStorage temp_storage; + + // Consume input tiles + BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles( + SizeT(0), + SizeT(num_items), + block_aggregate); + + // Output result + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = block_aggregate; + } +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * DeviceReduce + *****************************************************************************/ + +/** + * \brief DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory.  + * \ingroup DeviceModule + * + * \par Overview + * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * + * \image html reduction_perf.png + * + */ +struct DeviceReduce +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Generic structure for encapsulating dispatch properties codified in block policy. 
+ struct KernelDispachParams + { + int block_threads; + int items_per_thread; + int vector_load_length; + BlockReduceAlgorithm block_algorithm; + PtxLoadModifier load_modifier; + GridMappingStrategy grid_mapping; + int subscription_factor; + int tile_size; + + template <typename BlockPolicy> + __host__ __device__ __forceinline__ + void Init(int subscription_factor = 1) + { + block_threads = BlockPolicy::BLOCK_THREADS; + items_per_thread = BlockPolicy::ITEMS_PER_THREAD; + vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH; + block_algorithm = BlockPolicy::BLOCK_ALGORITHM; + load_modifier = BlockPolicy::LOAD_MODIFIER; + grid_mapping = BlockPolicy::GRID_MAPPING; + this->subscription_factor = subscription_factor; + tile_size = block_threads * items_per_thread; + } + + __host__ __device__ __forceinline__ + void Print() + { + printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping, %d subscription", + block_threads, + items_per_thread, + vector_load_length, + block_algorithm, + load_modifier, + grid_mapping, + subscription_factor); + } + + }; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// Specializations of tuned policy types for different PTX architectures + template < + typename T, + typename SizeT, + int ARCH> + struct TunedPolicies; + + /// SM35 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 350> + { + // PrivatizedPolicy (1B): GTX Titan: 206.0 GB/s @ 192M 1B items + typedef BlockReduceTilesPolicy<128, 12, 1, BLOCK_REDUCE_RAKING, LOAD_LDG, GRID_MAPPING_DYNAMIC> PrivatizedPolicy1B; + + // PrivatizedPolicy (4B): GTX Titan: 254.2 GB/s @ 48M 4B items + typedef BlockReduceTilesPolicy<512, 20, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy4B; + + // PrivatizedPolicy + typedef typename If<(sizeof(T) < 4), + PrivatizedPolicy1B, + PrivatizedPolicy4B>::Type PrivatizedPolicy; + + // SinglePolicy + typedef BlockReduceTilesPolicy<256, 8, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; + + enum { SUBSCRIPTION_FACTOR = 7 }; + + }; + + /// SM30 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 300> + { + // PrivatizedPolicy: GTX670: 154.0 @ 48M 32-bit T + typedef BlockReduceTilesPolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy; + + // SinglePolicy + typedef BlockReduceTilesPolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; + + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + + /// SM20 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 200> + { + // PrivatizedPolicy (1B): GTX 580: 158.1 GB/s @ 192M 1B items + typedef BlockReduceTilesPolicy<192, 24, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy1B; + + // PrivatizedPolicy (4B): GTX 580: 178.9 GB/s @ 48M 4B items + typedef BlockReduceTilesPolicy<128, 8, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_DYNAMIC> PrivatizedPolicy4B; + + // PrivatizedPolicy + typedef typename If<(sizeof(T) < 4), + PrivatizedPolicy1B, + PrivatizedPolicy4B>::Type PrivatizedPolicy; + + // SinglePolicy + typedef BlockReduceTilesPolicy<192, 7, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; + + enum { SUBSCRIPTION_FACTOR = 2 }; + }; + + /// SM13 tune + template <typename T, typename SizeT> + struct 
TunedPolicies<T, SizeT, 130> + { + // PrivatizedPolicy + typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy; + + // SinglePolicy + typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; + + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + + /// SM10 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 100> + { + // PrivatizedPolicy + typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy; + + // SinglePolicy + typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; + + enum { SUBSCRIPTION_FACTOR = 1 }; + }; + + + + /****************************************************************************** + * Default policy initializer + ******************************************************************************/ + + /// Tuning policy for the PTX architecture that DeviceReduce operations will get dispatched to + template <typename T, typename SizeT> + struct PtxDefaultPolicies + { + static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? + 350 : + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? + 200 : + (CUB_PTX_ARCH >= 130) ? + 130 : + 100; + + // Tuned policy set for the current PTX compiler pass + typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies; + + // Subscription factor for the current PTX compiler pass + static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR; + + // PrivatizedPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct PrivatizedPolicy : PtxTunedPolicies::PrivatizedPolicy {}; + + // SinglePolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct SinglePolicy : PtxTunedPolicies::SinglePolicy {}; + + + /** + * Initialize dispatch params with the policies corresponding to the PTX assembly we will use + */ + static void InitDispatchParams( + int ptx_version, + KernelDispachParams &privatized_dispatch_params, + KernelDispachParams &single_dispatch_params) + { + if (ptx_version >= 350) + { + typedef TunedPolicies<T, SizeT, 350> TunedPolicies; + privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >(); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies<T, SizeT, 300> TunedPolicies; + privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >(); + } + else if (ptx_version >= 200) + { + typedef TunedPolicies<T, SizeT, 200> TunedPolicies; + privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >(); + } + else if (ptx_version >= 130) + { + typedef TunedPolicies<T, SizeT, 130> TunedPolicies; + privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + single_dispatch_params.Init<typename TunedPolicies::SinglePolicy >(); + } + else + { + typedef TunedPolicies<T, SizeT, 100> TunedPolicies; + privatized_dispatch_params.Init<typename TunedPolicies::PrivatizedPolicy>(TunedPolicies::SUBSCRIPTION_FACTOR); + single_dispatch_params.Init<typename 
TunedPolicies::SinglePolicy >(); + } + } + }; + + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations. + */ + template < + typename ReducePrivatizedKernelPtr, ///< Function type of cub::ReducePrivatizedKernel + typename ReduceSingleKernelPtr, ///< Function type of cub::ReduceSingleKernel + typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename SizeT, ///< Integer type used for global array indexing + typename ReductionOp> ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + ReducePrivatizedKernelPtr privatized_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReducePrivatizedKernel + ReduceSingleKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceSingleKernel + ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel + KernelDispachParams &privatized_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p privatized_kernel_ptr was compiled for + KernelDispachParams &single_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for + InputIteratorRA d_in, ///< [in] Input data to reduce + OutputIteratorRA d_out, ///< [out] Output location for result + SizeT num_items, ///< [in] Number of items to reduce + ReductionOp reduction_op, ///< [in] Binary reduction operator + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
+ { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + // Data type of input iterator + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + cudaError error = cudaSuccess; + do + { + if ((privatized_kernel == NULL) || (num_items <= (single_dispatch_params.tile_size))) + { + // Dispatch a single-block reduction kernel + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Log single_kernel configuration + if (stream_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n", + single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread); + + // Invoke single_kernel + single_kernel<<<1, single_dispatch_params.block_threads>>>( + d_in, + d_out, + num_items, + reduction_op); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + else + { + // Dispatch two kernels: a multi-block kernel to compute + // privatized per-block reductions, and then a single-block + // to reduce those + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get a rough estimate of privatized_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture + int privatized_sm_occupancy = CUB_MIN( + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS, + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / privatized_dispatch_params.block_threads); + +#ifndef __CUDA_ARCH__ + // We're on the host, so come up with a more accurate estimate of privatized_kernel SM occupancy from actual device properties + Device device_props; + if (CubDebug(error = device_props.Init(device_ordinal))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + privatized_sm_occupancy, + privatized_kernel, + privatized_dispatch_params.block_threads))) break; +#endif + + // Get device occupancy for privatized_kernel + int privatized_occupancy = privatized_sm_occupancy * sm_count; + + // Even-share work distribution + GridEvenShare<SizeT> even_share; + + // Get grid size for privatized_kernel + int privatized_grid_size; + switch (privatized_dispatch_params.grid_mapping) + { + case GRID_MAPPING_EVEN_SHARE: + + // Work is distributed evenly + even_share.GridInit( + num_items, + privatized_occupancy * privatized_dispatch_params.subscription_factor, + privatized_dispatch_params.tile_size); + privatized_grid_size = even_share.grid_size; + break; + + case GRID_MAPPING_DYNAMIC: + + // Work is distributed dynamically + int num_tiles = (num_items + privatized_dispatch_params.tile_size - 1) / privatized_dispatch_params.tile_size; + privatized_grid_size = (num_tiles < privatized_occupancy) ? 
+ num_tiles : // Not enough to fill the device with threadblocks + privatized_occupancy; // Fill the device with threadblocks + break; + }; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + privatized_grid_size * sizeof(T), // bytes needed for privatized block reductions + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias temporaries (or set the necessary size of the storage allocation) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Privatized per-block reductions + T *d_block_reductions = (T*) allocations[0]; + + // Grid queue descriptor + GridQueue<SizeT> queue(allocations[1]); + + // Prepare the dynamic queue descriptor if necessary + if (privatized_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) + { + // Prepare queue using a kernel so we know it gets prepared once per operation + if (stream_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream); + + // Invoke prepare_drain_kernel + prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log privatized_kernel configuration + if (stream_synchronous) CubLog("Invoking privatized_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + privatized_grid_size, privatized_dispatch_params.block_threads, (long long) stream, privatized_dispatch_params.items_per_thread, privatized_sm_occupancy); + + // Invoke privatized_kernel + privatized_kernel<<<privatized_grid_size, privatized_dispatch_params.block_threads, 0, stream>>>( + d_in, + d_block_reductions, + num_items, + even_share, + queue, + reduction_op); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_kernel configuration + if (stream_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread); + + // Invoke single_kernel + single_kernel<<<1, single_dispatch_params.block_threads, 0, stream>>>( + d_block_reductions, + d_out, + privatized_grid_size, + reduction_op); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor. + * + * \par + * Does not support non-commutative reduction operators. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the max reduction of a device vector of \p int items. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input and output + * int *d_reduce_input, *d_aggregate; + * int num_items = ... + * ... 
+ * + * // Determine temporary device storage requirements for reduction + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max()); + * + * // Allocate temporary storage for reduction + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction (max) + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max()); + * + * \endcode + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA, + typename ReductionOp> + __host__ __device__ __forceinline__ + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Input data to reduce + OutputIteratorRA d_out, ///< [out] Output location for result + int num_items, ///< [in] Number of items to reduce + ReductionOp reduction_op, ///< [in] Binary reduction operator + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
+ { + // Type used for array indexing + typedef int SizeT; + + // Data type of input iterator + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Tuning polices + typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies + typedef typename PtxDefaultPolicies::PrivatizedPolicy PrivatizedPolicy; // Multi-block kernel policy + typedef typename PtxDefaultPolicies::SinglePolicy SinglePolicy; // Single-block kernel policy + + cudaError error = cudaSuccess; + do + { + // Declare dispatch parameters + KernelDispachParams privatized_dispatch_params; + KernelDispachParams single_dispatch_params; + +#ifdef __CUDA_ARCH__ + // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly + privatized_dispatch_params.Init<PrivatizedPolicy>(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); + single_dispatch_params.Init<SinglePolicy>(); +#else + // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + PtxDefaultPolicies::InitDispatchParams(ptx_version, privatized_dispatch_params, single_dispatch_params); +#endif + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + ReducePrivatizedKernel<PrivatizedPolicy, InputIteratorRA, T*, SizeT, ReductionOp>, + ReduceSingleKernel<SinglePolicy, T*, OutputIteratorRA, SizeT, ReductionOp>, + ResetDrainKernel<SizeT>, + privatized_dispatch_params, + single_dispatch_params, + d_in, + d_out, + num_items, + reduction_op, + stream, + stream_synchronous))) break; + } + while (0); + + return error; + } + + + /** + * \brief Computes a device-wide sum using the addition ('+') operator. + * + * \par + * Does not support non-commutative reduction operators. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the sum reduction of a device vector of \p int items. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input and output + * int *d_reduce_input, *d_aggregate; + * int num_items = ... + * ... + * + * // Determine temporary device storage requirements for summation + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items); + * + * // Allocate temporary storage for summation + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction summation + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items); + * + * \endcode + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA> + __host__ __device__ __forceinline__ + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
+ InputIteratorRA d_in, ///< [in] Input data to reduce + OutputIteratorRA d_out, ///< [out] Output location for result + int num_items, ///< [in] Number of items to reduce + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { + return Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), stream, stream_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/device/device_reduce_by_key.cuh b/lib/kokkos/TPL/cub/device/device_reduce_by_key.cuh new file mode 100644 index 000000000..f05f75154 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/device_reduce_by_key.cuh @@ -0,0 +1,633 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. 
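+ *
+ * More precisely, reduce-by-key segments the input into runs of consecutive,
+ * equal-valued keys and reduces the values within each run, compacting the
+ * resulting keys and value-aggregates.  The snippet below sketches the intended
+ * calling pattern (illustrative variable names; it mirrors the two-phase
+ * temporary-storage query used by the other device-wide operations):
+ *
+ * \code
+ * #include <cub/cub.cuh>
+ * ...
+ *
+ * // Device pointers for keys/values input and the compacted outputs
+ * int *d_keys_in, *d_keys_out, *d_values_in, *d_values_out;
+ * int num_items = ...
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduceByKey::ReduceValues(d_temp_storage, temp_storage_bytes,
+ *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, cub::Sum());
+ *
+ * // Allocate temporary storage and run the segmented reduction
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ * cub::DeviceReduceByKey::ReduceValues(d_temp_storage, temp_storage_bytes,
+ *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, cub::Sum());
+ * \endcode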
+ */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "block/block_reduce_by_key_tiles.cuh" +#include "device_scan.cuh" +#include "../thread/thread_operators.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_iterator.cuh" +#include "../util_debug.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reduce-by-key kernel entry point (multi-block) + */ +template < + typename BlockReduceByKeyilesPolicy, ///< Tuning policy for cub::BlockReduceByKeyiles abstraction + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename T, ///< The scan data type + typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockSweepScanPolicy::BLOCK_THREADS)) +__global__ void MultiBlockScanKernel( + InputIteratorRA d_in, ///< Input data + OutputIteratorRA d_out, ///< Output data + ScanTileDescriptor<T> *d_tile_status, ///< Global list of tile status + ReductionOp reduction_op, ///< Binary scan operator + Identity identity, ///< Identity element + SizeT num_items, ///< Total number of scan items for the entire problem + GridQueue<int> queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks +{ + enum + { + TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, + }; + + // Thread block type for scanning input tiles + typedef BlockSweepScan< + BlockSweepScanPolicy, + InputIteratorRA, + OutputIteratorRA, + ReductionOp, + Identity, + SizeT> BlockSweepScanT; + + // Shared memory for BlockSweepScan + __shared__ typename BlockSweepScanT::TempStorage temp_storage; + + // Process tiles + BlockSweepScanT(temp_storage, d_in, d_out, reduction_op, identity).ConsumeTiles( + num_items, + queue, + d_tile_status + TILE_STATUS_PADDING); +} + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * DeviceReduceByKey + *****************************************************************************/ + +/** + * \addtogroup DeviceModule + * @{ + */ + +/** + * \brief DeviceReduceByKey provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory.  + */ +struct DeviceReduceByKey +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockSweepScanPolicy. 
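+    /// The derived \p tile_size (BLOCK_THREADS * ITEMS_PER_THREAD) fixes how the input
+    /// is tiled; the dispatch logic below computes
+    /// num_tiles = (num_items + tile_size - 1) / tile_size.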
+ struct KernelDispachParams + { + // Policy fields + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + BlockStoreAlgorithm store_policy; + BlockScanAlgorithm scan_algorithm; + + // Other misc + int tile_size; + + template <typename BlockSweepScanPolicy> + __host__ __device__ __forceinline__ + void Init() + { + block_threads = BlockSweepScanPolicy::BLOCK_THREADS; + items_per_thread = BlockSweepScanPolicy::ITEMS_PER_THREAD; + load_policy = BlockSweepScanPolicy::LOAD_ALGORITHM; + store_policy = BlockSweepScanPolicy::STORE_ALGORITHM; + scan_algorithm = BlockSweepScanPolicy::SCAN_ALGORITHM; + + tile_size = block_threads * items_per_thread; + } + + __host__ __device__ __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_policy, + scan_algorithm); + } + + }; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + + /// Specializations of tuned policy types for different PTX architectures + template < + typename T, + typename SizeT, + int ARCH> + struct TunedPolicies; + + /// SM35 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 350> + { + typedef BlockSweepScanPolicy<128, 16, BLOCK_LOAD_DIRECT, false, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, true, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy; + }; + + /// SM30 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 300> + { + typedef BlockSweepScanPolicy<256, 9, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy; + }; + + /// SM20 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 200> + { + typedef BlockSweepScanPolicy<128, 15, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy; + }; + + /// SM10 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 100> + { + typedef BlockSweepScanPolicy<128, 7, BLOCK_LOAD_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, false, BLOCK_SCAN_RAKING> MultiBlockPolicy; + }; + + + /// Tuning policy for the PTX architecture that DeviceReduceByKey operations will get dispatched to + template <typename T, typename SizeT> + struct PtxDefaultPolicies + { + static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? + 350 : + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? 
+ 200 : + 100; + + // Tuned policy set for the current PTX compiler pass + typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies; + + // MultiBlockPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {}; + + /** + * Initialize dispatch params with the policies corresponding to the PTX assembly we will use + */ + static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params) + { + if (ptx_version >= 350) + { + typedef TunedPolicies<T, SizeT, 350> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies<T, SizeT, 300> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(); + } + else if (ptx_version >= 200) + { + typedef TunedPolicies<T, SizeT, 200> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(); + } + else + { + typedef TunedPolicies<T, SizeT, 100> TunedPolicies; + multi_block_dispatch_params.Init<typename TunedPolicies::MultiBlockPolicy>(); + } + } + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal dispatch routine + */ + template < + typename InitScanKernelPtr, ///< Function type of cub::InitScanKernel + typename MultiBlockScanKernelPtr, ///< Function type of cub::MultiBlockScanKernel + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InitScanKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::InitScanKernel + MultiBlockScanKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockScanKernel + KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + ReductionOp reduction_op, ///< [in] Binary scan operator + Identity identity, ///< [in] Identity element + SizeT num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
+ { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + enum + { + TILE_STATUS_PADDING = 32, + }; + + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + cudaError error = cudaSuccess; + do + { + // Number of input tiles + int num_tiles = (num_items + multi_block_dispatch_params.tile_size - 1) / multi_block_dispatch_params.tile_size; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptor<T>), // bytes needed for tile status descriptors + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias temporaries (or set the necessary size of the storage allocation) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Global list of tile status + ScanTileDescriptor<T> *d_tile_status = (ScanTileDescriptor<T>*) allocations[0]; + + // Grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Get GPU id + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Log init_kernel configuration + int init_kernel_threads = 128; + int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads; + if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, init_kernel_threads, 0, stream>>>( + queue, + d_tile_status, + num_tiles); + + // Sync the stream if specified +#ifndef __CUDA_ARCH__ + if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; +#else + if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; +#endif + + // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture + int multi_sm_occupancy = CUB_MIN( + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS, + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / multi_block_dispatch_params.block_threads); + +#ifndef __CUDA_ARCH__ + + // We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties + Device device_props; + if (CubDebug(error = device_props.Init(device_ordinal))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + multi_sm_occupancy, + multi_block_kernel, + multi_block_dispatch_params.block_threads))) break; + +#endif + // Get device occupancy for multi_block_kernel + int multi_block_occupancy = multi_sm_occupancy * sm_count; + + // Get grid size for multi_block_kernel + int multi_block_grid_size = (num_tiles < multi_block_occupancy) ? 
+ num_tiles : // Not enough to fill the device with threadblocks + multi_block_occupancy; // Fill the device with threadblocks + + // Log multi_block_kernel configuration + if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy); + + // Invoke multi_block_kernel + multi_block_kernel<<<multi_block_grid_size, multi_block_dispatch_params.block_threads, 0, stream>>>( + d_in, + d_out, + d_tile_status, + reduction_op, + identity, + num_items, + queue); + + // Sync the stream if specified +#ifndef __CUDA_ARCH__ + if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; +#else + if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; +#endif + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Internal scan dispatch routine for using default tuning policies + */ + template < + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename ReductionOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + ReductionOp reduction_op, ///< [in] Binary scan operator + Identity identity, ///< [in] Identity element + SizeT num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
+ { + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Tuning polices for the PTX architecture that will get dispatched to + typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies; + typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy; + + cudaError error = cudaSuccess; + do + { + // Declare dispatch parameters + KernelDispachParams multi_block_dispatch_params; + +#ifdef __CUDA_ARCH__ + // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly + multi_block_dispatch_params.Init<MultiBlockPolicy>(); +#else + // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params); +#endif + + Dispatch( + d_temp_storage, + temp_storage_bytes, + InitScanKernel<T, SizeT>, + MultiBlockScanKernel<MultiBlockPolicy, InputIteratorRA, OutputIteratorRA, T, ReductionOp, Identity, SizeT>, + multi_block_dispatch_params, + d_in, + d_out, + reduction_op, + identity, + num_items, + stream, + stream_synchronous); + + if (CubDebug(error)) break; + } + while (0); + + return error; + } + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + + /******************************************************************//** + * Interface + *********************************************************************/ + + + /** + * \brief Computes device-wide reductions of consecutive values whose corresponding keys are equal. + * + * The resulting output lists of value-aggregates and their corresponding keys are compacted. + * + * \devicestorage + * + * \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type) + * \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type) + * \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type) + * \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type) + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA + */ + template < + typename KeyInputIteratorRA, + typename KeyOutputIteratorRA, + typename ValueInputIteratorRA, + typename ValueOutputIteratorRA, + typename ReductionOp> + __host__ __device__ __forceinline__ + static cudaError_t ReduceValues( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + KeyInputIteratorRA d_keys_in, ///< [in] Key input data + KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted) + ValueInputIteratorRA d_values_in, ///< [in] Value input data + ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted) + int num_items, ///< [in] Total number of input pairs + ReductionOp reduction_op, ///< [in] Binary value reduction operator + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 
+        bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+    {
+        return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, num_items, stream, stream_synchronous);
+    }
+
+
+    /**
+     * \brief Computes device-wide sums of consecutive values whose corresponding keys are equal.
+     *
+     * The resulting output lists of value-aggregates and their corresponding keys are compacted.
+     *
+     * \devicestorage
+     *
+     * \tparam KeyInputIteratorRA      <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type)
+     * \tparam KeyOutputIteratorRA     <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type)
+     * \tparam ValueInputIteratorRA    <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type)
+     * \tparam ValueOutputIteratorRA   <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type)
+     */
+    template <
+        typename KeyInputIteratorRA,
+        typename KeyOutputIteratorRA,
+        typename ValueInputIteratorRA,
+        typename ValueOutputIteratorRA>
+    __host__ __device__ __forceinline__
+    static cudaError_t SumValues(
+        void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation.
+        KeyInputIteratorRA d_keys_in, ///< [in] Key input data
+        KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted)
+        ValueInputIteratorRA d_values_in, ///< [in] Value input data
+        ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted)
+        int num_items, ///< [in] Total number of input pairs
+        cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
+        bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+    {
+        // Arguments follow ReduceValues' parameter order (num_items precedes the reduction operator)
+        return ReduceValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, cub::Sum(), stream, stream_synchronous);
+    }
+
+
+    /**
+     * \brief Computes the "run-length" of each group of consecutive, equal-valued keys.
+     *
+     * The resulting output lists of run-length counts and their corresponding keys are compacted.
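+     *
+     * \par
+     * The snippet below sketches the intended calling pattern for computing the run
+     * lengths of duplicate keys (illustrative variable names):
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     * ...
+     *
+     * // Device pointers for the keys input and the compacted outputs
+     * int *d_keys_in, *d_unique_keys_out, *d_counts_out;
+     * int num_items = ...
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceReduceByKey::RunLengths(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_unique_keys_out, d_counts_out, num_items);
+     *
+     * // Allocate temporary storage and compute the run lengths
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     * cub::DeviceReduceByKey::RunLengths(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_unique_keys_out, d_counts_out, num_items);
+     * \endcode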
+ * + * \devicestorage + * + * \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type) + * \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type) + * \tparam CountOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for output of key-counts whose value type must be convertible to an integer type (may be a simple pointer type) + */ + template < + typename KeyInputIteratorRA, + typename KeyOutputIteratorRA, + typename CountOutputIteratorRA> + __host__ __device__ __forceinline__ + static cudaError_t RunLengths( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + KeyInputIteratorRA d_keys_in, ///< [in] Key input data + KeyOutputIteratorRA d_keys_out, ///< [in] Key output data (compacted) + CountOutputIteratorRA d_counts_out, ///< [in] Run-length counts output data (compacted) + int num_items, ///< [in] Total number of keys + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef typename std::iterator_traits<CountOutputIteratorRA>::value_type CountT; + return SumValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, ConstantIteratorRA<CountT>(1), d_counts_out, num_items, stream, stream_synchronous); + } + + + /** + * \brief Removes duplicates within each group of consecutive, equal-valued keys. Only the first key from each group (and corresponding value) is kept. + * + * The resulting keys are compacted. + * + * \devicestorage + * + * \tparam KeyInputIteratorRA <b>[inferred]</b> Random-access input iterator type for keys input (may be a simple pointer type) + * \tparam KeyOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for keys output (may be a simple pointer type) + * \tparam ValueInputIteratorRA <b>[inferred]</b> Random-access input iterator type for values input (may be a simple pointer type) + * \tparam ValueOutputIteratorRA <b>[inferred]</b> Random-access output iterator type for values output (may be a simple pointer type) + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>, where \p T is the value type of \p ValueInputIteratorRA + */ + template < + typename KeyInputIteratorRA, + typename KeyOutputIteratorRA, + typename ValueInputIteratorRA, + typename ValueOutputIteratorRA, + typename ReductionOp> + __host__ __device__ __forceinline__ + static cudaError_t Unique( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
+ KeyInputIteratorRA d_keys_in, ///< [in] Key input data + KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted) + ValueInputIteratorRA d_values_in, ///< [in] Value input data + ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted) + int num_items, ///< [in] Total number of input pairs + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, num_items, stream, stream_synchronous); + } + + + +}; + + +/** @} */ // DeviceModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/device/device_reorder.cuh b/lib/kokkos/TPL/cub/device/device_reorder.cuh new file mode 100644 index 000000000..cba3bb48f --- /dev/null +++ b/lib/kokkos/TPL/cub/device/device_reorder.cuh @@ -0,0 +1,550 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory. 
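+ *
+ * Partitioning is driven by a caller-supplied unary predicate with member
+ * <tt>bool operator()(const T &val)</tt>; items for which the predicate returns
+ * \p true belong to the first partition, whose length is recorded.  For example,
+ * a predicate selecting non-negative integers might look like the following
+ * (an illustrative sketch):
+ *
+ * \code
+ * struct NonNegative
+ * {
+ *     __host__ __device__ __forceinline__
+ *     bool operator()(const int &val) const { return val >= 0; }
+ * };
+ * \endcode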
+ */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "device_scan.cuh" +#include "block/block_partition_tiles.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_debug.cuh" +#include "../util_device.cuh" +#include "../util_vector.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Partition kernel entry point (multi-block) + */ +template < + typename BlockPartitionTilesPolicy, ///< Tuning policy for cub::BlockPartitionTiles abstraction + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type) + typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt> + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockPartitionTilesPolicy::BLOCK_THREADS)) +__global__ void PartitionKernel( + InputIteratorRA d_in, ///< Input data + OutputIteratorRA d_out, ///< Output data + LengthOutputIterator d_partition_length, ///< Number of items in the first partition + ScanTileDescriptor<PartitionScanTuple<SizeT, BlockPartitionTilesPolicy::PARTITOINS> > *d_tile_status, ///< Global list of tile status + PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition + SizeT num_items, ///< Total number of input items for the entire problem + int num_tiles, ///< Totla number of intut tiles for the entire problem + GridQueue<int> queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks +{ + enum + { + TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, + }; + + typedef PartitionScanTuple<SizeT, BlockPartitionTilesPolicy::PARTITOINS> PartitionScanTuple; + + // Thread block type for scanning input tiles + typedef BlockPartitionTiles< + BlockPartitionTilesPolicy, + InputIteratorRA, + OutputIteratorRA, + PredicateOp, + SizeT> BlockPartitionTilesT; + + // Shared memory for BlockPartitionTiles + __shared__ typename BlockPartitionTilesT::TempStorage temp_storage; + + // Process tiles + PartitionScanTuple partition_ends; // Ending offsets for partitions (one-after) + bool is_last_tile; // Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input) + BlockPartitionTilesT(temp_storage, d_in, d_out, d_tile_status + TILE_STATUS_PADDING, pred_op, num_items).ConsumeTiles( + queue, + num_tiles, + partition_ends, + is_last_tile); + + // Record the length of the first partition + if (is_last_tile && (threadIdx.x == 0)) + { + *d_partition_length = partition_ends.x; + } +} + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * DeviceReorder + *****************************************************************************/ + +/** + * \addtogroup DeviceModule + * @{ + */ + +/** + * \brief DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within 
global memory + */ +struct DeviceReorder +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockPartitionTilesPolicy. + struct KernelDispachParams + { + int block_threads; + int items_per_thread; + BlockScanAlgorithm scan_algorithm; + int tile_size; + + template <typename BlockPartitionTilesPolicy> + __host__ __device__ __forceinline__ + void Init() + { + block_threads = BlockPartitionTilesPolicy::BLOCK_THREADS; + items_per_thread = BlockPartitionTilesPolicy::ITEMS_PER_THREAD; + scan_algorithm = BlockPartitionTilesPolicy::SCAN_ALGORITHM; + tile_size = block_threads * items_per_thread; + } + }; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + + /// Specializations of tuned policy types for different PTX architectures + template < + int PARTITIONS, + typename T, + typename SizeT, + int ARCH> + struct TunedPolicies; + + /// SM35 tune + template <int PARTITIONS, typename T, typename SizeT> + struct TunedPolicies<PARTITIONS, T, SizeT, 350> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 16, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockPartitionTilesPolicy<PARTITIONS, 128, ITEMS_PER_THREAD, LOAD_LDG, BLOCK_SCAN_RAKING_MEMOIZE> PartitionPolicy; + }; + + /// SM30 tune + template <int PARTITIONS, typename T, typename SizeT> + struct TunedPolicies<PARTITIONS, T, SizeT, 300> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockPartitionTilesPolicy<PARTITIONS, 256, ITEMS_PER_THREAD, LOAD_DEFAULT, BLOCK_SCAN_RAKING_MEMOIZE> PartitionPolicy; + }; + + /// SM20 tune + template <int PARTITIONS, typename T, typename SizeT> + struct TunedPolicies<PARTITIONS, T, SizeT, 200> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockPartitionTilesPolicy<PARTITIONS, 128, ITEMS_PER_THREAD, LOAD_DEFAULT, BLOCK_SCAN_RAKING_MEMOIZE> PartitionPolicy; + }; + + /// SM10 tune + template <int PARTITIONS, typename T, typename SizeT> + struct TunedPolicies<PARTITIONS, T, SizeT, 100> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + typedef BlockPartitionTilesPolicy<PARTITIONS, 128, ITEMS_PER_THREAD, LOAD_DEFAULT, BLOCK_SCAN_RAKING> PartitionPolicy; + }; + + + /// Tuning policy for the PTX architecture that DevicePartition operations will get dispatched to + template <int PARTITIONS, typename T, typename SizeT> + struct PtxDefaultPolicies + { + static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? + 350 : + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? 
+ 200 : + 100; + + // Tuned policy set for the current PTX compiler pass + typedef TunedPolicies<PARTITIONS, T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies; + + // PartitionPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct PartitionPolicy : PtxTunedPolicies::PartitionPolicy {}; + + /** + * Initialize dispatch params with the policies corresponding to the PTX assembly we will use + */ + static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params) + { + if (ptx_version >= 350) + { + typedef TunedPolicies<PARTITIONS, T, SizeT, 350> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>(); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies<PARTITIONS, T, SizeT, 300> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>(); + } + else if (ptx_version >= 200) + { + typedef TunedPolicies<PARTITIONS, T, SizeT, 200> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>(); + } + else + { + typedef TunedPolicies<PARTITIONS, T, SizeT, 100> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::PartitionPolicy>(); + } + } + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal dispatch routine + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename PartitionKernelPtr, ///< Function type of cub::PartitionKernel + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type) + typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt> + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + int ptx_version, ///< [in] PTX version + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionInitKernel + PartitionKernelPtr partition_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionKernel + KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p partition_kernel was compiled for + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded + PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition + SizeT num_items, ///< [in] Total number of items to partition + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 
+ bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + enum + { + TILE_STATUS_PADDING = 32, + }; + + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Scan tuple type and tile status descriptor type + typedef typename VectorHelper<SizeT, 2>::Type ScanTuple; + typedef ScanTileDescriptor<ScanTuple> ScanTileDescriptorT; + + cudaError error = cudaSuccess; + do + { + // Number of input tiles + int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT), // bytes needed for tile status descriptors + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias temporaries (or set the necessary size of the storage allocation) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Global list of tile status + ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0]; + + // Grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Log init_kernel configuration + int init_kernel_threads = 128; + int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads; + if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, init_kernel_threads, 0, stream>>>( + queue, + d_tile_status, + num_tiles); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get grid size for multi-block kernel + int scan_grid_size; + int multi_sm_occupancy = -1; + if (ptx_version < 200) + { + // We don't have atomics (or don't have fast ones), so just assign one + // block per tile (limited to 65K tiles) + scan_grid_size = num_tiles; + } + else + { + // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor. 
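+                // The launch is sized to roughly fill the device: an occupancy estimate
+                // (resident thread blocks per SM) is multiplied by the SM count, the grid
+                // is capped at that many thread blocks, and any remaining tiles are
+                // consumed dynamically through the grid queue descriptor.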
+ // Get GPU id + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get a rough estimate of partition_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture + multi_sm_occupancy = CUB_MIN( + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS, + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads); + +#ifndef __CUDA_ARCH__ + // We're on the host, so come up with a + Device device_props; + if (CubDebug(error = device_props.Init(device_ordinal))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + multi_sm_occupancy, + partition_kernel, + scan_dispatch_params.block_threads))) break; +#endif + // Get device occupancy for partition_kernel + int scan_occupancy = multi_sm_occupancy * sm_count; + + // Get grid size for partition_kernel + scan_grid_size = (num_tiles < scan_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + scan_occupancy; // Fill the device with threadblocks + } + + // Log partition_kernel configuration + if (stream_synchronous) CubLog("Invoking partition_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy); + + // Invoke partition_kernel + partition_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>( + d_in, + d_out, + d_partition_length, + d_tile_status, + pred_op, + num_items, + num_tiles, + queue); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Internal partition dispatch routine for using default tuning policies + */ + template < + typename PARTITIONS, ///< Number of partitions we are keeping + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type) + typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt> + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to input items + OutputIteratorRA d_out, ///< [in] Iterator pointing to output items + LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded + PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition + SizeT num_items, ///< [in] Total number of items to partition + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 
+ bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Tuning polices + typedef PtxDefaultPolicies<PARTITIONS, T, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies + typedef typename PtxDefaultPolicies::PartitionPolicy PartitionPolicy; // Partition kernel policy + + cudaError error = cudaSuccess; + do + { + // Declare dispatch parameters + KernelDispachParams scan_dispatch_params; + + int ptx_version; +#ifdef __CUDA_ARCH__ + // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly + scan_dispatch_params.Init<PartitionPolicy>(); + ptx_version = CUB_PTX_ARCH; +#else + // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version + if (CubDebug(error = PtxVersion(ptx_version))) break; + PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params); +#endif + + Dispatch( + ptx_version, + d_temp_storage, + temp_storage_bytes, + ScanInitKernel<T, SizeT>, + PartitionKernel<PartitionPolicy, InputIteratorRA, OutputIteratorRA, LengthOutputIterator, PredicateOp, SizeT>, + scan_dispatch_params, + d_in, + d_out, + d_partition_length, + pred_op, + num_items, + stream, + stream_synchronous); + + if (CubDebug(error)) break; + } + while (0); + + return error; + } + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Splits a list of input items into two partitions within the given output list using the specified predicate. The relative ordering of inputs is not necessarily preserved. + * + * An item \p val is placed in the first partition if <tt>pred_op(val) == true</tt>, otherwise + * it is placed in the second partition. The offset of the partitioning pivot (equivalent to + * the total length of the first partition as well as the starting offset of the second), is + * recorded to \p d_partition_length. + * + * The length of the output referenced by \p d_out is assumed to be the same as that of \p d_in. + * + * \devicestorage + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + * \tparam LengthOutputIterator <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + * \tparam PredicateOp <b>[inferred]</b> Unary predicate operator indicating membership in the first partition type having member <tt>bool operator()(const T &val)</tt> + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA, + typename LengthOutputIterator, + typename PredicateOp> + __host__ __device__ __forceinline__ + static cudaError_t Partition( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
+ InputIteratorRA d_in, ///< [in] Iterator pointing to input items + OutputIteratorRA d_out, ///< [in] Iterator pointing to output items + LengthOutputIterator d_pivot_offset, ///< [out] Output iterator referencing the location where the pivot offset is to be recorded + PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition + int num_items, ///< [in] Total number of items to partition + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), T(), num_items, stream, stream_synchronous); + } + + +}; + + +/** @} */ // DeviceModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/device/device_scan.cuh b/lib/kokkos/TPL/cub/device/device_scan.cuh new file mode 100644 index 000000000..c0640c857 --- /dev/null +++ b/lib/kokkos/TPL/cub/device/device_scan.cuh @@ -0,0 +1,812 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. 
+ */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "block/block_scan_tiles.cuh" +#include "../thread/thread_operators.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_debug.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename T, ///< Scan value type + typename SizeT> ///< Integer type used for global array indexing +__global__ void ScanInitKernel( + GridQueue<SizeT> grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks + ScanTileDescriptor<T> *d_tile_status, ///< [out] Tile status words + int num_tiles) ///< [in] Number of tiles +{ + typedef ScanTileDescriptor<T> ScanTileDescriptorT; + + enum + { + TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, + }; + + // Reset queue descriptor + if ((blockIdx.x == 0) && (threadIdx.x == 0)) grid_queue.ResetDrain(num_tiles); + + // Initialize tile status + int tile_offset = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_offset < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_offset].status = SCAN_TILE_INVALID; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x].status = SCAN_TILE_OOB; + } +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename T, ///< The scan data type + typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename SizeT> ///< Integer type used for global array indexing +__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS)) +__global__ void ScanKernel( + InputIteratorRA d_in, ///< Input data + OutputIteratorRA d_out, ///< Output data + ScanTileDescriptor<T> *d_tile_status, ///< Global list of tile status + ScanOp scan_op, ///< Binary scan operator + Identity identity, ///< Identity element + SizeT num_items, ///< Total number of scan items for the entire problem + GridQueue<int> queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks +{ + enum + { + TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, + }; + + // Thread block type for scanning input tiles + typedef BlockScanTiles< + BlockScanTilesPolicy, + InputIteratorRA, + OutputIteratorRA, + ScanOp, + Identity, + SizeT> BlockScanTilesT; + + // Shared memory for BlockScanTiles + __shared__ typename BlockScanTilesT::TempStorage temp_storage; + + // Process tiles + BlockScanTilesT(temp_storage, d_in, d_out, scan_op, identity).ConsumeTiles( + num_items, + queue, + d_tile_status + TILE_STATUS_PADDING); +} + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * DeviceScan + 
*****************************************************************************/ + +/** + * \brief DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory.  + * \ingroup DeviceModule + * + * \par Overview + * Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. <em>Prefix sum</em> + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input. + * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into + * the <em>i</em><sup>th</sup> output reduction. + * + * \par Usage Considerations + * \cdp_class{DeviceScan} + * + * \par Performance + * + * \image html scan_perf.png + * + */ +struct DeviceScan +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockScanTilesPolicy. + struct KernelDispachParams + { + // Policy fields + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + BlockStoreAlgorithm store_policy; + BlockScanAlgorithm scan_algorithm; + + // Other misc + int tile_size; + + template <typename BlockScanTilesPolicy> + __host__ __device__ __forceinline__ + void Init() + { + block_threads = BlockScanTilesPolicy::BLOCK_THREADS; + items_per_thread = BlockScanTilesPolicy::ITEMS_PER_THREAD; + load_policy = BlockScanTilesPolicy::LOAD_ALGORITHM; + store_policy = BlockScanTilesPolicy::STORE_ALGORITHM; + scan_algorithm = BlockScanTilesPolicy::SCAN_ALGORITHM; + + tile_size = block_threads * items_per_thread; + } + + __host__ __device__ __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_policy, + scan_algorithm); + } + + }; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + + /// Specializations of tuned policy types for different PTX architectures + template < + typename T, + typename SizeT, + int ARCH> + struct TunedPolicies; + + /// SM35 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 350> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 16, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // ScanPolicy: GTX Titan: 29.1B items/s (232.4 GB/s) @ 48M 32-bit T + typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, false, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, true, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + }; + + /// SM30 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 300> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockScanTilesPolicy<256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, 
BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + }; + + /// SM20 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 200> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // ScanPolicy: GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + }; + + /// SM10 tune + template <typename T, typename SizeT> + struct TunedPolicies<T, SizeT, 100> + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, false, BLOCK_SCAN_RAKING> ScanPolicy; + }; + + + /// Tuning policy for the PTX architecture that DeviceScan operations will get dispatched to + template <typename T, typename SizeT> + struct PtxDefaultPolicies + { + static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? + 350 : + (CUB_PTX_ARCH >= 300) ? + 300 : + (CUB_PTX_ARCH >= 200) ? + 200 : + 100; + + // Tuned policy set for the current PTX compiler pass + typedef TunedPolicies<T, SizeT, PTX_TUNE_ARCH> PtxTunedPolicies; + + // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass + struct ScanPolicy : PtxTunedPolicies::ScanPolicy {}; + + /** + * Initialize dispatch params with the policies corresponding to the PTX assembly we will use + */ + static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params) + { + if (ptx_version >= 350) + { + typedef TunedPolicies<T, SizeT, 350> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>(); + } + else if (ptx_version >= 300) + { + typedef TunedPolicies<T, SizeT, 300> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>(); + } + else if (ptx_version >= 200) + { + typedef TunedPolicies<T, SizeT, 200> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>(); + } + else + { + typedef TunedPolicies<T, SizeT, 100> TunedPolicies; + scan_dispatch_params.Init<typename TunedPolicies::ScanPolicy>(); + } + } + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal dispatch routine + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename ScanKernelPtr, ///< Function type of cub::ScanKernel + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + int ptx_version, ///< [in] PTX version + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanKernel + KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + ScanOp scan_op, ///< [in] Binary scan operator + Identity identity, ///< [in] Identity element + SizeT num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + enum + { + TILE_STATUS_PADDING = 32, + INIT_KERNEL_THREADS = 128 + }; + + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Tile status descriptor type + typedef ScanTileDescriptor<T> ScanTileDescriptorT; + + cudaError error = cudaSuccess; + do + { + // Number of input tiles + int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT), // bytes needed for tile status descriptors + GridQueue<int>::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias temporaries (or set the necessary size of the storage allocation) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Global list of tile status + ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0]; + + // Grid queue descriptor + GridQueue<int> queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>( + queue, + d_tile_status, + num_tiles); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get grid size for multi-block kernel + int scan_grid_size; + int multi_sm_occupancy = -1; + if (ptx_version < 200) + { + // We don't have atomics (or don't have fast ones), so just assign one + // block per tile (limited to 65K tiles) + scan_grid_size = num_tiles; + } + else + { + // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor. 
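+                // For example, with an occupancy estimate of 8 thread blocks per SM on a
+                // 15-SM device, at most 8 * 15 = 120 thread blocks are launched; each one
+                // then keeps dequeuing tiles from the grid queue until the input is exhausted.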
+ // Get GPU id + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get a rough estimate of scan_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture + multi_sm_occupancy = CUB_MIN( + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADBLOCKS, + ArchProps<CUB_PTX_ARCH>::MAX_SM_THREADS / scan_dispatch_params.block_threads); + +#ifndef __CUDA_ARCH__ + // We're on the host, so come up with a + Device device_props; + if (CubDebug(error = device_props.Init(device_ordinal))) break; + + if (CubDebug(error = device_props.MaxSmOccupancy( + multi_sm_occupancy, + scan_kernel, + scan_dispatch_params.block_threads))) break; +#endif + // Get device occupancy for scan_kernel + int scan_occupancy = multi_sm_occupancy * sm_count; + + // Get grid size for scan_kernel + scan_grid_size = (num_tiles < scan_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + scan_occupancy; // Fill the device with threadblocks + } + + // Log scan_kernel configuration + if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy); + + // Invoke scan_kernel + scan_kernel<<<scan_grid_size, scan_dispatch_params.block_threads, 0, stream>>>( + d_in, + d_out, + d_tile_status, + scan_op, + identity, + num_items, + queue); + + // Sync the stream if specified + if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Internal scan dispatch routine for using default tuning policies + */ + template < + typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) + typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) + typename ScanOp, ///< Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename SizeT> ///< Integer type used for global array indexing + __host__ __device__ __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + ScanOp scan_op, ///< [in] Binary scan operator + Identity identity, ///< [in] Identity element + SizeT num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
+ { + // Data type + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + + // Tuning polices + typedef PtxDefaultPolicies<T, SizeT> PtxDefaultPolicies; // Wrapper of default kernel policies + typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy + + cudaError error = cudaSuccess; + do + { + // Declare dispatch parameters + KernelDispachParams scan_dispatch_params; + + int ptx_version; +#ifdef __CUDA_ARCH__ + // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly + scan_dispatch_params.Init<ScanPolicy>(); + ptx_version = CUB_PTX_ARCH; +#else + // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version + if (CubDebug(error = PtxVersion(ptx_version))) break; + PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params); +#endif + + Dispatch( + ptx_version, + d_temp_storage, + temp_storage_bytes, + ScanInitKernel<T, SizeT>, + ScanKernel<ScanPolicy, InputIteratorRA, OutputIteratorRA, T, ScanOp, Identity, SizeT>, + scan_dispatch_params, + d_in, + d_out, + scan_op, + identity, + num_items, + stream, + stream_synchronous); + + if (CubDebug(error)) break; + } + while (0); + + return error; + } + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * \brief Computes a device-wide exclusive prefix sum. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the exclusive prefix sum of a device vector of \p int items. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input and output + * int *d_scan_input, *d_scan_output; + * int num_items = ... + * + * ... + * + * // Determine temporary device storage requirements for exclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); + * + * // Allocate temporary storage for exclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); + * + * \endcode + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA> + __host__ __device__ __forceinline__ + static cudaError_t ExclusiveSum( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + int num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. 
+ bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef typename std::iterator_traits<InputIteratorRA>::value_type T; + return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), T(), num_items, stream, stream_synchronous); + } + + + /** + * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * Supports non-commutative scan operators. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the exclusive prefix scan of a device vector of \p int items. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input and output + * int *d_scan_input, *d_scan_output; + * int num_items = ... + * + * ... + * + * // Determine temporary device storage requirements for exclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) MIN_INT, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix scan (max) + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) MIN_INT, num_items); + * + * \endcode + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam Identity <b>[inferred]</b> Type of the \p identity value used Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA, + typename ScanOp, + typename Identity> + __host__ __device__ __forceinline__ + static cudaError_t ExclusiveScan( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + ScanOp scan_op, ///< [in] Binary scan operator + Identity identity, ///< [in] Identity element + int num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, identity, num_items, stream, stream_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. 
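+     *
+     * \par
+     * For example, an inclusive prefix sum over the input <tt>[8, 6, 7, 5]</tt>
+     * produces <tt>[8, 14, 21, 26]</tt>, whereas the corresponding exclusive prefix
+     * sum produces <tt>[0, 8, 14, 21]</tt>.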
+ * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the inclusive prefix sum of a device vector of \p int items. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input and output + * int *d_scan_input, *d_scan_output; + * int num_items = ... + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); + * + * \endcode + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA> + __host__ __device__ __forceinline__ + static cudaError_t InclusiveSum( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + int num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream, stream_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * Supports non-commutative scan operators. + * + * \devicestorage + * + * \cdp + * + * \iterator + * + * \par + * The code snippet below illustrates the inclusive prefix scan of a device vector of \p int items. + * \par + * \code + * #include <cub/cub.cuh> + * ... + * + * // Declare and initialize device pointers for input and output + * int *d_scan_input, *d_scan_output; + * int num_items = ... + * ... 
+ * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix scan (max) + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items); + * + * \endcode + * + * \tparam InputIteratorRA <b>[inferred]</b> Random-access iterator type for input (may be a simple pointer type) + * \tparam OutputIteratorRA <b>[inferred]</b> Random-access iterator type for output (may be a simple pointer type) + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + typename InputIteratorRA, + typename OutputIteratorRA, + typename ScanOp> + __host__ __device__ __forceinline__ + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. + InputIteratorRA d_in, ///< [in] Iterator pointing to scan input + OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output + ScanOp scan_op, ///< [in] Binary scan operator + int num_items, ///< [in] Total number of items to scan + cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>. + bool stream_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream, stream_synchronous); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/grid/grid_barrier.cuh b/lib/kokkos/TPL/cub/grid/grid_barrier.cuh new file mode 100644 index 000000000..ebdc4b552 --- /dev/null +++ b/lib/kokkos/TPL/cub/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with its sync counter + __threadfence(); + __syncthreads(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + __syncthreads(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + __syncthreads(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + __syncthreads(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. + * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. 
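+ *
+ * A minimal usage sketch is shown below (illustrative only: the kernel, its
+ * launch configuration, and the phase markers are hypothetical, and the grid
+ * must be small enough for all of its thread blocks to be co-resident on the
+ * device, since the barrier spin-waits):
+ *
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void TwoPhaseKernel(cub::GridBarrier barrier)
+ * {
+ *     // ... phase 1 work ...
+ *
+ *     barrier.Sync();     // wait until every thread block has finished phase 1
+ *
+ *     // ... phase 2 work ...
+ * }
+ *
+ * // Host side: lazily allocate/zero the sync counters, then launch
+ * int grid_size = 64;     // must not exceed the device's resident block capacity
+ * cub::GridBarrierLifetime global_barrier;
+ * global_barrier.Setup(grid_size);
+ * TwoPhaseKernel<<<grid_size, 128>>>(global_barrier);
+ * \endcode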
+ */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/grid/grid_even_share.cuh b/lib/kokkos/TPL/cub/grid/grid_even_share.cuh new file mode 100644 index 000000000..defe9e0a6 --- /dev/null +++ b/lib/kokkos/TPL/cub/grid/grid_even_share.cuh @@ -0,0 +1,197 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. 
Each threadblock gets roughly the same number of fixed-size work units (grains). + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + * + * \par Overview + * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. + * Threadblocks may receive one of three different amounts of work: "big", "normal", + * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit + * for the last threadblock may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct and initialize an instance of + * GridEvenShare using \p GridInit(). The instance can be passed to child threadblocks which can + * initialize their per-threadblock offsets using \p BlockInit(). + * + * \tparam SizeT Integer type for array indexing + */ +template <typename SizeT> +class GridEvenShare +{ +private: + + SizeT total_grains; + int big_blocks; + SizeT big_share; + SizeT normal_share; + SizeT normal_base_offset; + + +public: + + /// Total number of input items + SizeT num_items; + + /// Grid size in threadblocks + int grid_size; + + /// Offset into input marking the beginning of the owning thread block's segment of input tiles + SizeT block_offset; + + /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles + SizeT block_oob; + + /** + * \brief Block-based constructor for single-block grids. + */ + __device__ __forceinline__ GridEvenShare(SizeT num_items) : + num_items(num_items), + grid_size(1), + block_offset(0), + block_oob(num_items) {} + + + /** + * \brief Default constructor. Zero-initializes block-specific fields. + */ + __host__ __device__ __forceinline__ GridEvenShare() : + num_items(0), + grid_size(0), + block_offset(0), + block_oob(0) {} + + + /** + * \brief Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) + */ + __host__ __device__ __forceinline__ void GridInit( + SizeT num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. Usually the thread block's native tile size (or a multiple thereof. 
+ { + this->num_items = num_items; + this->block_offset = 0; + this->block_oob = 0; + this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; + this->grid_size = CUB_MIN(total_grains, max_grid_size); + SizeT grains_per_block = total_grains / grid_size; + this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks + this->normal_share = grains_per_block * schedule_granularity; + this->normal_base_offset = big_blocks * schedule_granularity; + this->big_share = normal_share + schedule_granularity; + } + + + /** + * \brief Initializes the threadblock-specific details (e.g., to be called by each threadblock after startup) + */ + __device__ __forceinline__ void BlockInit() + { + if (blockIdx.x < big_blocks) + { + // This threadblock gets a big share of grains (grains_per_block + 1) + block_offset = (blockIdx.x * big_share); + block_oob = block_offset + big_share; + } + else if (blockIdx.x < total_grains) + { + // This threadblock gets a normal share of grains (grains_per_block) + block_offset = normal_base_offset + (blockIdx.x * normal_share); + block_oob = block_offset + normal_share; + } + + // Last threadblock + if (blockIdx.x == grid_size - 1) + { + block_oob = num_items; + } + } + + + /** + * Print to stdout + */ + __host__ __device__ __forceinline__ void Print() + { + printf( +#ifdef __CUDA_ARCH__ + "\tthreadblock(%d) " + "block_offset(%lu) " + "block_oob(%lu) " +#endif + "num_items(%lu) " + "total_grains(%lu) " + "big_blocks(%lu) " + "big_share(%lu) " + "normal_share(%lu)\n", +#ifdef __CUDA_ARCH__ + blockIdx.x, + (unsigned long) block_offset, + (unsigned long) block_oob, +#endif + (unsigned long) num_items, + (unsigned long) total_grains, + (unsigned long) big_blocks, + (unsigned long) big_share, + (unsigned long) normal_share); + } +}; + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/grid/grid_mapping.cuh b/lib/kokkos/TPL/cub/grid/grid_mapping.cuh new file mode 100644 index 000000000..419f9ac0e --- /dev/null +++ b/lib/kokkos/TPL/cub/grid/grid_mapping.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An "even-share" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements + * in tile-size increments. + */ + GRID_MAPPING_EVEN_SHARE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. + */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/grid/grid_queue.cuh b/lib/kokkos/TPL/cub/grid/grid_queue.cuh new file mode 100644 index 000000000..009260d87 --- /dev/null +++ b/lib/kokkos/TPL/cub/grid/grid_queue.cuh @@ -0,0 +1,207 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_debug.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
+ *
+ * \par Overview
+ * GridQueue descriptors provide abstractions for "filling" or
+ * "draining" globally-shared vectors.
+ *
+ * \par
+ * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
+ * returning a unique offset for the calling thread to write its items.
+ * The GridQueue maintains the total "fill-size".  The fill counter must be reset
+ * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
+ * will be filling.
+ *
+ * \par
+ * Similarly, a "draining" GridQueue works by atomically-incrementing a
+ * zero-initialized counter, returning a unique offset for the calling thread to
+ * read its items.  Threads can safely drain until the array's logical fill-size is
+ * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
+ * GridQueue::ResetDrainAfterFill by the host or kernel instance prior to the kernel instance that
+ * will be draining.  (For dynamic work distribution of existing data, the corresponding fill-size
+ * is simply the number of elements in the array.)
+ *
+ * \par
+ * Iterative work management can be implemented simply with a pair of flip-flopping
+ * work buffers, each with an associated set of fill and drain GridQueue descriptors.
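+ *
+ * \par
+ * A minimal draining sketch: each thread block repeatedly dequeues a
+ * fixed-size tile of offsets until the queue is exhausted.  The names
+ * \p TILE_SIZE and \p d_in are illustrative placeholders, and the host
+ * is assumed to have sized the queue with ResetDrain() beforehand.
+ * \code
+ * template <int TILE_SIZE>
+ * __global__ void DrainKernel(GridQueue<int> queue, int *d_in, int num_items)
+ * {
+ *     __shared__ int tile_offset;
+ *     while (true)
+ *     {
+ *         // One thread per block grabs the starting offset of the next tile
+ *         if (threadIdx.x == 0) tile_offset = queue.Drain(TILE_SIZE);
+ *         __syncthreads();
+ *
+ *         if (tile_offset >= num_items) break;    // Queue exhausted
+ *
+ *         // ... cooperatively process d_in[tile_offset .. tile_offset + TILE_SIZE) ...
+ *
+ *         __syncthreads();                        // Done reading tile_offset
+ *     }
+ * }
+ * \endcode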
+ * + * \tparam SizeT Integer type for array indexing + */ +template <typename SizeT> +class GridQueue +{ +private: + + /// Counter indices + enum + { + FILL = 0, + DRAIN = 1, + }; + + /// Pair of counters + SizeT *d_counters; + +public: + + /// Returns the device allocation size in bytes needed to construct a GridQueue instance + __host__ __device__ __forceinline__ + static size_t AllocationSize() + { + return sizeof(SizeT) * 2; + } + + + /// Constructs an invalid GridQueue descriptor around the device storage allocation + __host__ __device__ __forceinline__ GridQueue( + void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as <tt>AllocationSize()</tt>. + : + d_counters((SizeT*) d_storage) + {} + + + /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrainAfterFill(cudaStream_t stream = 0) + { +#ifdef __CUDA_ARCH__ + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + return ResetDrain(0, stream); +#endif + } + + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrain( + SizeT fill_size, + cudaStream_t stream = 0) + { +#ifdef __CUDA_ARCH__ + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + SizeT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(SizeT) * 2, cudaMemcpyHostToDevice, stream)); +#endif + } + + + /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + __host__ __device__ __forceinline__ cudaError_t ResetFill() + { +#ifdef __CUDA_ARCH__ + d_counters[FILL] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(SizeT))); +#endif + } + + + /// Returns the fill-size established by the parent or by the previous kernel. + __host__ __device__ __forceinline__ cudaError_t FillSize( + SizeT &fill_size, + cudaStream_t stream = 0) + { +#ifdef __CUDA_ARCH__ + fill_size = d_counters[FILL]; +#else + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(SizeT), cudaMemcpyDeviceToHost, stream)); +#endif + } + + + /// Drain num_items. Returns offset from which to read items. + __device__ __forceinline__ SizeT Drain(SizeT num_items) + { + return atomicAdd(d_counters + DRAIN, num_items); + } + + + /// Fill num_items. Returns offset from which to write items. 
+ __device__ __forceinline__ SizeT Fill(SizeT num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template <typename SizeT> +__global__ void ResetDrainKernel( + GridQueue<SizeT> grid_queue, + SizeT num_items) +{ + grid_queue.ResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/lib/kokkos/TPL/cub/host/spinlock.cuh b/lib/kokkos/TPL/cub/host/spinlock.cuh new file mode 100644 index 000000000..5621b6f1a --- /dev/null +++ b/lib/kokkos/TPL/cub/host/spinlock.cuh @@ -0,0 +1,123 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) + */ + + +#pragma once + +#if defined(_WIN32) || defined(_WIN64) + #include <intrin.h> + #include <windows.h> + #undef small // Windows is terrible for polluting macro namespace + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + +#endif + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +#if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + +#else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { +#ifndef __arm__ + asm volatile("pause\n": : :"memory"); +#endif // __arm__ + } + +#endif // defined(_MSC_VER) + +/** + * Return when the specified spinlock has been acquired + */ +__forceinline__ void Lock(volatile Spinlock *lock) +{ + while (1) + { + if (!_InterlockedExchange(lock, 1)) return; + while (*lock) YieldProcessor(); + } +} + + +/** + * Release the specified spinlock + */ +__forceinline__ void Unlock(volatile Spinlock *lock) +{ + _ReadWriteBarrier(); + *lock = 0; +} + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/lib/kokkos/TPL/cub/thread/thread_load.cuh b/lib/kokkos/TPL/cub/thread/thread_load.cuh new file mode 100644 index 000000000..ee112b9d5 --- /dev/null +++ b/lib/kokkos/TPL/cub/thread/thread_load.cuh @@ -0,0 +1,429 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include <cuda.h> + +#include <iterator> + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup IoModule + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of PTX cache-modifiers for memory load operations. + */ +enum PtxLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Simple I/O + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::PtxLoadModifier cache modifiers. + * + * Cache modifiers will only be effected for built-in types (i.e., C++ + * primitives and CUDA vector-types). 
+ * + * For example: + * \par + * \code + * #include <cub/cub.cuh> + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x); + * + * // 96-bit load using default cache modifier (ignoring LOAD_CS) + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x); + * \endcode + * + */ +template < + PtxLoadModifier MODIFIER, + typename InputIteratorRA> +__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr); + + +//@} end member group + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Define a int4 (16B) ThreadLoad specialization for the given PTX load modifier + */ +#define CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ int4 ThreadLoad<cub_modifier, int4*>(int4* ptr) \ + { \ + int4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.s32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ longlong2 ThreadLoad<cub_modifier, longlong2*>(longlong2* ptr) \ + { \ + longlong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.s64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a int2 (8B) ThreadLoad specialization for the given PTX load modifier + */ +#define CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ short4 ThreadLoad<cub_modifier, short4*>(short4* ptr) \ + { \ + short4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.s16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ int2 ThreadLoad<cub_modifier, int2*>(int2* ptr) \ + { \ + int2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.s32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ long long ThreadLoad<cub_modifier, long long*>(long long* ptr) \ + { \ + long long retval; \ + asm volatile ("ld."#ptx_modifier".s64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a int (4B) ThreadLoad specialization for the given PTX load modifier + */ +#define CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ int ThreadLoad<cub_modifier, int*>(int* ptr) \ + { \ + int retval; \ + asm volatile ("ld."#ptx_modifier".s32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a short (2B) ThreadLoad specialization for the given PTX load modifier + */ +#define CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ short ThreadLoad<cub_modifier, short*>(short* ptr) \ + { \ + short retval; \ + asm volatile ("ld."#ptx_modifier".s16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a char (1B) ThreadLoad 
specialization for the given PTX load modifier + */ +#define CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ char ThreadLoad<cub_modifier, char*>(char* ptr) \ + { \ + short retval; \ + asm volatile ( \ + "{" \ + " .reg .s8 datum;" \ + " ld."#ptx_modifier".s8 datum, [%1];" \ + " cvt.s16.s8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given PTX load modifier + */ +#define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + CUB_LOAD_16(cub_modifier, ptx_modifier) \ + CUB_LOAD_8(cub_modifier, ptx_modifier) \ + CUB_LOAD_4(cub_modifier, ptx_modifier) \ + CUB_LOAD_2(cub_modifier, ptx_modifier) \ + CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadLoad specializations for the various PTX load modifiers + */ +#if CUB_PTX_ARCH >= 200 + CUB_LOAD_ALL(LOAD_CA, ca) + CUB_LOAD_ALL(LOAD_CG, cg) + CUB_LOAD_ALL(LOAD_CS, cs) + CUB_LOAD_ALL(LOAD_CV, cv) +#else + // LOAD_CV on SM10-13 uses "volatile.global" to ensure reads from last level + CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif +#if CUB_PTX_ARCH >= 350 + CUB_LOAD_ALL(LOAD_LDG, global.nc) +#endif + + +/// Helper structure for templated load iteration (inductive case) +template <PtxLoadModifier MODIFIER, int COUNT, int MAX> +struct IterateThreadLoad +{ + template <typename T> + static __device__ __forceinline__ void Load(T *ptr, T *vals) + { + vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT); + IterateThreadLoad<MODIFIER, COUNT + 1, MAX>::Load(ptr, vals); + } +}; + +/// Helper structure for templated load iteration (termination case) +template <PtxLoadModifier MODIFIER, int MAX> +struct IterateThreadLoad<MODIFIER, MAX, MAX> +{ + template <typename T> + static __device__ __forceinline__ void Load(T *ptr, T *vals) {} +}; + + + +/** + * Load with LOAD_DEFAULT on iterator types + */ +template <typename InputIteratorRA> +__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad( + InputIteratorRA itr, + Int2Type<LOAD_DEFAULT> modifier, + Int2Type<false> is_pointer) +{ + return *itr; +} + + +/** + * Load with LOAD_DEFAULT on pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<LOAD_DEFAULT> modifier, + Int2Type<true> is_pointer) +{ + return *ptr; +} + + +/** + * Load with LOAD_VOLATILE on primitive pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoadVolatile( + T *ptr, + Int2Type<true> is_primitive) +{ + T retval = *reinterpret_cast<volatile T*>(ptr); + +#if (CUB_PTX_ARCH <= 130) + if (sizeof(T) == 1) __threadfence_block(); +#endif + + return retval; +} + + +/** + * Load with LOAD_VOLATILE on non-primitive pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoadVolatile( + T *ptr, + Int2Type<false> is_primitive) +{ + typedef typename WordAlignment<T>::VolatileWord VolatileWord; // Word type for memcopying + enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) }; + + // Memcopy from aliased source into array of uninitialized words + typename WordAlignment<T>::UninitializedVolatileWords words; + + #pragma unroll + for (int i = 0; i < NUM_WORDS; ++i) + words.buf[i] = reinterpret_cast<volatile VolatileWord*>(ptr)[i]; + + // Load from words + return *reinterpret_cast<T*>(words.buf); +} + + +/** + * Load with LOAD_VOLATILE on pointer types + */ +template <typename T> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<LOAD_VOLATILE> modifier, + 
Int2Type<true> is_pointer) +{ + return ThreadLoadVolatile(ptr, Int2Type<Traits<T>::PRIMITIVE>()); +} + + +#if (CUB_PTX_ARCH <= 130) + +/** + * Load with LOAD_CG uses LOAD_CV in pre-SM20 PTX to ensure coherent reads when run on newer architectures with L1 + */ +template <typename T> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<LOAD_CG> modifier, + Int2Type<true> is_pointer) +{ + return ThreadLoad<LOAD_CV>(ptr); +} + +#endif // (CUB_PTX_ARCH <= 130) + + +/** + * Load with arbitrary MODIFIER on pointer types + */ +template <typename T, int MODIFIER> +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type<MODIFIER> modifier, + Int2Type<true> is_pointer) +{ + typedef typename WordAlignment<T>::DeviceWord DeviceWord; + enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) }; + + // Memcopy from aliased source into array of uninitialized words + typename WordAlignment<T>::UninitializedDeviceWords words; + + IterateThreadLoad<PtxLoadModifier(MODIFIER), 0, NUM_WORDS>::Load( + reinterpret_cast<DeviceWord*>(ptr), + words.buf); + + // Load from words + return *reinterpret_cast<T*>(words.buf); +} + + +/** + * Generic ThreadLoad definition + */ +template < + PtxLoadModifier MODIFIER, + typename InputIteratorRA> +__device__ __forceinline__ typename std::iterator_traits<InputIteratorRA>::value_type ThreadLoad(InputIteratorRA itr) +{ + return ThreadLoad( + itr, + Int2Type<MODIFIER>(), + Int2Type<IsPointer<InputIteratorRA>::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group IoModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/thread/thread_operators.cuh b/lib/kokkos/TPL/cub/thread/thread_operators.cuh new file mode 100644 index 000000000..bfb3d7c1b --- /dev/null +++ b/lib/kokkos/TPL/cub/thread/thread_operators.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup ThreadModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns <tt>(a == b)</tt> + template <typename T> + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns <tt>(a != b)</tt> + template <typename T> + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return a != b; + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns <tt>a + b</tt> + template <typename T> + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns <tt>(a > b) ? a : b</tt> + template <typename T> + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) + { + return CUB_MAX(a, b); + } +}; + + +/** + * \brief Default min functor + */ +struct Min +{ + /// Boolean min operator, returns <tt>(a < b) ? a : b</tt> + template <typename T> + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) + { + return CUB_MIN(a, b); + } +}; + + +/** + * \brief Default cast functor + */ +template <typename B> +struct Cast +{ + /// Boolean max operator, returns <tt>(a > b) ? a : b</tt> + template <typename A> + __host__ __device__ __forceinline__ B operator()(const A &a) + { + return (B) a; + } +}; + + + +/** @} */ // end group ThreadModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/thread/thread_reduce.cuh b/lib/kokkos/TPL/cub/thread/thread_reduce.cuh new file mode 100644 index 000000000..374fd77ae --- /dev/null +++ b/lib/kokkos/TPL/cub/thread/thread_reduce.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup ThreadModule + * @{ + */ + +/** + * \name Sequential reduction over statically-sized array types + * @{ + */ + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of input array + * \tparam T <b>[inferred]</b> The data type to be reduced. + * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + prefix = reduction_op(prefix, input[i]); + } + + return prefix; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH Length of input array + * \tparam T <b>[inferred]</b> The data type to be reduced. + * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input array + * \tparam T <b>[inferred]</b> The data type to be reduced. 
+ * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce<LENGTH>(input, reduction_op, prefix); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input array + * \tparam T <b>[inferred]</b> The data type to be reduced. + * \tparam ScanOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce<LENGTH>((T*) input, reduction_op); +} + + +//@} end member group + +/** @} */ // end group ThreadModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/thread/thread_scan.cuh b/lib/kokkos/TPL/cub/thread/thread_scan.cuh new file mode 100644 index 000000000..b43bbcf00 --- /dev/null +++ b/lib/kokkos/TPL/cub/thread/thread_scan.cuh @@ -0,0 +1,231 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup ThreadModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + #pragma unroll + for (int i = 1; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + + return inclusive; +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH <b>[inferred]</b> Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T <b>[inferred]</b> The data type to be scanned. 
+ * \tparam ScanOp    <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    T inclusive = input[0];
+    output[0] = inclusive;
+
+    // Continue scan (element 0 is already included)
+    #pragma unroll
+    for (int i = 1; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(inclusive, input[i]);
+        output[i] = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned.
+ *
+ * \tparam LENGTH    <b>[inferred]</b> Length of \p input and \p output arrays
+ * \tparam T         <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp    <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH    Length of \p input and \p output arrays
+ * \tparam T         <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp    <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
+{
+    T inclusive = input[0];
+    if (apply_prefix)
+    {
+        inclusive = scan_op(prefix, inclusive);
+    }
+    output[0] = inclusive;
+
+    // Continue scan
+    #pragma unroll
+    for (int i = 1; i < LENGTH; ++i)
+    {
+        inclusive = scan_op(inclusive, input[i]);
+        output[i] = inclusive;
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH    <b>[inferred]</b> Length of \p input and \p output arrays
+ * \tparam T         <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp    <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.)
+{ + return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group ThreadModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/thread/thread_store.cuh b/lib/kokkos/TPL/cub/thread/thread_store.cuh new file mode 100644 index 000000000..8d39e07b1 --- /dev/null +++ b/lib/kokkos/TPL/cub/thread/thread_store.cuh @@ -0,0 +1,412 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. + */ + +#pragma once + +#include <cuda.h> + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup IoModule + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of PTX cache-modifiers for memory store operations. + */ +enum PtxStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Simple I/O + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::PtxStoreModifier cache modifiers. + * + * Cache modifiers will only be effected for built-in types (i.e., C++ + * primitives and CUDA vector-types). 
+ * + * For example: + * \par + * \code + * #include <cub/cub.cuh> + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val); + * + * // 96-bit store using default cache modifier (ignoring STORE_CS) + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val); + * \endcode + * + */ +template < + PtxStoreModifier MODIFIER, + typename OutputIteratorRA, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Define a int4 (16B) ThreadStore specialization for the given PTX load modifier + */ +#define CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, int4*, int4>(int4* ptr, int4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.s32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, longlong2*, longlong2>(longlong2* ptr, longlong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.s64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a int2 (8B) ThreadStore specialization for the given PTX load modifier + */ +#define CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, short4*, short4>(short4* ptr, short4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.s16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, int2*, int2>(int2* ptr, int2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.s32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, long long*, long long>(long long* ptr, long long val) \ + { \ + asm volatile ("st."#ptx_modifier".s64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a int (4B) ThreadStore specialization for the given PTX load modifier + */ +#define CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, int*, int>(int* ptr, int val) \ + { \ + asm volatile ("st."#ptx_modifier".s32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a short (2B) ThreadStore specialization for the given PTX load modifier + */ +#define CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore<cub_modifier, short*, short>(short* ptr, short val) \ + { \ + asm volatile ("st."#ptx_modifier".s16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a char (1B) ThreadStore specialization for the given PTX load modifier + */ +#define CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void 
ThreadStore<cub_modifier, char*, char>(char* ptr, char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .s8 datum;" \ + " cvt.s8.s16 datum, %1;" \ + " st."#ptx_modifier".s8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(short(val))); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given PTX load modifier + */ +#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + CUB_STORE_16(cub_modifier, ptx_modifier) \ + CUB_STORE_8(cub_modifier, ptx_modifier) \ + CUB_STORE_4(cub_modifier, ptx_modifier) \ + CUB_STORE_2(cub_modifier, ptx_modifier) \ + CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various PTX load modifiers + */ +#if CUB_PTX_ARCH >= 200 + CUB_STORE_ALL(STORE_WB, ca) + CUB_STORE_ALL(STORE_CG, cg) + CUB_STORE_ALL(STORE_CS, cs) + CUB_STORE_ALL(STORE_WT, cv) +#else + // STORE_WT on SM10-13 uses "volatile.global" to ensure writes to last level + CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + + +/// Helper structure for templated store iteration (inductive case) +template <PtxStoreModifier MODIFIER, int COUNT, int MAX> +struct IterateThreadStore +{ + template <typename T> + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]); + IterateThreadStore<MODIFIER, COUNT + 1, MAX>::Store(ptr, vals); + } +}; + +/// Helper structure for templated store iteration (termination case) +template <PtxStoreModifier MODIFIER, int MAX> +struct IterateThreadStore<MODIFIER, MAX, MAX> +{ + template <typename T> + static __device__ __forceinline__ void Store(T *ptr, T *vals) {} +}; + + + + +/** + * Store with STORE_DEFAULT on iterator types + */ +template <typename OutputIteratorRA, typename T> +__device__ __forceinline__ void ThreadStore( + OutputIteratorRA itr, + T val, + Int2Type<STORE_DEFAULT> modifier, + Int2Type<false> is_pointer) +{ + *itr = val; +} + + +/** + * Store with STORE_DEFAULT on pointer types + */ +template <typename T> +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type<STORE_DEFAULT> modifier, + Int2Type<true> is_pointer) +{ + *ptr = val; +} + + +/** + * Store with STORE_VOLATILE on primitive pointer types + */ +template <typename T> +__device__ __forceinline__ void ThreadStoreVolatile( + T *ptr, + T val, + Int2Type<true> is_primitive) +{ + *reinterpret_cast<volatile T*>(ptr) = val; +} + + +/** + * Store with STORE_VOLATILE on non-primitive pointer types + */ +template <typename T> +__device__ __forceinline__ void ThreadStoreVolatile( + T *ptr, + T val, + Int2Type<false> is_primitive) +{ + typedef typename WordAlignment<T>::VolatileWord VolatileWord; // Word type for memcopying + enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) }; + + // Store into array of uninitialized words + typename WordAlignment<T>::UninitializedVolatileWords words; + *reinterpret_cast<T*>(words.buf) = val; + + // Memcopy words to aliased destination + #pragma unroll + for (int i = 0; i < NUM_WORDS; ++i) + reinterpret_cast<volatile VolatileWord*>(ptr)[i] = words.buf[i]; +} + + +/** + * Store with STORE_VOLATILE on pointer types + */ +template <typename T> +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type<STORE_VOLATILE> modifier, + Int2Type<true> is_pointer) +{ + ThreadStoreVolatile(ptr, val, Int2Type<Traits<T>::PRIMITIVE>()); +} + + +#if (CUB_PTX_ARCH <= 350) + +/** + * Store with STORE_CG on pointer types (uses STORE_DEFAULT on current architectures) + */ +template <typename T> +__device__ __forceinline__ void 
ThreadStore( + T *ptr, + T val, + Int2Type<STORE_CG> modifier, + Int2Type<true> is_pointer) +{ + ThreadStore<STORE_DEFAULT>(ptr, val); +} + +#endif // (CUB_PTX_ARCH <= 350) + + +/** + * Store with arbitrary MODIFIER on pointer types + */ +template <typename T, int MODIFIER> +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type<MODIFIER> modifier, + Int2Type<true> is_pointer) +{ + typedef typename WordAlignment<T>::DeviceWord DeviceWord; // Word type for memcopying + enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) }; + + // Store into array of uninitialized words + typename WordAlignment<T>::UninitializedDeviceWords words; + *reinterpret_cast<T*>(words.buf) = val; + + // Memcopy words to aliased destination + IterateThreadStore<PtxStoreModifier(MODIFIER), 0, NUM_WORDS>::Store( + reinterpret_cast<DeviceWord*>(ptr), + words.buf); +} + + +/** + * Generic ThreadStore definition + */ +template <PtxStoreModifier MODIFIER, typename OutputIteratorRA, typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type<MODIFIER>(), + Int2Type<IsPointer<OutputIteratorRA>::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group IoModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_allocator.cuh b/lib/kokkos/TPL/cub/util_allocator.cuh new file mode 100644 index 000000000..ae40f3305 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_allocator.cuh @@ -0,0 +1,661 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. 
The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#pragma once + +#ifndef __CUDA_ARCH__ + #include <set> // NVCC (EDG, really) takes FOREVER to compile std::map + #include <map> +#endif + +#include <math.h> + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include "host/spinlock.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and is capable of managing cached device allocations + * on multiple devices. It behaves as follows: + * + * \par + * - Allocations categorized by bin size. + * - Bin sizes progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
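+ *
+ * \par
+ * A minimal host-side sketch (the device ordinal and allocation size below
+ * are illustrative placeholders):
+ * \code
+ * CachingDeviceAllocator allocator;    // default bin configuration
+ *
+ * void *d_scratch = NULL;
+ * cudaError_t error = allocator.DeviceAllocate(&d_scratch, 4096 * sizeof(int), 0);
+ *
+ * // ... use d_scratch in kernels launched on device 0 ...
+ *
+ * // The block is later handed back to the allocator (and possibly cached for
+ * // reuse) via the allocator's matching deallocation routine defined below.
+ * \endcode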
+ * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + enum + { + /// Invalid device ordinal + INVALID_DEVICE_ORDINAL = -1, + }; + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + static void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + int device; // device ordinal + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + + // Constructor + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(0), + device(device) {} + + // Constructor + BlockDescriptor(size_t bytes, unsigned int bin, int device) : + d_ptr(NULL), + bytes(bytes), + bin(bin), + device(device) {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device < b.device) { + return true; + } else if (a.device > b.device) { + return false; + } else { + return (a.d_ptr < b.d_ptr); + } + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device < b.device) { + return true; + } else if (a.device > b.device) { + return false; + } else { + return (a.bytes < b.bytes); + } + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + +#ifndef __CUDA_ARCH__ // Only define STL container members in host code + + /// Set type for cached blocks (ordered by size) + typedef std::multiset<BlockDescriptor, Compare> CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset<BlockDescriptor, Compare> BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + typedef std::map<int, size_t> GpuCachedBytes; + +#endif // __CUDA_ARCH__ + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + Spinlock spin_lock; /// Spinlock for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + 
size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + bool debug; /// Whether or not to print (de)allocation events to stdout + bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + +#ifndef __CUDA_ARCH__ // Only define STL container members in host code + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // __CUDA_ARCH__ + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin, ///< Minimum bin + unsigned int max_bin, ///< Maximum bin + size_t max_cached_bytes) ///< Maximum aggregate cached bytes per device + : + #ifndef __CUDA_ARCH__ // Only define STL container members in host code + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare), + #endif + debug(false), + spin_lock(0), + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator(bool skip_cleanup = false) : + #ifndef __CUDA_ARCH__ // Only define STL container members in host code + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare), + #endif + skip_cleanup(skip_cleanup), + debug(false), + spin_lock(0), + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
+ */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + #ifdef __CUDA_ARCH__ + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + // Lock + Lock(&spin_lock); + + this->max_cached_bytes = max_cached_bytes; + + if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes); + + // Unlock + Unlock(&spin_lock); + + return cudaSuccess; + + #endif // __CUDA_ARCH__ + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device + */ + cudaError_t DeviceAllocate( + void** d_ptr, + size_t bytes, + int device) + { + #ifdef __CUDA_ARCH__ + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + bool locked = false; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + // Round up to nearest bin size + unsigned int bin; + size_t bin_bytes; + NearestPowerOf(bin, bin_bytes, bin_growth, bytes); + if (bin < min_bin) { + bin = min_bin; + bin_bytes = min_bin_bytes; + } + + // Check if bin is greater than our maximum bin + if (bin > max_bin) + { + // Allocate the request exactly and give out-of-range bin + bin = (unsigned int) -1; + bin_bytes = bytes; + } + + BlockDescriptor search_key(bin_bytes, bin, device); + + // Lock + if (!locked) { + Lock(&spin_lock); + locked = true; + } + + do { + // Find a free block big enough within the same bin on the same device + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + if ((block_itr != cached_blocks.end()) && + (block_itr->device == device) && + (block_itr->bin == search_key.bin)) + { + // Reuse existing cache block. Insert into live blocks. + search_key = *block_itr; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_blocks.erase(block_itr); + cached_bytes[device] -= search_key.bytes; + + if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", + device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); + } + else + { + // Need to allocate a new cache block. Unlock. + if (locked) { + Unlock(&spin_lock); + locked = false; + } + + // Set to specified device + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + if (CubDebug(error = cudaSetDevice(device))) break; + + // Allocate + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break; + + // Lock + if (!locked) { + Lock(&spin_lock); + locked = true; + } + + // Insert into live blocks + live_blocks.insert(search_key); + + if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", + device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); + } + } while(0); + + // Unlock + if (locked) { + Unlock(&spin_lock); + locked = false; + } + + // Copy device pointer to output parameter (NULL on error) + *d_ptr = search_key.d_ptr; + + // Attempt to revert back to previous device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + + #endif // __CUDA_ARCH__ + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device + */ + cudaError_t DeviceAllocate( + void** d_ptr, + size_t bytes) + { + #ifdef __CUDA_ARCH__ + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + cudaError_t error = cudaSuccess; + do { + int current_device; + if (CubDebug(error = cudaGetDevice(¤t_device))) break; + if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break; + } while(0); + + return error; + + #endif // __CUDA_ARCH__ + } + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator + */ + cudaError_t DeviceFree( + void* d_ptr, + int device) + { + #ifdef __CUDA_ARCH__ + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + bool locked = false; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + BlockDescriptor search_key(d_ptr, device); + + // Lock + if (!locked) { + Lock(&spin_lock); + locked = true; + } + + do { + // Find corresponding block descriptor + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr == live_blocks.end()) + { + // Cannot find pointer + if (CubDebug(error = cudaErrorUnknown)) break; + } + else + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + + // Check if we should keep the returned allocation + if (cached_bytes[device] + search_key.bytes <= max_cached_bytes) + { + // Insert returned allocation into free blocks + cached_blocks.insert(search_key); + cached_bytes[device] += search_key.bytes; + + if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", + device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); + } + else + { + // Free the returned allocation. Unlock. + if (locked) { + Unlock(&spin_lock); + locked = false; + } + + // Set to specified device + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + if (CubDebug(error = cudaSetDevice(device))) break; + + // Free device memory + if (CubDebug(error = cudaFree(d_ptr))) break; + + if (debug) CubLog("\tdevice %d freed %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", + device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); + } + } + } while (0); + + // Unlock + if (locked) { + Unlock(&spin_lock); + locked = false; + } + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + + #endif // __CUDA_ARCH__ + } + + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator + */ + cudaError_t DeviceFree( + void* d_ptr) + { + #ifdef __CUDA_ARCH__ + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + int current_device; + cudaError_t error = cudaSuccess; + + do { + if (CubDebug(error = cudaGetDevice(¤t_device))) break; + if (CubDebug(error = DeviceFree(d_ptr, current_device))) break; + } while(0); + + return error; + + #endif // __CUDA_ARCH__ + } + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() + { + #ifdef __CUDA_ARCH__ + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + cudaError_t error = cudaSuccess; + bool locked = false; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + // Lock + if (!locked) { + Lock(&spin_lock); + locked = true; + } + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free device memory + if (CubDebug(error = cudaFree(begin->d_ptr))) break; + + // Reduce balance and erase entry + cached_bytes[current_device] -= begin->bytes; + cached_blocks.erase(begin); + + if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size()); + } + + // Unlock + if (locked) { + Unlock(&spin_lock); + locked = false; + } + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + + #endif // __CUDA_ARCH__ + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_arch.cuh b/lib/kokkos/TPL/cub/util_arch.cuh new file mode 100644 index 000000000..232a33c4f --- /dev/null +++ b/lib/kokkos/TPL/cub/util_arch.cuh @@ -0,0 +1,295 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + + +/****************************************************************************** + * Static architectural properties by SM version. + * + * "Device" reflects the PTX architecture targeted by the active compiler + * pass. It provides useful compile-time statics within device code. E.g.,: + * + * __shared__ int[Device::WARP_THREADS]; + * + * int padded_offset = threadIdx.x + (threadIdx.x >> Device::LOG_SMEM_BANKS); + * + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 +#else + #define CUB_PTX_ARCH __CUDA_ARCH__ +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. +#if !defined(__CUDA_ARCH__) || defined(CUB_CDP) +#define CUB_RUNTIME_ENABLED +#endif + + +/// Execution space for destructors +#if ((CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH < 200)) + #define CUB_DESTRUCTOR __host__ +#else + #define CUB_DESTRUCTOR __host__ __device__ +#endif + + +/** + * \brief Structure for statically reporting CUDA device properties, parameterized by SM architecture. + * + * The default specialization is for SM10. 
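 *
 * \par
 * (Editorial sketch, not from the original header.)  Because the members below are
 * compile-time enumerations, they can size static structures in device code, much
 * like the example in the file banner above:
 *
 * \code
 * __shared__ int histogram[ArchProps<CUB_PTX_ARCH>::WARP_THREADS];
 * int padded = threadIdx.x + (threadIdx.x >> ArchProps<CUB_PTX_ARCH>::LOG_SMEM_BANKS);
 * \endcode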
+ */ +template <int SM_ARCH> +struct ArchProps +{ + enum + { + LOG_WARP_THREADS = + 5, /// Log of the number of threads per warp + WARP_THREADS = + 1 << LOG_WARP_THREADS, /// Number of threads per warp + LOG_SMEM_BANKS = + 4, /// Log of the number of smem banks + SMEM_BANKS = + 1 << LOG_SMEM_BANKS, /// The number of smem banks + SMEM_BANK_BYTES = + 4, /// Size of smem bank words + SMEM_BYTES = + 16 * 1024, /// Maximum SM shared memory + SMEM_ALLOC_UNIT = + 512, /// Smem allocation size in bytes + REGS_BY_BLOCK = + true, /// Whether or not the architecture allocates registers by block (or by warp) + REG_ALLOC_UNIT = + 256, /// Number of registers allocated at a time per block (or by warp) + WARP_ALLOC_UNIT = + 2, /// Granularity of warps for which registers are allocated + MAX_SM_THREADS = + 768, /// Maximum number of threads per SM + MAX_SM_THREADBLOCKS = + 8, /// Maximum number of thread blocks per SM + MAX_BLOCK_THREADS = + 512, /// Maximum number of thread per thread block + MAX_SM_REGISTERS = + 8 * 1024, /// Maximum number of registers per SM + }; +}; + + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Architecture properties for SM30 + */ +template <> +struct ArchProps<300> +{ + enum + { + LOG_WARP_THREADS = 5, // 32 threads per warp + WARP_THREADS = 1 << LOG_WARP_THREADS, + LOG_SMEM_BANKS = 5, // 32 banks + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + SMEM_BANK_BYTES = 4, // 4 byte bank words + SMEM_BYTES = 48 * 1024, // 48KB shared memory + SMEM_ALLOC_UNIT = 256, // 256B smem allocation segment size + REGS_BY_BLOCK = false, // Allocates registers by warp + REG_ALLOC_UNIT = 256, // 256 registers allocated at a time per warp + WARP_ALLOC_UNIT = 4, // Registers are allocated at a granularity of every 4 warps per threadblock + MAX_SM_THREADS = 2048, // 2K max threads per SM + MAX_SM_THREADBLOCKS = 16, // 16 max threadblocks per SM + MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock + MAX_SM_REGISTERS = 64 * 1024, // 64K max registers per SM + }; + + // Callback utility + template <typename T> + static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) + { + target.template Callback<ArchProps>(); + } +}; + + +/** + * Architecture properties for SM20 + */ +template <> +struct ArchProps<200> +{ + enum + { + LOG_WARP_THREADS = 5, // 32 threads per warp + WARP_THREADS = 1 << LOG_WARP_THREADS, + LOG_SMEM_BANKS = 5, // 32 banks + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + SMEM_BANK_BYTES = 4, // 4 byte bank words + SMEM_BYTES = 48 * 1024, // 48KB shared memory + SMEM_ALLOC_UNIT = 128, // 128B smem allocation segment size + REGS_BY_BLOCK = false, // Allocates registers by warp + REG_ALLOC_UNIT = 64, // 64 registers allocated at a time per warp + WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock + MAX_SM_THREADS = 1536, // 1536 max threads per SM + MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM + MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock + MAX_SM_REGISTERS = 32 * 1024, // 32K max registers per SM + }; + + // Callback utility + template <typename T> + static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) + { + if (sm_version > 200) { + ArchProps<300>::Callback(target, sm_version); + } else { + target.template Callback<ArchProps>(); + } + } +}; + + +/** + * Architecture properties for SM12 + */ +template <> +struct ArchProps<120> +{ + enum + { + LOG_WARP_THREADS = 5, // 32 threads per warp + WARP_THREADS = 1 << LOG_WARP_THREADS, + LOG_SMEM_BANKS = 4, // 
16 banks + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + SMEM_BANK_BYTES = 4, // 4 byte bank words + SMEM_BYTES = 16 * 1024, // 16KB shared memory + SMEM_ALLOC_UNIT = 512, // 512B smem allocation segment size + REGS_BY_BLOCK = true, // Allocates registers by threadblock + REG_ALLOC_UNIT = 512, // 512 registers allocated at time per threadblock + WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock + MAX_SM_THREADS = 1024, // 1024 max threads per SM + MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM + MAX_BLOCK_THREADS = 512, // 512 max threads per threadblock + MAX_SM_REGISTERS = 16 * 1024, // 16K max registers per SM + }; + + // Callback utility + template <typename T> + static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) + { + if (sm_version > 120) { + ArchProps<200>::Callback(target, sm_version); + } else { + target.template Callback<ArchProps>(); + } + } +}; + + +/** + * Architecture properties for SM10. Derives from the default ArchProps specialization. + */ +template <> +struct ArchProps<100> : ArchProps<0> +{ + // Callback utility + template <typename T> + static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) + { + if (sm_version > 100) { + ArchProps<120>::Callback(target, sm_version); + } else { + target.template Callback<ArchProps>(); + } + } +}; + + +/** + * Architecture properties for SM35 + */ +template <> +struct ArchProps<350> : ArchProps<300> {}; // Derives from SM30 + +/** + * Architecture properties for SM21 + */ +template <> +struct ArchProps<210> : ArchProps<200> {}; // Derives from SM20 + +/** + * Architecture properties for SM13 + */ +template <> +struct ArchProps<130> : ArchProps<120> {}; // Derives from SM12 + +/** + * Architecture properties for SM11 + */ +template <> +struct ArchProps<110> : ArchProps<100> {}; // Derives from SM10 + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief The architectural properties for the PTX version targeted by the active compiler pass. + */ +struct PtxArchProps : ArchProps<CUB_PTX_ARCH> {}; + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_debug.cuh b/lib/kokkos/TPL/cub/util_debug.cuh new file mode 100644 index 000000000..2ac67d7d0 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_debug.cuh @@ -0,0 +1,115 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. + */ + +#pragma once + +#include <stdio.h> +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) + + +/** + * \brief Debug macro with exit + */ +#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } + + +/** + * \brief Log macro for printf statements. + */ +#if (CUB_PTX_ARCH == 0) + #define CubLog(format, ...) printf(format,__VA_ARGS__); +#elif (CUB_PTX_ARCH >= 200) + #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__); +#endif + + + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_device.cuh b/lib/kokkos/TPL/cub/util_device.cuh new file mode 100644 index 000000000..0631b924a --- /dev/null +++ b/lib/kokkos/TPL/cub/util_device.cuh @@ -0,0 +1,378 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
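An editorial aside on the util_debug.cuh macros just added: CubDebug() returns the
CUDA error code unchanged (printing file/line context only when CUB_STDERR is
defined), so it composes with the break-on-error pattern used throughout
util_allocator.cuh above. A minimal host-side sketch; the buffer name and sizes
are placeholders:

    void  *d_buf = NULL;
    size_t bytes = 1024;
    cudaError_t error = cudaSuccess;
    do {
        if (CubDebug(error = cudaMalloc(&d_buf, bytes))) break;   // logs and falls through on failure
        if (CubDebug(error = cudaMemset(d_buf, 0, bytes))) break;
    } while (0);
    CubDebugExit(cudaFree(d_buf));   // exits the process if the cleanup itself fails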
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Properties of a given CUDA device and the corresponding PTX bundle + */ + +#pragma once + +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template <typename T> +__global__ void EmptyKernel(void) { } + + +/** + * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + */ +template <int ALLOCATIONS> +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_sizes[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorMemoryAllocation); + } + + // Alias + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast<char*>(d_temp_storage) + allocation_sizes[i]; + } + + return cudaSuccess; +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/** + * \brief Retrieves the PTX version (major * 100 + minor * 10) + */ +__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version) +{ +#ifndef CUB_RUNTIME_ENABLED + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#else + + cudaError_t error = cudaSuccess; + do + { + cudaFuncAttributes empty_kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + } + while (0); + + return error; + +#endif +} + + +/** + * Synchronize the stream if specified + */ +__host__ __device__ __forceinline__ +static cudaError_t SyncStream(cudaStream_t stream) +{ +#ifndef __CUDA_ARCH__ + return cudaStreamSynchronize(stream); +#else + // Device can't yet sync on a specific stream + return cudaDeviceSynchronize(); +#endif +} + + + +/** + * \brief Properties of a given CUDA device and the corresponding PTX bundle + */ +class Device +{ +private: + + /// Type definition of the EmptyKernel kernel entry point + typedef void (*EmptyKernelPtr)(); + + /// Force EmptyKernel<void> to be generated if this class is used + __host__ __device__ __forceinline__ + EmptyKernelPtr Empty() + { + return EmptyKernel<void>; + } + +public: + + // Version information + int sm_version; ///< SM version of target device (SM version X.YZ in XYZ integer form) + int ptx_version; ///< Bundled PTX version for target device (PTX version X.YZ in XYZ integer form) + + // Target device properties + int sm_count; ///< Number of SMs + int warp_threads; ///< Number of threads per warp + int smem_bank_bytes; ///< Number of bytes per SM bank + int smem_banks; ///< Number of smem banks + int smem_bytes; ///< Smem bytes per SM + int smem_alloc_unit; ///< Smem segment size + bool regs_by_block; ///< Whether registers are allocated by threadblock (or by warp) + int reg_alloc_unit; ///< Granularity of register allocation within the SM + int warp_alloc_unit; ///< Granularity of warp allocation within the SM + int max_sm_threads; ///< Maximum number of threads per SM + int max_sm_blocks; ///< Maximum number of threadblocks per SM + int max_block_threads; ///< Maximum number of threads per threadblock + int max_sm_registers; ///< Maximum number of registers per SM + int max_sm_warps; ///< Maximum number of warps per SM + + /** + * Callback for 
initializing device properties + */ + template <typename ArchProps> + __host__ __device__ __forceinline__ void Callback() + { + warp_threads = ArchProps::WARP_THREADS; + smem_bank_bytes = ArchProps::SMEM_BANK_BYTES; + smem_banks = ArchProps::SMEM_BANKS; + smem_bytes = ArchProps::SMEM_BYTES; + smem_alloc_unit = ArchProps::SMEM_ALLOC_UNIT; + regs_by_block = ArchProps::REGS_BY_BLOCK; + reg_alloc_unit = ArchProps::REG_ALLOC_UNIT; + warp_alloc_unit = ArchProps::WARP_ALLOC_UNIT; + max_sm_threads = ArchProps::MAX_SM_THREADS; + max_sm_blocks = ArchProps::MAX_SM_THREADBLOCKS; + max_block_threads = ArchProps::MAX_BLOCK_THREADS; + max_sm_registers = ArchProps::MAX_SM_REGISTERS; + max_sm_warps = max_sm_threads / warp_threads; + } + + +public: + + /** + * Initializer. Properties are retrieved for the specified GPU ordinal. + */ + __host__ __device__ __forceinline__ + cudaError_t Init(int device_ordinal) + { + #ifndef CUB_RUNTIME_ENABLED + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + + #else + + cudaError_t error = cudaSuccess; + do + { + // Fill in SM version + int major, minor; + if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; + if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; + sm_version = major * 100 + minor * 10; + + // Fill in static SM properties + // Initialize our device properties via callback from static device properties + ArchProps<100>::Callback(*this, sm_version); + + // Fill in SM count + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Fill in PTX version + #if CUB_PTX_ARCH > 0 + ptx_version = CUB_PTX_ARCH; + #else + if (CubDebug(error = PtxVersion(ptx_version))) break; + #endif + + } + while (0); + + return error; + + #endif + } + + + /** + * Initializer. Properties are retrieved for the current GPU ordinal. + */ + __host__ __device__ __forceinline__ + cudaError_t Init() + { + #ifndef CUB_RUNTIME_ENABLED + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + + #else + + cudaError_t error = cudaSuccess; + do + { + int device_ordinal; + if ((error = CubDebug(cudaGetDevice(&device_ordinal)))) break; + if ((error = Init(device_ordinal))) break; + } + while (0); + return error; + + #endif + } + + + /** + * Computes maximum SM occupancy in thread blocks for the given kernel + */ + template <typename KernelPtr> + __host__ __device__ __forceinline__ + cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads) ///< [in] Number of threads per thread block + { + #ifndef CUB_RUNTIME_ENABLED + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + + #else + + cudaError_t error = cudaSuccess; + do + { + // Get kernel attributes + cudaFuncAttributes kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break; + + // Number of warps per threadblock + int block_warps = (block_threads + warp_threads - 1) / warp_threads; + + // Max warp occupancy + int max_warp_occupancy = (block_warps > 0) ? 
+ max_sm_warps / block_warps : + max_sm_blocks; + + // Maximum register occupancy + int max_reg_occupancy; + if ((block_threads == 0) || (kernel_attrs.numRegs == 0)) + { + // Prevent divide-by-zero + max_reg_occupancy = max_sm_blocks; + } + else if (regs_by_block) + { + // Allocates registers by threadblock + int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit); + max_reg_occupancy = max_sm_registers / block_regs; + } + else + { + // Allocates registers by warp + int sm_sides = warp_alloc_unit; + int sm_registers_per_side = max_sm_registers / sm_sides; + int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit); + int warps_per_side = sm_registers_per_side / regs_per_warp; + int warps = warps_per_side * sm_sides; + max_reg_occupancy = warps / block_warps; + } + + // Shared memory per threadblock + int block_allocated_smem = CUB_ROUND_UP_NEAREST( + kernel_attrs.sharedSizeBytes, + smem_alloc_unit); + + // Max shared memory occupancy + int max_smem_occupancy = (block_allocated_smem > 0) ? + (smem_bytes / block_allocated_smem) : + max_sm_blocks; + + // Max occupancy + max_sm_occupancy = CUB_MIN( + CUB_MIN(max_sm_blocks, max_warp_occupancy), + CUB_MIN(max_smem_occupancy, max_reg_occupancy)); + +// printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy); + + } while (0); + + return error; + + #endif + } + +}; + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_iterator.cuh b/lib/kokkos/TPL/cub/util_iterator.cuh new file mode 100644 index 000000000..08b574ca5 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_iterator.cuh @@ -0,0 +1,718 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
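An editorial usage sketch for the cub::Device class defined in util_device.cuh
above; ExampleKernel, the 256-thread block size, and QueryOccupancy are
hypothetical stand-ins, not part of CUB:

    __global__ void ExampleKernel(int *d_out) { d_out[threadIdx.x] = threadIdx.x; }   // hypothetical

    cudaError_t QueryOccupancy(int &grid_size)
    {
        cub::Device device_props;
        cudaError_t error;
        if (CubDebug(error = device_props.Init())) return error;   // current device
        int max_blocks_per_sm = 0;
        if (CubDebug(error = device_props.MaxSmOccupancy(max_blocks_per_sm, ExampleKernel, 256))) return error;
        grid_size = max_blocks_per_sm * device_props.sm_count;     // e.g., for a persistent-blocks launch
        return cudaSuccess;
    }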
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include "thread/thread_load.cuh" +#include "util_device.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Texture references + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +// Anonymous namespace +namespace { + +/// Templated texture reference type +template <typename T> +struct TexIteratorRef +{ + // Texture reference type + typedef texture<T, cudaTextureType1D, cudaReadModeElementType> TexRef; + + static TexRef ref; + + /** + * Bind texture + */ + static cudaError_t BindTexture(void *d_in) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<T>(); + if (d_in) + return (CubDebug(cudaBindTexture(NULL, ref, d_in, tex_desc))); + + return cudaSuccess; + } + + /** + * Unbind textures + */ + static cudaError_t UnbindTexture() + { + return CubDebug(cudaUnbindTexture(ref)); + } +}; + +// Texture reference definitions +template <typename Value> +typename TexIteratorRef<Value>::TexRef TexIteratorRef<Value>::ref = 0; + +} // Anonymous namespace + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + + + +/** + * \addtogroup UtilModule + * @{ + */ + + +/****************************************************************************** + * Iterators + *****************************************************************************/ + +/** + * \brief A simple random-access iterator pointing to a range of constant values + * + * \par Overview + * ConstantIteratorRA is a random-access iterator that when dereferenced, always + * returns the supplied constant of type \p OutputType. 
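 *
 * \par
 * (Editorial sketch.)  For instance:
 *
 * \code
 * ConstantIteratorRA<float> itr(42.0f);
 * float a = *itr;     // 42.0f
 * ++itr;
 * float b = *itr;     // still 42.0f
 * \endcode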
+ * + * \tparam OutputType The value type of this iterator + */ +template <typename OutputType> +class ConstantIteratorRA +{ +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + typedef ConstantIteratorRA self_type; + typedef OutputType value_type; + typedef OutputType reference; + typedef OutputType* pointer; + typedef std::random_access_iterator_tag iterator_category; + typedef int difference_type; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +private: + + OutputType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantIteratorRA( + const OutputType &val) ///< Constant value for the iterator instance to report + : + val(val) + {} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + __host__ __device__ __forceinline__ self_type operator++() + { + self_type i = *this; + return i; + } + + __host__ __device__ __forceinline__ self_type operator++(int junk) + { + return *this; + } + + __host__ __device__ __forceinline__ reference operator*() + { + return val; + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator+(SizeT n) + { + return ConstantIteratorRA(val); + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator-(SizeT n) + { + return ConstantIteratorRA(val); + } + + template <typename SizeT> + __host__ __device__ __forceinline__ reference operator[](SizeT n) + { + return ConstantIteratorRA(val); + } + + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == rhs.val); + } + + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (val != rhs.val); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +}; + + + +/** + * \brief A simple random-access transform iterator for applying a transformation operator. + * + * \par Overview + * TransformIteratorRA is a random-access iterator that wraps both a native + * device pointer of type <tt>InputType*</tt> and a unary conversion functor of + * type \p ConversionOp. \p OutputType references are made by pulling \p InputType + * values through the \p ConversionOp instance. + * + * \tparam InputType The value type of the pointer being wrapped + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member <tt>OutputType operator()(const InputType &datum)</tt>. 
+ * \tparam OutputType The value type of this iterator + */ +template <typename OutputType, typename ConversionOp, typename InputType> +class TransformIteratorRA +{ +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + typedef TransformIteratorRA self_type; + typedef OutputType value_type; + typedef OutputType reference; + typedef OutputType* pointer; + typedef std::random_access_iterator_tag iterator_category; + typedef int difference_type; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +private: + + ConversionOp conversion_op; + InputType* ptr; + +public: + + /** + * \brief Constructor + * @param ptr Native pointer to wrap + * @param conversion_op Binary transformation functor + */ + __host__ __device__ __forceinline__ TransformIteratorRA(InputType* ptr, ConversionOp conversion_op) : + conversion_op(conversion_op), + ptr(ptr) {} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + __host__ __device__ __forceinline__ self_type operator++() + { + self_type i = *this; + ptr++; + return i; + } + + __host__ __device__ __forceinline__ self_type operator++(int junk) + { + ptr++; + return *this; + } + + __host__ __device__ __forceinline__ reference operator*() + { + return conversion_op(*ptr); + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator+(SizeT n) + { + TransformIteratorRA retval(ptr + n, conversion_op); + return retval; + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator-(SizeT n) + { + TransformIteratorRA retval(ptr - n, conversion_op); + return retval; + } + + template <typename SizeT> + __host__ __device__ __forceinline__ reference operator[](SizeT n) + { + return conversion_op(ptr[n]); + } + + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*ptr); + } + + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +}; + + + +/** + * \brief A simple random-access iterator for loading primitive values through texture cache. + * + * \par Overview + * TexIteratorRA is a random-access iterator that wraps a native + * device pointer of type <tt>T*</tt>. References made through TexIteratorRA + * causes values to be pulled through texture cache. + * + * \par Usage Considerations + * - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double + * - Only one TexIteratorRA or TexIteratorRA of a certain \p InputType can be bound at any given time (per host thread) + * + * \tparam InputType The value type of the pointer being wrapped + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member <tt>OutputType operator()(const InputType &datum)</tt>. 
+ * \tparam OutputType The value type of this iterator + */ +template <typename T> +class TexIteratorRA +{ +public: +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + typedef TexIteratorRA self_type; + typedef T value_type; + typedef T reference; + typedef T* pointer; + typedef std::random_access_iterator_tag iterator_category; + typedef int difference_type; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Tag identifying iterator type as being texture-bindable + typedef void TexBindingTag; + +private: + + T* ptr; + size_t tex_align_offset; + cudaTextureObject_t tex_obj; + +public: + + /** + * \brief Constructor + */ + __host__ __device__ __forceinline__ TexIteratorRA() + : + ptr(NULL), + tex_align_offset(0), + tex_obj(0) + {} + + /// \brief Bind iterator to texture reference + cudaError_t BindTexture( + T *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes, ///< Number of items + size_t tex_align_offset = 0) ///< Offset (in items) from ptr denoting the position of the iterator + { + this->ptr = ptr; + this->tex_align_offset = tex_align_offset; + + int ptx_version; + cudaError_t error = cudaSuccess; + if (CubDebug(error = PtxVersion(ptx_version))) return error; + if (ptx_version >= 300) + { + // Use texture object + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<T>(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + else + { + // Use texture reference + return TexIteratorRef<T>::BindTexture(ptr); + } + } + + /// \brief Unbind iterator to texture reference + cudaError_t UnbindTexture() + { + int ptx_version; + cudaError_t error = cudaSuccess; + if (CubDebug(error = PtxVersion(ptx_version))) return error; + if (ptx_version < 300) + { + // Use texture reference + return TexIteratorRef<T>::UnbindTexture(); + } + else + { + // Use texture object + return cudaDestroyTextureObject(tex_obj); + } + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + __host__ __device__ __forceinline__ self_type operator++() + { + self_type i = *this; + ptr++; + tex_align_offset++; + return i; + } + + __host__ __device__ __forceinline__ self_type operator++(int junk) + { + ptr++; + tex_align_offset++; + return *this; + } + + __host__ __device__ __forceinline__ reference operator*() + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return *ptr; +#elif (CUB_PTX_ARCH < 300) + // Use the texture reference + return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset); +#else + // Use the texture object + return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset)); +#endif + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator+(SizeT n) + { + TexIteratorRA retval; + retval.ptr = ptr + n; + retval.tex_align_offset = tex_align_offset + n; + return retval; + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator-(SizeT n) + { + TexIteratorRA retval; + retval.ptr = ptr - n; + retval.tex_align_offset = tex_align_offset - n; + return retval; + } + + template <typename SizeT> + __host__ __device__ __forceinline__ reference operator[](SizeT n) + { +#if 
(CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[n]; +#elif (CUB_PTX_ARCH < 300) + // Use the texture reference + return tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset + n); +#else + // Use the texture object + return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset + n)); +#endif + } + + __host__ __device__ __forceinline__ pointer operator->() + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return &(*ptr); +#elif (CUB_PTX_ARCH < 300) + // Use the texture reference + return &(tex1Dfetch(TexIteratorRef<T>::ref, tex_align_offset)); +#else + // Use the texture object + return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset)); +#endif + } + + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +}; + + +/** + * \brief A simple random-access transform iterator for loading primitive values through texture cache and and subsequently applying a transformation operator. + * + * \par Overview + * TexTransformIteratorRA is a random-access iterator that wraps both a native + * device pointer of type <tt>InputType*</tt> and a unary conversion functor of + * type \p ConversionOp. \p OutputType references are made by pulling \p InputType + * values through the texture cache and then transformed them using the + * \p ConversionOp instance. + * + * \par Usage Considerations + * - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double + * - Only one TexIteratorRA or TexTransformIteratorRA of a certain \p InputType can be bound at any given time (per host thread) + * + * \tparam InputType The value type of the pointer being wrapped + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member <tt>OutputType operator()(const InputType &datum)</tt>. 
+ * \tparam OutputType The value type of this iterator + */ +template <typename OutputType, typename ConversionOp, typename InputType> +class TexTransformIteratorRA +{ +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + typedef TexTransformIteratorRA self_type; + typedef OutputType value_type; + typedef OutputType reference; + typedef OutputType* pointer; + typedef std::random_access_iterator_tag iterator_category; + typedef int difference_type; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Tag identifying iterator type as being texture-bindable + typedef void TexBindingTag; + +private: + + ConversionOp conversion_op; + InputType* ptr; + size_t tex_align_offset; + cudaTextureObject_t tex_obj; + +public: + + /** + * \brief Constructor + */ + TexTransformIteratorRA( + ConversionOp conversion_op) ///< Binary transformation functor + : + conversion_op(conversion_op), + ptr(NULL), + tex_align_offset(0), + tex_obj(0) + {} + + /// \brief Bind iterator to texture reference + cudaError_t BindTexture( + InputType* ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes, ///< Number of items + size_t tex_align_offset = 0) ///< Offset (in items) from ptr denoting the position of the iterator + { + this->ptr = ptr; + this->tex_align_offset = tex_align_offset; + + int ptx_version; + cudaError_t error = cudaSuccess; + if (CubDebug(error = PtxVersion(ptx_version))) return error; + if (ptx_version >= 300) + { + // Use texture object + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<InputType>(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + else + { + // Use texture reference + return TexIteratorRef<InputType>::BindTexture(ptr); + } + } + + /// \brief Unbind iterator to texture reference + cudaError_t UnbindTexture() + { + int ptx_version; + cudaError_t error = cudaSuccess; + if (CubDebug(error = PtxVersion(ptx_version))) return error; + if (ptx_version >= 300) + { + // Use texture object + return cudaDestroyTextureObject(tex_obj); + } + else + { + // Use texture reference + return TexIteratorRef<InputType>::UnbindTexture(); + } + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + __host__ __device__ __forceinline__ self_type operator++() + { + self_type i = *this; + ptr++; + tex_align_offset++; + return i; + } + + __host__ __device__ __forceinline__ self_type operator++(int junk) + { + ptr++; + tex_align_offset++; + return *this; + } + + __host__ __device__ __forceinline__ reference operator*() + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return conversion_op(*ptr); +#elif (CUB_PTX_ARCH < 300) + // Use the texture reference + return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset)); +#else + // Use the texture object + return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset)); +#endif + } + + template <typename SizeT> + __host__ __device__ __forceinline__ self_type operator+(SizeT n) + { + TexTransformIteratorRA retval(conversion_op); + retval.ptr = ptr + n; + retval.tex_align_offset = tex_align_offset + n; + return retval; + } + + template <typename SizeT> + 
__host__ __device__ __forceinline__ self_type operator-(SizeT n) + { + TexTransformIteratorRA retval(conversion_op); + retval.ptr = ptr - n; + retval.tex_align_offset = tex_align_offset - n; + return retval; + } + + template <typename SizeT> + __host__ __device__ __forceinline__ reference operator[](SizeT n) + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return conversion_op(ptr[n]); +#elif (CUB_PTX_ARCH < 300) + // Use the texture reference + return conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset + n)); +#else + // Use the texture object + return conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset + n)); +#endif + } + + __host__ __device__ __forceinline__ pointer operator->() + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return &conversion_op(*ptr); +#elif (CUB_PTX_ARCH < 300) + // Use the texture reference + return &conversion_op(tex1Dfetch(TexIteratorRef<InputType>::ref, tex_align_offset)); +#else + // Use the texture object + return &conversion_op(tex1Dfetch<InputType>(tex_obj, tex_align_offset)); +#endif + } + + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +}; + + + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_macro.cuh b/lib/kokkos/TPL/cub/util_macro.cuh new file mode 100644 index 000000000..091fd93c5 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_macro.cuh @@ -0,0 +1,107 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
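An editorial sketch of the TransformIteratorRA defined in util_iterator.cuh above;
the Square functor, d_in buffer, and element count are hypothetical stand-ins:

    struct Square
    {
        __host__ __device__ __forceinline__
        float operator()(const float &x) const { return x * x; }
    };

    float *d_in = NULL;
    cudaMalloc((void**) &d_in, 100 * sizeof(float));
    cub::TransformIteratorRA<float, Square, float> itr(d_in, Square());
    // In device code, *itr and itr[i] yield d_in[i] squared; the iterator can be
    // handed to kernels (or CUB primitives) wherever a read-only sequence of
    // floats is expected.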
+ * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * Align struct + */ +#if defined(_WIN32) || defined(_WIN64) + #define CUB_ALIGN(bytes) __declspec(align(32)) +#else + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) +#endif + +/** + * Select maximum(a, b) + */ +#define CUB_MAX(a, b) (((a) > (b)) ? (a) : (b)) + +/** + * Select minimum(a, b) + */ +#define CUB_MIN(a, b) (((a) < (b)) ? (a) : (b)) + +/** + * Quotient of x/y rounded down to nearest integer + */ +#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) + +/** + * Quotient of x/y rounded up to nearest integer + */ +#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) + +/** + * x rounded up to the nearest multiple of y + */ +#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) + +/** + * x rounded down to the nearest multiple of y + */ +#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) + +/** + * Return character string for given type + */ +#define CUB_TYPE_STRING(type) ""#type + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * Static assert + */ +#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_namespace.cuh b/lib/kokkos/TPL/cub/util_namespace.cuh new file mode 100644 index 000000000..869ecc613 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_namespace.cuh @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// For example: +//#define CUB_NS_PREFIX namespace thrust{ namespace detail { +//#define CUB_NS_POSTFIX } } + +#define CUB_NS_PREFIX +#define CUB_NS_POSTFIX diff --git a/lib/kokkos/TPL/cub/util_ptx.cuh b/lib/kokkos/TPL/cub/util_ptx.cuh new file mode 100644 index 000000000..ad80b0401 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_ptx.cuh @@ -0,0 +1,380 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define _CUB_ASM_PTR_SIZE_ "u32" +#endif + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * Shift-right then add. Returns (x >> shift) + addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if __CUDA_ARCH__ >= 200 + asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * Shift-left then add. Returns (x << shift) + addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if __CUDA_ARCH__ >= 200 + asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + + +/** + * Bitfield-extract. + */ +template <typename UnsignedBits> +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + unsigned int bits; +#if __CUDA_ARCH__ >= 200 + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +__device__ __forceinline__ unsigned int BFE( + unsigned long long source, + unsigned int bit_start, + unsigned int num_bits) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + + +/** + * Bitfield insert. Inserts the first num_bits of y into x starting at bit_start + */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if __CUDA_ARCH__ >= 200 + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + // TODO +#endif +} + + +/** + * Three-operand add + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * Byte-permute. 
Pick four arbitrary bytes from two 32-bit registers, and + * reassemble them into a 32-bit destination register + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + + +/** + * Sync-threads barrier. + */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + + +/** + * Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm("exit;"); +} + + +/** + * Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm("mov.u32 %0, %laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * Returns the warp ID of the calling thread + */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm("mov.u32 %0, %warpid;" : "=r"(ret) ); + return ret; +} + +/** + * Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** + * Portable implementation of __all + */ +__device__ __forceinline__ int WarpAll(int cond) +{ +#if CUB_PTX_ARCH < 120 + + __shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS]; + + if (LaneId() == 0) + warp_signals[WarpId()] = 1; + + if (cond == 0) + warp_signals[WarpId()] = 0; + + return warp_signals[WarpId()]; + +#else + + return __all(cond); + +#endif +} + + +/** + * Portable implementation of __any + */ +__device__ __forceinline__ int WarpAny(int cond) +{ +#if CUB_PTX_ARCH < 120 + + __shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS]; + + if (LaneId() == 0) + warp_signals[WarpId()] = 0; + + if (cond) + warp_signals[WarpId()] = 1; + + return warp_signals[WarpId()]; + +#else + + return __any(cond); + +#endif +} + + +/// Generic shuffle-up +template <typename T> +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset) ///< [in] The up-offset of the peer to read from +{ + enum + { + SHFL_C = 0, + }; + + typedef typename WordAlignment<T>::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + 
sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; + ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); + ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input); + + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = input_alias[WORD]; + asm( + " shfl.up.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C)); + output_alias[WORD] = (ShuffleWord) shuffle_word; + } + + return output; +} + + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_type.cuh b/lib/kokkos/TPL/cub/util_type.cuh new file mode 100644 index 000000000..836aa0f04 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_type.cuh @@ -0,0 +1,685 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Common type manipulation (metaprogramming) utilities + */ + +#pragma once + +#include <iostream> +#include <limits> + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + + +/****************************************************************************** + * Type equality + ******************************************************************************/ + +/** + * \brief Type selection (<tt>IF ? 
ThenType : ElseType</tt>) + */ +template <bool IF, typename ThenType, typename ElseType> +struct If +{ + /// Conditional type result + typedef ThenType Type; // true +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <typename ThenType, typename ElseType> +struct If<false, ThenType, ElseType> +{ + typedef ElseType Type; // false +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Conditional types + ******************************************************************************/ + + +/** + * \brief Type equality test + */ +template <typename A, typename B> +struct Equals +{ + enum { + VALUE = 0, + NEGATE = 1 + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <typename A> +struct Equals <A, A> +{ + enum { + VALUE = 1, + NEGATE = 0 + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Marker types + ******************************************************************************/ + +/** + * \brief A simple "NULL" marker type + */ +struct NullType +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + template <typename T> + __host__ __device__ __forceinline__ NullType& operator =(const T& b) { return *this; } +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + + +/** + * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) + */ +template <int A> +struct Int2Type +{ + enum {VALUE = A}; +}; + + +/****************************************************************************** + * Size and alignment + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <typename T> +struct WordAlignment +{ + struct Pad + { + T val; + char byte; + }; + + enum + { + /// The alignment of T in bytes + ALIGN_BYTES = sizeof(Pad) - sizeof(T) + }; + + /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If<(ALIGN_BYTES % 4 == 0), + int, + typename If<(ALIGN_BYTES % 2 == 0), + short, + char>::Type>::Type ShuffleWord; + + /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If<(ALIGN_BYTES % 8 == 0), + long long, + ShuffleWord>::Type VolatileWord; + + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If<(ALIGN_BYTES % 16 == 0), + longlong2, + typename If<(ALIGN_BYTES % 8 == 0), + long long, // needed to get heterogenous PODs to work on all platforms + ShuffleWord>::Type>::Type DeviceWord; + + enum + { + DEVICE_MULTIPLE = sizeof(DeviceWord) / sizeof(T) + }; + + struct UninitializedBytes + { + char buf[sizeof(T)]; + }; + + struct UninitializedShuffleWords + { + ShuffleWord buf[sizeof(T) / sizeof(ShuffleWord)]; + }; + + struct UninitializedVolatileWords + { + VolatileWord buf[sizeof(T) / sizeof(VolatileWord)]; + }; + + struct UninitializedDeviceWords + { + DeviceWord buf[sizeof(T) / sizeof(DeviceWord)]; + }; + + +}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Wrapper types + ******************************************************************************/ + +/** + * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in 
unions + */ +template <typename T> +struct Uninitialized +{ + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename WordAlignment<T>::DeviceWord DeviceWord; + + enum + { + WORDS = sizeof(T) / sizeof(DeviceWord) + }; + + /// Backing storage + DeviceWord storage[WORDS]; + + /// Alias + __host__ __device__ __forceinline__ T& Alias() + { + return reinterpret_cast<T&>(*this); + } +}; + + +/** + * \brief A wrapper for passing simple static arrays as kernel parameters + */ +template <typename T, int COUNT> +struct ArrayWrapper +{ + /// Static array of type \p T + T array[COUNT]; +}; + + +/** + * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. + * + * Many multi-pass computations require a pair of "ping-pong" storage + * buffers (e.g., one for reading from and the other for writing to, and then + * vice-versa for the subsequent pass). This structure wraps a set of device + * buffers and a "selector" member to track which is "current". + */ +template <typename T> +struct DoubleBuffer +{ + /// Pair of device buffer pointers + T *d_buffers[2]; + + /// Selector into \p d_buffers (i.e., the active/valid buffer) + int selector; + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer() + { + selector = 0; + d_buffers[0] = NULL; + d_buffers[1] = NULL; + } + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer( + T *d_current, ///< The currently valid buffer + T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current + { + selector = 0; + d_buffers[0] = d_current; + d_buffers[1] = d_alternate; + } + + /// \brief Return pointer to the currently valid buffer + __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } +}; + + + +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template <int N, int CURRENT_VAL = N, int COUNT = 0> +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document +template <int N, int COUNT> +struct Log2<N, 0, COUNT> +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template <int N> +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + + +/** + * \brief Pointer vs. 
iterator + */ +template <typename Tp> +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <typename Tp> +struct IsPointer<Tp*> +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template <typename Tp> +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <typename Tp> +struct IsVolatile<Tp volatile> +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * <tt>typename RemoveQualifiers<volatile int>::Type // int;</tt> + */ +template <typename Tp, typename Up = Tp> +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <typename Tp, typename Up> +struct RemoveQualifiers<Tp, volatile Up> +{ + typedef Up Type; +}; + +template <typename Tp, typename Up> +struct RemoveQualifiers<Tp, const Up> +{ + typedef Up Type; +}; + +template <typename Tp, typename Up> +struct RemoveQualifiers<Tp, const volatile Up> +{ + typedef Up Type; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + + +/** + * \brief Defines a structure \p detector_name that is templated on type \p T. 
The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name + */ +#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ + template <typename T> \ + struct detector_name \ + { \ + template <typename C> \ + static char& test(typename C::nested_type_name*); \ + template <typename> \ + static int& test(...); \ + enum \ + { \ + VALUE = sizeof(test<T>(0)) < sizeof(int) \ + }; \ + }; + + + +/****************************************************************************** + * Simple enable-if (similar to Boost) + ******************************************************************************/ + +/** + * \brief Simple enable-if (similar to Boost) + */ +template <bool Condition, class T = void> +struct EnableIf +{ + /// Enable-if type for SFINAE dummy variables + typedef T Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <class T> +struct EnableIf<false, T> {}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + +/** + * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt> + */ +template <typename T, typename BinaryOp> +struct BinaryOpHasIdxParam +{ +private: + template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const> struct SFINAE1 {}; + template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)> struct SFINAE2 {}; + template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const> struct SFINAE3 {}; + template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)> struct SFINAE4 {}; + + template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const> struct SFINAE5 {}; + template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)> struct SFINAE6 {}; + template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const> struct SFINAE7 {}; + template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)> struct SFINAE8 {}; + + template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *); + template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *); + template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *); + template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *); + + template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *); + template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *); + template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *); + template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *); + + template <typename BinaryOpT> static int Test(...); + +public: + + /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param + static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char); +}; + + + +/****************************************************************************** + * Simple type traits utilities. 
+ * + * For example: + * Traits<int>::CATEGORY // SIGNED_INTEGER + * Traits<NullType>::NULL_TYPE // true + * Traits<uint4>::CATEGORY // NOT_A_NUMBER + * Traits<uint4>::PRIMITIVE; // false + * + ******************************************************************************/ + +/** + * \brief Basic type traits categories + */ +enum Category +{ + NOT_A_NUMBER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING_POINT +}; + + +/** + * \brief Basic type traits + */ +template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits> +struct BaseTraits +{ + /// Category + static const Category CATEGORY = _CATEGORY; + enum + { + PRIMITIVE = _PRIMITIVE, + NULL_TYPE = _NULL_TYPE, + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Basic type traits (unsigned primitive specialization) + */ +template <typename _UnsignedBits> +struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits> +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = UNSIGNED_INTEGER; + static const UnsignedBits MIN_KEY = UnsignedBits(0); + static const UnsignedBits MAX_KEY = UnsignedBits(-1); + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } +}; + + +/** + * Basic type traits (signed primitive specialization) + */ +template <typename _UnsignedBits> +struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits> +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = SIGNED_INTEGER; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits MIN_KEY = HIGH_BIT; + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + +}; + + +/** + * Basic type traits (fp primitive specialization) + */ +template <typename _UnsignedBits> +struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits> +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits MIN_KEY = UnsignedBits(-1); + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; + return key ^ mask; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); + return key ^ mask; + }; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Numeric type traits + */ +template <typename T> struct NumericTraits : BaseTraits<NOT_A_NUMBER, false, false, T> {}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template <> struct NumericTraits<NullType> : BaseTraits<NOT_A_NUMBER, false, true, NullType> {}; + +template <> struct NumericTraits<char> : BaseTraits<(std::numeric_limits<char>::is_signed) ? 
SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {}; +template <> struct NumericTraits<signed char> : BaseTraits<SIGNED_INTEGER, true, false, unsigned char> {}; +template <> struct NumericTraits<short> : BaseTraits<SIGNED_INTEGER, true, false, unsigned short> {}; +template <> struct NumericTraits<int> : BaseTraits<SIGNED_INTEGER, true, false, unsigned int> {}; +template <> struct NumericTraits<long> : BaseTraits<SIGNED_INTEGER, true, false, unsigned long> {}; +template <> struct NumericTraits<long long> : BaseTraits<SIGNED_INTEGER, true, false, unsigned long long> {}; + +template <> struct NumericTraits<unsigned char> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char> {}; +template <> struct NumericTraits<unsigned short> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short> {}; +template <> struct NumericTraits<unsigned int> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int> {}; +template <> struct NumericTraits<unsigned long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long> {}; +template <> struct NumericTraits<unsigned long long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long> {}; + +template <> struct NumericTraits<float> : BaseTraits<FLOATING_POINT, true, false, unsigned int> {}; +template <> struct NumericTraits<double> : BaseTraits<FLOATING_POINT, true, false, unsigned long long> {}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Type traits + */ +template <typename T> +struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {}; + + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/util_vector.cuh b/lib/kokkos/TPL/cub/util_vector.cuh new file mode 100644 index 000000000..9a432dc58 --- /dev/null +++ b/lib/kokkos/TPL/cub/util_vector.cuh @@ -0,0 +1,166 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Vector type inference utilities + */ + +#pragma once + +#include <iostream> + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + +/****************************************************************************** + * Vector type inference utilities. For example: + * + * typename VectorHelper<unsigned int, 2>::Type // Aliases uint2 + * + ******************************************************************************/ + +/** + * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the VectorHelper structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. + */ +template <typename T, int vec_elements> struct VectorHelper; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +enum +{ + /// The maximum number of elements in CUDA vector types + MAX_VEC_ELEMENTS = 4, +}; + + +/** + * Generic vector-1 type + */ +template <typename T> +struct VectorHelper<T, 1> +{ + enum { BUILT_IN = false }; + + T x; + + typedef VectorHelper<T, 1> Type; +}; + +/** + * Generic vector-2 type + */ +template <typename T> +struct VectorHelper<T, 2> +{ + enum { BUILT_IN = false }; + + T x; + T y; + + typedef VectorHelper<T, 2> Type; +}; + +/** + * Generic vector-3 type + */ +template <typename T> +struct VectorHelper<T, 3> +{ + enum { BUILT_IN = false }; + + T x; + T y; + T z; + + typedef VectorHelper<T, 3> Type; +}; + +/** + * Generic vector-4 type + */ +template <typename T> +struct VectorHelper<T, 4> +{ + enum { BUILT_IN = false }; + + T x; + T y; + T z; + T w; + + typedef VectorHelper<T, 4> Type; +}; + +/** + * Macro for expanding partially-specialized built-in vector types + */ +#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ + template<> struct VectorHelper<base_type, 1> { typedef short_type##1 Type; enum { BUILT_IN = true }; }; \ + template<> struct VectorHelper<base_type, 2> { typedef short_type##2 Type; enum { BUILT_IN = true }; }; \ + template<> struct VectorHelper<base_type, 3> { typedef short_type##3 Type; enum { BUILT_IN = true }; }; \ + template<> struct VectorHelper<base_type, 4> { typedef short_type##4 Type; enum { BUILT_IN = true }; }; + +// Expand CUDA vector types for built-in primitives +CUB_DEFINE_VECTOR_TYPE(char, char) +CUB_DEFINE_VECTOR_TYPE(signed char, char) +CUB_DEFINE_VECTOR_TYPE(short, short) +CUB_DEFINE_VECTOR_TYPE(int, int) +CUB_DEFINE_VECTOR_TYPE(long, long) +CUB_DEFINE_VECTOR_TYPE(long long, longlong) +CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) +CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) +CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) +CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) +CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +CUB_DEFINE_VECTOR_TYPE(float, float) +CUB_DEFINE_VECTOR_TYPE(double, double) +CUB_DEFINE_VECTOR_TYPE(bool, uchar) 
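+
+// (Editor's note: illustrative comment, not part of upstream CUB.)  With the
+// expansions above, VectorHelper maps the built-in primitives onto the CUDA
+// vector types and otherwise falls back to the generic wrappers defined
+// earlier; bool maps onto the uchar vectors because CUDA provides no
+// bool1..bool4 types.  For example (placeholder typedef names):
+//
+//     typedef VectorHelper<unsigned int, 2>::Type v1;  // uint2
+//     typedef VectorHelper<double, 4>::Type       v2;  // double4
+//     typedef VectorHelper<NullType, 2>::Type     v3;  // generic VectorHelper<NullType, 2>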
+ +// Undefine macros +#undef CUB_DEFINE_VECTOR_TYPE + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh b/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh new file mode 100644 index 000000000..317b62990 --- /dev/null +++ b/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh @@ -0,0 +1,358 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps. 
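+ *
+ * \par
+ * (Editor's note: illustrative sketch only, not part of upstream CUB; this
+ * specialization is normally instantiated through the cub::WarpReduce
+ * front-end on SM30+ devices.  Kernel and variable names below are
+ * hypothetical.)
+ * \code
+ * __global__ void ExampleKernel(int *d_out)      // launched with one warp
+ * {
+ *     // One logical warp of 32 threads; values move between lanes via SHFL,
+ *     // so no shared memory is required (TempStorage is NullType)
+ *     typedef cub::WarpReduceShfl<int, 1, 32> WarpReduceT;
+ *     WarpReduceT::TempStorage temp_storage;
+ *     WarpReduceT warp_reduce(temp_storage, 0, cub::LaneId());
+ *
+ *     int thread_data = threadIdx.x & 31;        // each lane contributes its lane id
+ *     // Full warp, one item folded per lane
+ *     int warp_sum = warp_reduce.Sum<true, 1>(thread_data, 32);
+ *
+ *     if (cub::LaneId() == 0)
+ *         d_out[0] = warp_sum;                   // 0 + 1 + ... + 31 = 496 (valid in lane 0)
+ * }
+ * \endcode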
+ */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARPS, ///< Number of logical warps entrant + int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp +struct WarpReduceShfl +{ + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + enum + { + /// The number of warp reduction steps + STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE, + + // The 5-bit SHFL mask for logically splitting warps into sub-segments + SHFL_MASK = (-1 << STEPS) & 31, + + // The 5-bit SFHL clamp + SHFL_CLAMP = LOGICAL_WARP_THREADS - 1, + + // The packed C argument (mask starts 8 bits up) + SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP, + }; + + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + int warp_id; + int lane_id; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &temp_storage, + int warp_id, + int lane_id) + : + warp_id(warp_id), + lane_id(lane_id) + {} + + + /****************************************************************************** + * Operation + ******************************************************************************/ + + /// Summation (single-SHFL) + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + Int2Type<true> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required + { + unsigned int output = reinterpret_cast<unsigned int &>(input); + + // Iterate reduction steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + if (FULL_WARPS) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output)); + } + else + { + // Set range predicate to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0, %1, %2, %3;" + " setp.lt.u32 p, %5, %6;" + " mov.u32 %0, %1;" + " @p add.u32 %0, %1, r0;" + "}" + : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); + } + } + + return output; + } + + + /// Summation (multi-SHFL) + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + Int2Type<false> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required + { + // Delegate to 
generic reduce + return Reduce<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum()); + } + + + /// Summation (float) + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ float Sum( + float input, ///< [in] Calling thread's input + int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp + { + T output = input; + + // Iterate reduction steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + if (FULL_WARPS) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output)); + } + else + { + // Set range predicate to guard against invalid peers + asm( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0, %1, %2, %3;" + " setp.lt.u32 p, %5, %6;" + " mov.f32 %0, %1;" + " @p add.f32 %0, %0, r0;" + "}" + : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); + } + } + + return output; + } + + /// Summation (generic) + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename _T> + __device__ __forceinline__ _T Sum( + _T input, ///< [in] Calling thread's input + int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp + { + // Whether sharing can be done with a single SHFL instruction (vs multiple SFHL instructions) + Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl; + + return Sum<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, single_shfl); + } + + + /// Reduction + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + typedef typename WordAlignment<T>::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output = input; + T temp; + ShuffleWord *temp_alias = reinterpret_cast<ShuffleWord *>(&temp); + ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Grab addend from peer + const int OFFSET = 1 << STEP; + + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = output_alias[WORD]; + asm( + " shfl.down.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C)); + temp_alias[WORD] = (ShuffleWord) shuffle_word; + } + + // Perform reduction op if from a valid peer + if (FULL_WARPS) + { + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + output = reduction_op(output, temp); + } + else + { + if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp) + output = 
reduction_op(output, temp); + } + } + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename Flag, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + Flag flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + typedef typename WordAlignment<T>::ShuffleWord ShuffleWord; + + T output = input; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T temp; + ShuffleWord *temp_alias = reinterpret_cast<ShuffleWord *>(&temp); + ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); + + // Get the start flags for each thread in the warp. + int warp_flags = __ballot(flag); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. + warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32)) + warp_flags >>= (warp_id * LOGICAL_WARP_THREADS); + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Grab addend from peer + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = output_alias[WORD]; + + asm( + " shfl.down.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C)); + temp_alias[WORD] = (ShuffleWord) shuffle_word; + + } + + // Perform reduction op if valid + if (OFFSET < next_flag - lane_id) + output = reduction_op(output, temp); + } + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh b/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 000000000..a32d5fdd7 --- /dev/null +++ b/lib/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,291 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps. + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARPS, ///< Number of logical warps entrant + int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + enum + { + /// Whether the logical warp size is a power-of-two + POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// The number of warp scan steps + STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int warp_id; + int lane_id; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage, + int warp_id, + int lane_id) + : + temp_storage(temp_storage.Alias()), + warp_id(warp_id), + lane_id(lane_id) + {} + + + /****************************************************************************** + * Operation + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< 
[in] Reduction operator + { + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input); + + // Update input if peer_addend is in range + if ((FULL_WARPS && POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) + { + T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + } + + return input; + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename Flag, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + Flag flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + #if CUB_PTX_ARCH >= 200 + + // Ballot-based segmented reduce + + // Get the start flags for each thread in the warp. + int warp_flags = __ballot(flag); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. + warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32)) + warp_flags >>= (warp_id * LOGICAL_WARP_THREADS); + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input); + + // Update input if peer_addend is in range + if (OFFSET < next_flag - lane_id) + { + T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + } + + return input; + + #else + + // Smem-based segmented reduce + + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = reinterpret_cast<SmemFlag*>(temp_storage[warp_id]); + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], input); + + // Get peer from buffer + T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][lane_id + OFFSET]); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + + #endif + } + + + /** + * Summation + */ + template < + bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp + { + return Reduce<FULL_WARPS, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, cub::Sum()); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh b/lib/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 000000000..5585396ce --- /dev/null +++ b/lib/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,371 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps. 
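+ *
+ * \par
+ * A minimal sketch of the underlying idea, assuming a full 32-thread warp and
+ * the plain __shfl_up() intrinsic (hypothetical helper; the actual scan steps
+ * below use predicated SHFL in inline PTX so that out-of-range peers are
+ * skipped without a branch):
+ * \code
+ * __device__ int InclusiveSumSketch(int x)
+ * {
+ *     for (int offset = 1; offset < 32; offset *= 2)
+ *     {
+ *         int y = __shfl_up(x, offset);        // value held by lane (lane - offset)
+ *         if ((threadIdx.x & 31) >= offset)    // only lanes with a valid peer accumulate
+ *             x += y;
+ *     }
+ *     return x;                                // lane i now holds the sum of lanes 0..i
+ * }
+ * \endcode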
+ */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARPS, ///< Number of logical warps entrant + int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp +struct WarpScanShfl +{ + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + enum + { + /// The number of warp scan steps + STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE, + + // The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = ((-1 << STEPS) & 31) << 8, + }; + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + int warp_id; + int lane_id; + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanShfl( + TempStorage &temp_storage, + int warp_id, + int lane_id) + : + warp_id(warp_id), + lane_id(lane_id) + {} + + + /****************************************************************************** + * Operation + ******************************************************************************/ + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + typedef typename WordAlignment<T>::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; + ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output); + ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input); + + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = input_alias[WORD]; + asm("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(LOGICAL_WARP_THREADS - 1)); + output_alias[WORD] = (ShuffleWord) shuffle_word; + } + + return output; + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive prefix sum with aggregate (single-SHFL) + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate, ///< [out] Warp-wide aggregate reduction of input items. + Int2Type<true> single_shfl) + { + unsigned int temp = reinterpret_cast<unsigned int &>(input); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(temp) : "r"(temp), "r"(1 << STEP), "r"(SHFL_C), "r"(temp)); + } + + output = temp; + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); + } + + + /// Inclusive prefix sum with aggregate (multi-SHFL) + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. 
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate, ///< [out] Warp-wide aggregate reduction of input items. + Int2Type<false> single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required + { + // Delegate to generic scan + InclusiveScan(input, output, Sum(), warp_aggregate); + } + + + /// Inclusive prefix sum with aggregate (specialized for float) + __device__ __forceinline__ void InclusiveSum( + float input, ///< [in] Calling thread's input item. + float &output, ///< [out] Calling thread's output item. May be aliased with \p input. + float &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(output), "r"(1 << STEP), "r"(SHFL_C), "f"(output)); + } + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); + } + + + /// Inclusive prefix sum with aggregate (specialized for unsigned long long) + __device__ __forceinline__ void InclusiveSum( + unsigned long long input, ///< [in] Calling thread's input item. + unsigned long long &output, ///< [out] Calling thread's output item. May be aliased with \p input. + unsigned long long &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .u32 r1;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 r0|p, lo, %2, %3;" + " shfl.up.b32 r1|p, hi, %2, %3;" + " @p add.cc.u32 r0, r0, lo;" + " @p addc.u32 r1, r1, hi;" + " mov.b64 %0, {r0, r1};" + "}" + : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C)); + } + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); + } + + + /// Inclusive prefix sum with aggregate (generic) + template <typename _T> + __device__ __forceinline__ void InclusiveSum( + _T input, ///< [in] Calling thread's input item. + _T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + _T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + // Whether sharing can be done with a single SHFL instruction (vs multiple SFHL instructions) + Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl; + + InclusiveSum(input, output, warp_aggregate, single_shfl); + } + + + /// Inclusive prefix sum + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + T warp_aggregate; + InclusiveSum(input, output, warp_aggregate); + } + + + /// Inclusive scan with aggregate + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Grab addend from peer + const int OFFSET = 1 << STEP; + T temp = ShuffleUp(output, OFFSET); + + // Perform scan op if from a valid peer + if (lane_id >= OFFSET) + output = scan_op(temp, output); + } + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); + } + + + /// Inclusive scan + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + T warp_aggregate; + InclusiveScan(input, output, scan_op, warp_aggregate); + } + + + //--------------------------------------------------------------------- + // Exclusive operations + //--------------------------------------------------------------------- + + /// Exclusive scan with aggregate + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + // Compute inclusive scan + T inclusive; + InclusiveScan(input, inclusive, scan_op, warp_aggregate); + + // Grab result from predecessor + T exclusive = ShuffleUp(inclusive, 1); + + output = (lane_id == 0) ? + identity : + exclusive; + } + + + /// Exclusive scan + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan operator + { + T warp_aggregate; + ExclusiveScan(input, output, identity, scan_op, warp_aggregate); + } + + + /// Exclusive scan with aggregate, without identity + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + // Compute inclusive scan + T inclusive; + InclusiveScan(input, inclusive, scan_op, warp_aggregate); + + // Grab result from predecessor + output = ShuffleUp(inclusive, 1); + } + + + /// Exclusive scan without identity + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + T warp_aggregate; + ExclusiveScan(input, output, scan_op, warp_aggregate); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh b/lib/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 000000000..513b35cef --- /dev/null +++ b/lib/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,327 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps. 
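+ *
+ * \par
+ * This is the shared-memory alternative used when the SHFL-based
+ * specialization is not selected (e.g., when targeting architectures older
+ * than SM30). Each logical warp scans within a padded, 1.5-warp-wide region
+ * of shared memory. As a sketch, for the default 32-thread logical warp the
+ * constants defined below evaluate to:
+ * \code
+ * enum
+ * {
+ *     STEPS              = 5,     // Log2<32>::VALUE
+ *     HALF_WARP_THREADS  = 16,    // 1 << (5 - 1)
+ *     WARP_SMEM_ELEMENTS = 48,    // 32 + 16 elements per logical warp
+ * };
+ * \endcode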
+ */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARPS, ///< Number of logical warps entrant + int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp +struct WarpScanSmem +{ + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + enum + { + /// The number of warp scan steps + STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int warp_id; + unsigned int lane_id; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanSmem( + TempStorage &temp_storage, + int warp_id, + int lane_id) + : + temp_storage(temp_storage.Alias()), + warp_id(warp_id), + lane_id(lane_id) + {} + + + /****************************************************************************** + * Operation + ******************************************************************************/ + + /// Initialize identity padding (specialized for operations that have identity) + __device__ __forceinline__ void InitIdentity(Int2Type<true> has_identity) + { + T identity = T(); + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], identity); + } + + + /// Initialize identity padding (specialized for operations without identity) + __device__ __forceinline__ void InitIdentity(Int2Type<false> has_identity) + {} + + + /// Basic inclusive scan iteration(template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type<STEPS> step) + {} + + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) + template < + bool HAS_IDENTITY, + int STEP, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type<STEP> step) + { + const int OFFSET = 1 << STEP; + + // Share partial into buffer + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial); + + // Update partial if addend is in range + if (HAS_IDENTITY || (lane_id >= OFFSET)) + { + T addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - OFFSET]); + partial = scan_op(addend, partial); + } + + ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>()); + } + + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + if (lane_id == src_lane) + { + ThreadStore<STORE_VOLATILE>(temp_storage[warp_id], input); + } + + return ThreadLoad<LOAD_VOLATILE>(temp_storage[warp_id]); + } + + + 
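+    // Note on the padded layout (a sketch for LOGICAL_WARP_THREADS == 32,
+    // i.e. HALF_WARP_THREADS == 16): partials are stored at index
+    // (16 + lane_id) and each step reads from (16 + lane_id - OFFSET).
+    // The largest OFFSET is 16, so the smallest index ever read is 0.
+    // When an identity value exists, the identity padding written before the
+    // scan (e.g., by InitIdentity()) fills those low slots, so the read is
+    // always safe and ScanStep() can skip its (lane_id >= OFFSET) guard:
+    //
+    //     lane 0,  OFFSET 16:  reads temp_storage[warp_id][16 + 0 - 16]    // identity padding
+    //     lane 31, OFFSET 16:  reads temp_storage[warp_id][16 + 31 - 16]   // lane 15's partial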
/// Basic inclusive scan + template < + bool HAS_IDENTITY, + bool SHARE_FINAL, + typename ScanOp> + __device__ __forceinline__ T BasicScan( + T partial, ///< Calling thread's input partial reduction + ScanOp scan_op) ///< Binary associative scan functor + { + // Iterate scan steps + ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<0>()); + + if (SHARE_FINAL) + { + // Share partial into buffer + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial); + } + + return partial; + } + + + /// Inclusive prefix sum + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + const bool HAS_IDENTITY = Traits<T>::PRIMITIVE; + + // Initialize identity region + InitIdentity(Int2Type<HAS_IDENTITY>()); + + // Compute inclusive warp scan (has identity, don't share final) + output = BasicScan<HAS_IDENTITY, false>(input, Sum()); + } + + + /// Inclusive prefix sum with aggregate + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + const bool HAS_IDENTITY = Traits<T>::PRIMITIVE; + + // Initialize identity region + InitIdentity(Int2Type<HAS_IDENTITY>()); + + // Compute inclusive warp scan (has identity, share final) + output = BasicScan<HAS_IDENTITY, true>(input, Sum()); + + // Retrieve aggregate in <em>warp-lane</em><sub>0</sub> + warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); + } + + + /// Inclusive scan + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute inclusive warp scan (no identity, don't share final) + output = BasicScan<false, false>(input, scan_op); + } + + + /// Inclusive scan with aggregate + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + // Compute inclusive warp scan (no identity, share final) + output = BasicScan<false, true>(input, scan_op); + + // Retrieve aggregate + warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); + } + + /// Exclusive scan + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ T identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan operator + { + // Initialize identity region + ThreadStore<STORE_VOLATILE>(&temp_storage[warp_id][lane_id], identity); + + // Compute inclusive warp scan (identity, share final) + T inclusive = BasicScan<true, true>(input, scan_op); + + // Retrieve exclusive scan + output = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1]); + } + + + /// Exclusive scan with aggregate + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + // Exclusive warp scan (which does share final) + ExclusiveScan(input, output, identity, scan_op); + + // Retrieve aggregate + warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); + } + + + /// Exclusive scan without identity + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute inclusive warp scan (no identity, share final) + T inclusive = BasicScan<false, true>(input, scan_op); + + // Retrieve exclusive scan + output = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1]); + } + + + /// Exclusive scan with aggregate, without identity + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + // Exclusive warp scan (which does share final) + ExclusiveScan(input, output, scan_op); + + // Retrieve aggregate + warp_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/warp/warp_reduce.cuh b/lib/kokkos/TPL/cub/warp/warp_reduce.cuh new file mode 100644 index 000000000..548369da1 --- /dev/null +++ b/lib/kokkos/TPL/cub/warp/warp_reduce.cuh @@ -0,0 +1,677 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. + */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads.  + * + * \par Overview + * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARPS <b>[optional]</b> The number of entrant "logical" warps performing concurrent warp reductions. Default is 1. + * \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for 4 warps on type int + * typedef cub::WarpReduce<int, 4> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpReduce for one warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is <tt>0, 1, 2, 3, ..., 31</tt>. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). + * + * \par Usage and Performance Considerations + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * - Warp reductions are concurrent if more than one logical warp is participating + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (<b><em>vs.</em></b> generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + */ +template < + typename T, + int LOGICAL_WARPS = 1, + int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + enum + { + POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. 
Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two)) + typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO), + WarpReduceShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>, + WarpReduceSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpReduce; + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + +private: + + /// Shared memory storage layout type for WarpReduce + typedef typename InternalWarpReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Warp ID + int warp_id; + + /// Lane ID + int lane_id; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{WarpReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>. + * + */ + __device__ __forceinline__ WarpReduce() + : + temp_storage(PrivateStorage()), + warp_id((LOGICAL_WARPS == 1) ? + 0 : + threadIdx.x / LOGICAL_WARP_THREADS), + lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? + LaneId() : + threadIdx.x % LOGICAL_WARP_THREADS) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + warp_id((LOGICAL_WARPS == 1) ? + 0 : + threadIdx.x / LOGICAL_WARP_THREADS), + lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? + LaneId() : + threadIdx.x % LOGICAL_WARP_THREADS) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Threads are identified using the given warp and lane identifiers. + */ + __device__ __forceinline__ WarpReduce( + int warp_id, ///< [in] A suitable warp membership identifier + int lane_id) ///< [in] A lane identifier within the warp + : + temp_storage(PrivateStorage()), + warp_id(warp_id), + lane_id(lane_id) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Threads are identified using the given warp and lane identifiers. 
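+ *
+ * \par
+ * A minimal sketch, assuming a 128-thread block split into 8 logical warps of
+ * 16 threads each, with the identifiers computed by hand from <tt>threadIdx.x</tt>:
+ * \code
+ * // Specialize WarpReduce for 8 logical warps of 16 threads on type int
+ * typedef cub::WarpReduce<int, 8, 16> WarpReduce;
+ *
+ * // Allocate shared memory for WarpReduce
+ * __shared__ typename WarpReduce::TempStorage temp_storage;
+ *
+ * int warp_id     = threadIdx.x / 16;
+ * int lane_id     = threadIdx.x % 16;
+ * int thread_data = lane_id;
+ *
+ * // lane0 of each logical warp receives 0 + 1 + ... + 15 = 120
+ * int aggregate = WarpReduce(temp_storage, warp_id, lane_id).Sum(thread_data);
+ * \endcode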
+ */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int warp_id, ///< [in] A suitable warp membership identifier + int lane_id) ///< [in] A lane identifier within the warp + : + temp_storage(temp_storage.Alias()), + warp_id(warp_id), + lane_id(lane_id) + {} + + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a warp-wide sum in each active warp. The output is valid in warp <em>lane</em><sub>0</sub>. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for 4 warps on type int + * typedef cub::WarpReduce<int, 4> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<true, 1>(input, LOGICAL_WARP_THREADS); + } + + /** + * \brief Computes a partially-full warp-wide sum in each active warp. The output is valid in warp <em>lane</em><sub>0</sub>. + * + * All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * \smemreuse + * + * The code snippet below illustrates a sum reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for a single warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum( + * thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is + * undefined in other threads). 
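+ *
+ * \par
+ * A fuller sketch of the same pattern that also stores the result, assuming a
+ * single-warp launch and a hypothetical output pointer \p d_aggregate:
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void PartialWarpSum(int *d_data, int *d_aggregate, int valid_items)
+ * {
+ *     // Specialize WarpReduce for a single warp on type int
+ *     typedef cub::WarpReduce<int, 1> WarpReduce;
+ *
+ *     // Allocate shared memory for WarpReduce
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *
+ *     // Out-of-range lanes are ignored by the reduction; give them a neutral value anyway
+ *     int thread_data = (threadIdx.x < valid_items) ? d_data[threadIdx.x] : 0;
+ *
+ *     int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);
+ *
+ *     // Only lane0 holds a valid total
+ *     if (threadIdx.x == 0)
+ *         *d_aggregate = aggregate;
+ * }
+ * \endcode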
+ * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + if (valid_items >= LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<true, 1>(input, valid_items); + } + else + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum<false, 1>(input, valid_items); + } + } + + + /** + * \brief Computes a segmented sum in each active warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>). + * + * \smemreuse + * + * The code snippet below illustrates a head-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for a single warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( + * thread_data, head_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * + */ + template < + typename Flag> + __device__ __forceinline__ T HeadSegmentedSum( + T input, ///< [in] Calling thread's input + Flag head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return HeadSegmentedReduce(input, head_flag, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in each active warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>). + * + * \smemreuse + * + * The code snippet below illustrates a tail-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for a single warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( + * thread_data, tail_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. 
(and is undefined in other threads). + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + typename Flag> + __device__ __forceinline__ T TailSegmentedSum( + T input, ///< [in] Calling thread's input + Flag tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return TailSegmentedReduce(input, tail_flag, cub::Sum()); + } + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + /** + * \brief Computes a warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp max reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for 4 warps on type int + * typedef cub::WarpReduce<int, 4> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, 1, 2, 3, ..., 127</tt>. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, + * \p 95, and \p 127, respectively (and is undefined in other threads). + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op); + } + + /** + * \brief Computes a partially-full warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp <em>lane</em><sub>0</sub>. + * + * All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * The code snippet below illustrates a max reduction within a single, partially-full + * block of 32 threads (one warp). 
+ * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for a single warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max(), valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, ...</tt> and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is + * undefined in other threads). + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + if (valid_items >= LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<true, 1>(input, valid_items, reduction_op); + } + else + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce<false, 1>(input, valid_items, reduction_op); + } + } + + + /** + * \brief Computes a segmented reduction in each active warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * The code snippet below illustrates a head-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for a single warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( + * thread_data, head_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). 
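+ *
+ * \par
+ * A fuller sketch of the same pattern, assuming a single-warp launch,
+ * fixed-length segments of four items (as above), and hypothetical
+ * \p d_data / \p d_out pointers:
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void SegmentedWarpMax(int *d_data, int *d_out)
+ * {
+ *     // Specialize WarpReduce for a single warp on type int
+ *     typedef cub::WarpReduce<int, 1> WarpReduce;
+ *
+ *     // Allocate shared memory for WarpReduce
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *
+ *     int thread_data = d_data[threadIdx.x];
+ *     int head_flag   = (threadIdx.x % 4 == 0);   // a new segment starts every 4 lanes
+ *
+ *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+ *         thread_data, head_flag, cub::Max());
+ *
+ *     // Each segment head holds its segment's maximum
+ *     if (head_flag)
+ *         d_out[threadIdx.x / 4] = aggregate;
+ * }
+ * \endcode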
+ * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + typename ReductionOp, + typename Flag> + __device__ __forceinline__ T HeadSegmentedReduce( + T input, ///< [in] Calling thread's input + Flag head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<true>(input, head_flag, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in each active warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * The code snippet below illustrates a tail-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for a single warp on type int + * typedef cub::WarpReduce<int, 1> WarpReduce; + * + * // Allocate shared memory for WarpReduce + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( + * thread_data, tail_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is <tt>0, 1, 2, 3, ..., 31</tt> and is <tt>0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template < + typename ReductionOp, + typename Flag> + __device__ __forceinline__ T TailSegmentedReduce( + T input, ///< [in] Calling thread's input + Flag tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce<false>(input, tail_flag, reduction_op); + } + + + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/TPL/cub/warp/warp_scan.cuh b/lib/kokkos/TPL/cub/warp/warp_scan.cuh new file mode 100644 index 000000000..a588b52bd --- /dev/null +++ b/lib/kokkos/TPL/cub/warp/warp_scan.cuh @@ -0,0 +1,1297 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across CUDA warp threads. + */ + +#pragma once + +#include "specializations/warp_scan_shfl.cuh" +#include "specializations/warp_scan_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across CUDA warp threads.  + * + * \par Overview + * Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. <em>Prefix sum</em> + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input. + * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into + * the <em>i</em><sup>th</sup> output reduction. + * + * \tparam T The scan input/output element type + * \tparam LOGICAL_WARPS <b>[optional]</b> The number of "logical" warps performing concurrent warp scans. Default is 1. + * \tparam LOGICAL_WARP_THREADS <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). + * + * \par Simple Examples + * \warpcollective{WarpScan} + * \par + * The code snippet below illustrates four concurrent warp prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, 1, 1, ...</tt>. + * The corresponding output \p thread_data in each of the four warps of threads will be + * <tt>0, 1, 2, 3, ..., 31</tt>. + * + * \par + * The code snippet below illustrates a single warp prefix sum within a block of + * 128 threads. + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for one warp on type int + * typedef cub::WarpScan<int, 1> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a prefix sum + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is <tt>1, 1, 1, 1, ...</tt>. + * The corresponding output \p thread_data will be <tt>0, 1, 2, 3, ..., 31</tt>. + * + * \par Usage and Performance Considerations + * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * - Warp scans are concurrent if more than one warp is participating + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Zero bank conflicts for most types. + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (<b><em>vs.</em></b> generic scan) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + */ +template < + typename T, + int LOGICAL_WARPS = 1, + int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS> +class WarpScan +{ +private: + + /****************************************************************************** + * Constants and typedefs + ******************************************************************************/ + + enum + { + POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + }; + + /// Internal specialization. 
Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two)) + typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO), + WarpScanShfl<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS>, + WarpScanSmem<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> >::Type InternalWarpScan; + + /// Shared memory storage layout type for WarpScan + typedef typename InternalWarpScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Warp ID + int warp_id; + + /// Lane ID + int lane_id; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{WarpScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ WarpScan() + : + temp_storage(PrivateStorage()), + warp_id((LOGICAL_WARPS == 1) ? + 0 : + threadIdx.x / LOGICAL_WARP_THREADS), + lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? + LaneId() : + threadIdx.x % LOGICAL_WARP_THREADS) + {} + + + /** + * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>. + */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + warp_id((LOGICAL_WARPS == 1) ? + 0 : + threadIdx.x / LOGICAL_WARP_THREADS), + lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? + LaneId() : + threadIdx.x % LOGICAL_WARP_THREADS) + {} + + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Threads are identified using the given warp and lane identifiers. + */ + __device__ __forceinline__ WarpScan( + int warp_id, ///< [in] A suitable warp membership identifier + int lane_id) ///< [in] A lane identifier within the warp + : + temp_storage(PrivateStorage()), + warp_id(warp_id), + lane_id(lane_id) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Threads are identified using the given warp and lane identifiers. 
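+ *
+ * \par
+ * A minimal sketch, assuming a 128-thread block of 4 logical warps with the
+ * identifiers computed by hand from <tt>threadIdx.x</tt>:
+ * \code
+ * // Specialize WarpScan for 4 warps on type int
+ * typedef cub::WarpScan<int, 4> WarpScan;
+ *
+ * // Allocate shared memory for WarpScan
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * int warp_id     = threadIdx.x / 32;
+ * int lane_id     = threadIdx.x % 32;
+ * int thread_data = 1;
+ *
+ * // thread_data becomes 0, 1, 2, ..., 31 within each logical warp
+ * WarpScan(temp_storage, warp_id, lane_id).ExclusiveSum(thread_data, thread_data);
+ * \endcode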
+ */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage + int warp_id, ///< [in] A suitable warp membership identifier + int lane_id) ///< [in] A lane identifier within the warp + : + temp_storage(temp_storage.Alias()), + warp_id(warp_id), + lane_id(lane_id) + {} + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive prefix sum in each logical warp. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * WarpScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, 1, 1, ...</tt>. + * The corresponding output \p thread_data in each of the four warps of threads will be + * <tt>1, 2, 3, ..., 32</tt>. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveSum(input, output); + } + + + /** + * \brief Computes an inclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * The \p warp_aggregate is undefined in threads other than <em>warp-lane</em><sub>0</sub>. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_aggregate; + * WarpScan(temp_storage).InclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, 1, 1, ...</tt>. + * The corresponding output \p thread_data in each of the four warps of threads will be + * <tt>1, 2, 3, ..., 32</tt>. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ { + InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate); + } + + + /** + * \brief Computes an inclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * The \p warp_aggregate is undefined in threads other than <em>warp-lane</em><sub>0</sub>. + * + * The \p warp_prefix_op functor must implement a member function <tt>T operator()(T warp_aggregate)</tt>. + * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the entire warp of threads, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the threadblock-wide prefix. Can be stateful. + * + * \smemreuse + * + * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 32 integer items that are partitioned across the warp. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct WarpPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the entire warp. Lane-0 is responsible + * // for returning a value for seeding the warp-wide scan. + * __device__ int operator()(int warp_aggregate) + * { + * int old_prefix = running_total; + * running_total += warp_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize WarpScan for one warp + * typedef cub::WarpScan<int, 1> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Initialize running total + * WarpPrefixOp prefix_op(0); + * + * // Have the warp iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 32) + * { + * // Load a segment of consecutive items + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the warp-wide inclusive prefix sum + * int warp_aggregate; + * WarpScan(temp_storage).InclusiveSum( + * thread_data, thread_data, warp_aggregate, prefix_op); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>. + * The corresponding output for the first segment will be <tt>1, 2, 3, ..., 32</tt>. + * The output for the second segment will be <tt>33, 34, 35, ..., 64</tt>. Furthermore, + * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan. + * + * \tparam WarpPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T warp_aggregate)</tt> + */ + template <typename WarpPrefixOp> + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ T &warp_aggregate, ///< [out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items, exclusive of the \p warp_prefix_op value + WarpPrefixOp &warp_prefix_op) ///< [in-out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Call-back functor for specifying a warp-wide prefix to be applied to all inputs. + { + // Compute inclusive warp scan + InclusiveSum(input, output, warp_aggregate); + + // Compute warp-wide prefix from aggregate, then broadcast to other lanes + T prefix; + prefix = warp_prefix_op(warp_aggregate); + prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); + + // Update output + output = prefix + output; + } + + //@} end member group + +private: + + /// Computes an exclusive prefix sum in each logical warp. + __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<true> is_primitive) + { + // Compute exclusive warp scan from inclusive warp scan + T inclusive; + InclusiveSum(input, inclusive); + output = inclusive - input; + } + + /// Computes an exclusive prefix sum in each logical warp. Specialized for non-primitive types. + __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<false> is_primitive) + { + // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction) + T identity = T(); + ExclusiveScan(input, output, identity, Sum()); + } + + /// Computes an exclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<true> is_primitive) + { + // Compute exclusive warp scan from inclusive warp scan + T inclusive; + InclusiveSum(input, inclusive, warp_aggregate); + output = inclusive - input; + } + + /// Computes an exclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. Specialized for non-primitive types. + __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<false> is_primitive) + { + // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction) + T identity = T(); + ExclusiveScan(input, output, identity, Sum(), warp_aggregate); + } + + /// Computes an exclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + template <typename WarpPrefixOp> + __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixOp &warp_prefix_op, Int2Type<true> is_primitive) + { + // Compute exclusive warp scan from inclusive warp scan + T inclusive; + InclusiveSum(input, inclusive, warp_aggregate, warp_prefix_op); + output = inclusive - input; + } + + /// Computes an exclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. Specialized for non-primitive types. 
+ template <typename WarpPrefixOp> + __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixOp &warp_prefix_op, Int2Type<false> is_primitive) + { + // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction) + T identity = T(); + ExclusiveScan(input, output, identity, Sum(), warp_aggregate, warp_prefix_op); + } + +public: + + + /******************************************************************//** + * \name Exclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix sum in each logical warp. + * + * This operation assumes the value of obtained by the <tt>T</tt>'s default + * constructor (or by zero-initialization if no user-defined default + * constructor exists) is suitable as the identity value "zero" for + * addition. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, 1, 1, ...</tt>. + * The corresponding output \p thread_data in each of the four warps of threads will be + * <tt>0, 1, 2, ..., 31</tt>. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + ExclusiveSum(input, output, Int2Type<Traits<T>::PRIMITIVE>()); + } + + + /** + * \brief Computes an exclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * This operation assumes the value of obtained by the <tt>T</tt>'s default + * constructor (or by zero-initialization if no user-defined default + * constructor exists) is suitable as the identity value "zero" for + * addition. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_aggregate; + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, 1, 1, ...</tt>. + * The corresponding output \p thread_data in each of the four warps of threads will be + * <tt>0, 1, 2, ..., 31</tt>. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. 
+ */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + ExclusiveSum(input, output, warp_aggregate, Int2Type<Traits<T>::PRIMITIVE>()); + } + + + /** + * \brief Computes an exclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * This operation assumes the value of obtained by the <tt>T</tt>'s default + * constructor (or by zero-initialization if no user-defined default + * constructor exists) is suitable as the identity value "zero" for + * addition. + * + * The \p warp_prefix_op functor must implement a member function <tt>T operator()(T warp_aggregate)</tt>. + * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the entire warp of threads, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the threadblock-wide prefix. Can be stateful. + * + * \smemreuse + * + * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 32 integer items that are partitioned across the warp. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct WarpPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the entire warp. Lane-0 is responsible + * // for returning a value for seeding the warp-wide scan. + * __device__ int operator()(int warp_aggregate) + * { + * int old_prefix = running_total; + * running_total += warp_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize WarpScan for one warp + * typedef cub::WarpScan<int, 1> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Initialize running total + * WarpPrefixOp prefix_op(0); + * + * // Have the warp iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 32) + * { + * // Load a segment of consecutive items + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the warp-wide exclusive prefix sum + * int warp_aggregate; + * WarpScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, warp_aggregate, prefix_op); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>. + * The corresponding output for the first segment will be <tt>0, 1, 2, ..., 31</tt>. + * The output for the second segment will be <tt>32, 33, 34, ..., 63</tt>. Furthermore, + * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan. 
+ * + * \tparam WarpPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T warp_aggregate)</tt> + */ + template <typename WarpPrefixOp> + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate, ///< [out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). + WarpPrefixOp &warp_prefix_op) ///< [in-out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Call-back functor for specifying a warp-wide prefix to be applied to all inputs. + { + ExclusiveSum(input, output, warp_aggregate, warp_prefix_op, Int2Type<Traits<T>::PRIMITIVE>()); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an inclusive prefix sum using the specified binary scan functor in each logical warp. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * WarpScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. + * The corresponding output \p thread_data in the first warp would be + * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an inclusive prefix sum using the specified binary scan functor in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Compute inclusive warp-wide prefix max scans + * int warp_aggregate; + * WarpScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. + * The corresponding output \p thread_data in the first warp would be + * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate); + } + + + /** + * \brief Computes an inclusive prefix sum using the specified binary scan functor in each logical warp. The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * The \p warp_prefix_op functor must implement a member function <tt>T operator()(T warp_aggregate)</tt>. + * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the entire warp of threads, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the threadblock-wide prefix. Can be stateful. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 32 integer items that are partitioned across the warp. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct WarpPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the entire warp. Lane-0 is responsible + * // for returning a value for seeding the warp-wide scan. + * __device__ int operator()(int warp_aggregate) + * { + * int old_prefix = running_total; + * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize WarpScan for one warp + * typedef cub::WarpScan<int, 1> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Initialize running total + * WarpPrefixOp prefix_op(0); + * + * // Have the warp iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 32) + * { + * // Load a segment of consecutive items + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the warp-wide inclusive prefix max scan + * int warp_aggregate; + * WarpScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate, prefix_op); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 30, 30</tt>. + * The output for the second segment will be <tt>32, 32, 34, 34, ..., 62, 62</tt>. Furthermore, + * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second + * scan, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam WarpPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T warp_aggregate)</tt> + */ + template < + typename ScanOp, + typename WarpPrefixOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate, ///< [out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). + WarpPrefixOp &warp_prefix_op) ///< [in-out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Call-back functor for specifying a warp-wide prefix to be applied to all inputs. + { + // Compute inclusive warp scan + InclusiveScan(input, output, scan_op, warp_aggregate); + + // Compute warp-wide prefix from aggregate, then broadcast to other lanes + T prefix; + prefix = warp_prefix_op(warp_aggregate); + prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); + + // Update output + output = scan_op(prefix, output); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Compute exclusive warp-wide prefix max scans + * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. + * The corresponding output \p thread_data in the first warp would be + * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. + * The corresponding output \p thread_data in the first warp would be + * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * The \p warp_prefix_op functor must implement a member function <tt>T operator()(T warp_aggregate)</tt>. 
+ * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the entire warp of threads, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the threadblock-wide prefix. Can be stateful. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 32 integer items that are partitioned across the warp. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct WarpPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the entire warp. Lane-0 is responsible + * // for returning a value for seeding the warp-wide scan. + * __device__ int operator()(int warp_aggregate) + * { + * int old_prefix = running_total; + * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize WarpScan for one warp + * typedef cub::WarpScan<int, 1> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Initialize running total + * WarpPrefixOp prefix_op(INT_MIN); + * + * // Have the warp iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 32) + * { + * // Load a segment of consecutive items + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the warp-wide exclusive prefix max scan + * int warp_aggregate; + * WarpScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>. + * The output for the second segment will be <tt>30, 32, 32, 34, ..., 60, 62</tt>. Furthermore, + * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second + * scan, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam WarpPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T warp_aggregate)</tt> + */ + template < + typename ScanOp, + typename WarpPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate, ///< [out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). 
+ WarpPrefixOp &warp_prefix_op) ///< [in-out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Call-back functor for specifying a warp-wide prefix to be applied to all inputs. + { + // Exclusive warp scan + ExclusiveScan(input, output, identity, scan_op, warp_aggregate); + + // Compute warp-wide prefix from aggregate, then broadcast to other lanes + T prefix = warp_prefix_op(warp_aggregate); + prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); + + // Update output + output = (lane_id == 0) ? + prefix : + scan_op(prefix, output); + } + + + //@} end member group + /******************************************************************//** + * \name Identityless exclusive prefix scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. Because no identity value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. + * The corresponding output \p thread_data in the first warp would be + * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc. + * (The output \p thread_data in each warp lane0 is undefined.) + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. Because no identity value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include <cub/cub.cuh> + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for 4 warps on type int + * typedef cub::WarpScan<int, 4> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>. + * The corresponding output \p thread_data in the first warp would be + * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc. + * (The output \p thread_data in each warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + */ + template <typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. The \p warp_prefix_op value from thread-thread-lane<sub>0</sub> is applied to all scan outputs. Also computes the warp-wide \p warp_aggregate of all inputs for thread-thread-lane<sub>0</sub>. + * + * The \p warp_prefix_op functor must implement a member function <tt>T operator()(T warp_aggregate)</tt>. + * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the entire warp of threads, however only the return value from + * <em>lane</em><sub>0</sub> is applied as the threadblock-wide prefix. Can be stateful. + * + * Supports non-commutative scan operators. + * + * \smemreuse + * + * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 32 integer items that are partitioned across the warp. + * \par + * \code + * #include <cub/cub.cuh> + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct WarpPrefixOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the entire warp. Lane-0 is responsible + * // for returning a value for seeding the warp-wide scan. + * __device__ int operator()(int warp_aggregate) + * { + * int old_prefix = running_total; + * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize WarpScan for one warp + * typedef cub::WarpScan<int, 1> WarpScan; + * + * // Allocate shared memory for WarpScan + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Initialize running total + * WarpPrefixOp prefix_op(INT_MIN); + * + * // Have the warp iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 32) + * { + * // Load a segment of consecutive items + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the warp-wide exclusive prefix max scan + * int warp_aggregate; + * WarpScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>. + * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>. + * The output for the second segment will be <tt>30, 32, 32, 34, ..., 60, 62</tt>. Furthermore, + * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second + * scan, etc. + * + * \tparam ScanOp <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt> + * \tparam WarpPrefixOp <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T warp_aggregate)</tt> + */ + template < + typename ScanOp, + typename WarpPrefixOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate, ///< [out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). + WarpPrefixOp &warp_prefix_op) ///< [in-out] <b>[<em>warp-lane</em><sub>0</sub> only]</b> Call-back functor for specifying a warp-wide prefix to be applied to all inputs. + { + // Exclusive warp scan + ExclusiveScan(input, output, scan_op, warp_aggregate); + + // Compute warp-wide prefix from aggregate, then broadcast to other lanes + T prefix = warp_prefix_op(warp_aggregate); + prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); + + // Update output with prefix + output = (lane_id == 0) ? + prefix : + scan_op(prefix, output); + } + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp new file mode 100644 index 000000000..b7f83694f --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -0,0 +1,411 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITSET_HPP +#define KOKKOS_BITSET_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Functional.hpp> +#include <Kokkos_View.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_Pair.hpp> + +#include <impl/Kokkos_Bitset_impl.hpp> + +#include <stdexcept> + +namespace Kokkos { + +template <typename Device> +class Bitset; + +template <typename Device> +class ConstBitset; + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src); + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); + +template <typename DstDevice, typename SrcDevice> +void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); + + +/// A thread safe bitset +template <typename Device> +class Bitset +{ +public: + typedef Device device_type; + typedef unsigned size_type; + + enum { BIT_SCAN_REVERSE = 1u }; + enum { MOVE_HINT_BACKWARD = 2u }; + + enum { + BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u + , BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE + , BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD + , BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD + }; + +private: + enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) }; + enum { block_mask = block_size-1u }; + enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) }; + +public: + + + Bitset(unsigned arg_size = 0u) + : m_size(arg_size) + , m_last_block_mask(0u) + , m_blocks("Bitset", ((m_size + block_mask) >> block_shift) ) + { + for (int i=0, end = static_cast<int>(m_size & block_mask); i < end; ++i) { + m_last_block_mask |= 1u << i; + } + } + + Bitset<Device> & operator = (Bitset<Device> const & rhs) + { + this->m_size = rhs.m_size; + this->m_last_block_mask = rhs.m_last_block_mask; + this->m_blocks = rhs.m_blocks; + + return *this; + } + + Bitset( Bitset<Device> const & rhs) + : m_size( rhs.m_size ) + , m_last_block_mask( rhs.m_last_block_mask ) + , m_blocks( rhs.m_blocks ) + {} + + KOKKOS_FORCEINLINE_FUNCTION + unsigned size() const + { return 
m_size; } + + unsigned count() const + { + Impl::BitsetCount< Bitset<Device> > f(*this); + return f.apply(); + } + + void set() + { + Kokkos::deep_copy(m_blocks, ~0u ); + + if (m_last_block_mask) { + //clear the unused bits in the last block + typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy; + raw_deep_copy( m_blocks.ptr_on_device() + (m_blocks.size() -1u), &m_last_block_mask, sizeof(unsigned)); + } + } + + void reset() + { + Kokkos::deep_copy(m_blocks, 0u ); + } + + void clear() + { + Kokkos::deep_copy(m_blocks, 0u ); + } + + + KOKKOS_FORCEINLINE_FUNCTION + bool set( unsigned i ) const + { + if ( i < m_size ) { + unsigned * block_ptr = &m_blocks[ i >> block_shift ]; + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + + return !( atomic_fetch_or( block_ptr, mask ) & mask ); + } + return false; + } + + KOKKOS_FORCEINLINE_FUNCTION + bool reset( unsigned i ) const + { + if ( i < m_size ) { + unsigned * block_ptr = &m_blocks[ i >> block_shift ]; + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + + return atomic_fetch_and( block_ptr, ~mask ) & mask; + } + return false; + } + + KOKKOS_FORCEINLINE_FUNCTION + bool test( unsigned i ) const + { + if ( i < m_size ) { + const unsigned block = volatile_load(&m_blocks[ i >> block_shift ]); + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + return block & mask; + } + return false; + } + + KOKKOS_FORCEINLINE_FUNCTION + unsigned max_hint() const + { + return m_blocks.size(); + } + + KOKKOS_INLINE_FUNCTION + Kokkos::pair<bool, unsigned> find_any_set_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const + { + const unsigned block_idx = (hint >> block_shift) < m_blocks.size() ? (hint >> block_shift) : 0; + const unsigned offset = hint & block_mask; + unsigned block = volatile_load(&m_blocks[ block_idx ]); + block = !m_last_block_mask || (block_idx < (m_blocks.size()-1)) ? block : block & m_last_block_mask ; + + return find_any_helper(block_idx, offset, block, scan_direction); + } + + KOKKOS_INLINE_FUNCTION + Kokkos::pair<bool, unsigned> find_any_unset_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const + { + const unsigned block_idx = hint >> block_shift; + const unsigned offset = hint & block_mask; + unsigned block = volatile_load(&m_blocks[ block_idx ]); + block = !m_last_block_mask || (block_idx < (m_blocks.size()-1) ) ? ~block : ~block & m_last_block_mask ; + + return find_any_helper(block_idx, offset, block, scan_direction); + } + +private: + + KOKKOS_FORCEINLINE_FUNCTION + Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx, unsigned offset, unsigned block, unsigned scan_direction) const + { + Kokkos::pair<bool, unsigned> result( block > 0u, 0); + + if (!result.first) { + result.second = update_hint( block_idx, offset, scan_direction ); + } + else { + result.second = scan_block( (block_idx << block_shift) + , offset + , block + , scan_direction + ); + } + return result; + } + + + KOKKOS_FORCEINLINE_FUNCTION + unsigned scan_block(unsigned block_start, int offset, unsigned block, unsigned scan_direction ) const + { + offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; + block = Impl::rotate_right(block, offset); + return ((( !(scan_direction & BIT_SCAN_REVERSE) ? 
+ Impl::bit_scan_forward(block) : + Impl::bit_scan_reverse(block) + ) + offset + ) & block_mask + ) + block_start; + } + + KOKKOS_FORCEINLINE_FUNCTION + unsigned update_hint( long long block_idx, unsigned offset, unsigned scan_direction ) const + { + block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1; + block_idx = block_idx >= 0 ? block_idx : m_blocks.size() - 1; + block_idx = block_idx < static_cast<long long>(m_blocks.size()) ? block_idx : 0; + + return static_cast<unsigned>(block_idx)*block_size + offset; + } + +private: + + unsigned m_size; + unsigned m_last_block_mask; + View< unsigned *, device_type, MemoryTraits<RandomAccess> > m_blocks; + +private: + template <typename DDevice> + friend class Bitset; + + template <typename DDevice> + friend class ConstBitset; + + template <typename Bitset> + friend struct Impl::BitsetCount; + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src); + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); +}; + +template <typename Device> +class ConstBitset +{ +public: + typedef Device device_type; + typedef unsigned size_type; + +private: + enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) }; + enum { block_mask = block_size -1u }; + enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) }; + +public: + ConstBitset() + : m_size (0) + {} + + ConstBitset(Bitset<Device> const& rhs) + : m_size(rhs.m_size) + , m_blocks(rhs.m_blocks) + {} + + ConstBitset(ConstBitset<Device> const& rhs) + : m_size( rhs.m_size ) + , m_blocks( rhs.m_blocks ) + {} + + ConstBitset<Device> & operator = (Bitset<Device> const & rhs) + { + this->m_size = rhs.m_size; + this->m_blocks = rhs.m_blocks; + + return *this; + } + + ConstBitset<Device> & operator = (ConstBitset<Device> const & rhs) + { + this->m_size = rhs.m_size; + this->m_blocks = rhs.m_blocks; + + return *this; + } + + + KOKKOS_FORCEINLINE_FUNCTION + unsigned size() const + { + return m_size; + } + + unsigned count() const + { + Impl::BitsetCount< ConstBitset<Device> > f(*this); + return f.apply(); + } + + KOKKOS_FORCEINLINE_FUNCTION + bool test( unsigned i ) const + { + if ( i < m_size ) { + const unsigned block = m_blocks[ i >> block_shift ]; + const unsigned mask = 1u << static_cast<int>( i & block_mask ); + return block & mask; + } + return false; + } + +private: + + unsigned m_size; + View< const unsigned *, device_type, MemoryTraits<RandomAccess> > m_blocks; + +private: + template <typename DDevice> + friend class ConstBitset; + + template <typename Bitset> + friend struct Impl::BitsetCount; + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); + + template <typename DstDevice, typename SrcDevice> + friend void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src); +}; + + +template <typename DstDevice, typename SrcDevice> +void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src) +{ + if (dst.size() != src.size()) { + throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!"); + } + + typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy; + raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.size()); +} + +template <typename DstDevice, typename SrcDevice> 
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src) +{ + if (dst.size() != src.size()) { + throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!"); + } + + typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy; + raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.size()); +} + +template <typename DstDevice, typename SrcDevice> +void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src) +{ + if (dst.size() != src.size()) { + throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!"); + } + + typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy; + raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.size()); +} + +} // namespace Kokkos + +#endif //KOKKOS_BITSET_HPP diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp new file mode 100644 index 000000000..432912bbe --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -0,0 +1,636 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_DualView.hpp +/// \brief Declaration and definition of Kokkos::DualView. +/// +/// This header file declares and defines Kokkos::DualView and its +/// related nonmember functions. 
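Before the DualView declarations begin, a brief editorial sketch of the Kokkos::Bitset interface defined in Kokkos_Bitset.hpp above may help; it is an illustration, not part of the patch. It assumes Kokkos has already been initialized and that the Device placeholder is a host-accessible execution space (e.g. Kokkos::OpenMP); on a GPU space the per-bit set()/test()/reset() calls would normally be issued from inside kernels instead. The helper name bitset_demo is purely illustrative.

// Editorial sketch, not part of the patch: minimal host-side Bitset usage.
#include <Kokkos_Bitset.hpp>

template <typename Device>
void bitset_demo()
{
  Kokkos::Bitset<Device> marks(1000);      // 1000 bits, all initially unset

  marks.set(17);                           // atomic set; returns true if the bit was previously unset
  marks.set(999);

  // Scan for a set bit starting from a hint; returns (found, index).
  Kokkos::pair<bool, unsigned> hit = marks.find_any_set_near(0);
  // hit.first == true and hit.second == 17 here

  unsigned n = marks.count();              // parallel population count (2 here)
  marks.reset(17);                         // atomic clear of bit 17

  // ConstBitset is a shallow, read-only view of the same blocks.
  Kokkos::ConstBitset<Device> frozen(marks);
  bool still_set = frozen.test(999);       // true

  (void) n; (void) still_set;
}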
+ +#ifndef KOKKOS_DUALVIEW_HPP +#define KOKKOS_DUALVIEW_HPP + +#include <Kokkos_View.hpp> + +namespace Kokkos { + +/* \class DualView + * \brief Container to manage mirroring a Kokkos::View that lives + * in device memory with a Kokkos::View that lives in host memory. + * + * This class provides capabilities to manage data which exists in two + * memory spaces at the same time. It keeps views of the same layout + * on two memory spaces as well as modified flags for both + * allocations. Users are responsible for setting the modified flags + * manually if they change the data in either memory space, by calling + * the sync() method templated on the device where they modified the + * data. Users may synchronize data by calling the modify() function, + * templated on the device towards which they want to synchronize + * (i.e., the target of the one-way copy operation). + * + * The DualView class also provides convenience methods such as + * realloc, resize and capacity which call the appropriate methods of + * the underlying Kokkos::View objects. + * + * The four template arguments are the same as those of Kokkos::View. + * (Please refer to that class' documentation for a detailed + * description.) + * + * \tparam DataType The type of the entries stored in the container. + * + * \tparam Layout The array's layout in memory. + * + * \tparam Device The Kokkos Device type. If its memory space is + * not the same as the host's memory space, then DualView will + * contain two separate Views: one in device memory, and one in + * host memory. Otherwise, DualView will only store one View. + * + * \tparam MemoryTraits (optional) The user's intended memory access + * behavior. Please see the documentation of Kokkos::View for + * examples. The default suffices for most users. + */ +template< class T , class L , class D, class M = MemoryManaged> +class DualView { +public: + //! \name Typedefs for device types and various Kokkos::View specializations. + //@{ + + //! The Kokkos Device type; same as the \c Device template parameter. + typedef D device_type; + //! The host mirror Kokkos Device type. + typedef typename D::host_mirror_device_type host_mirror_device_type; + + //! The type of a Kokkos::View on the device. + typedef Kokkos::View<T,L,D,M> t_dev ; + + /// \typedef t_host + /// \brief The type of a Kokkos::View host mirror of \c t_dev. +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + typedef t_dev t_host; +#else + typedef typename t_dev::HostMirror t_host ; +#endif + + //! The type of a const View on the device. + typedef Kokkos::View<typename t_dev::const_data_type,L,D,M> t_dev_const; + + /// \typedef t_host_const + /// \brief The type of a const View host mirror of \c t_dev_const. +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + typedef t_dev_const t_host_const; +#else + typedef typename t_dev_const::HostMirror t_host_const; +#endif + + //! The type of a const, random-access View on the device. + typedef Kokkos::View<typename t_dev::const_data_type,L,D,Kokkos::MemoryRandomAccess> t_dev_const_randomread ; + + /// \typedef t_host_const_randomread + /// \brief The type of a const, random-access View host mirror of + /// \c t_dev_const_randomread. +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + typedef t_dev_const_randomread t_host_const_randomread; +#else + typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread; +#endif + + //! 
The type of an unmanaged View on the device. + typedef Kokkos::View<T, L, D, Kokkos::MemoryUnmanaged> t_dev_um; + //! The type of an unmanaged View host mirror of \c t_dev_um. + typedef Kokkos::View<typename t_host::data_type, + typename t_host::array_layout, + typename t_host::device_type, + Kokkos::MemoryUnmanaged> t_host_um; + + //! The type of a const unmanaged View on the device. + typedef Kokkos::View<typename t_dev::const_data_type, L, D, + Kokkos::MemoryUnmanaged> t_dev_const_um; + //! The type of a const unmanaged View host mirror of \c t_dev_const_um. + typedef Kokkos::View<typename t_host::const_data_type, + typename t_host::array_layout, + typename t_host::device_type, + Kokkos::MemoryUnmanaged> t_host_const_um; + //@} + //! \name The same typedefs as a View for scalar, data, and value types. + //@{ + + typedef typename t_dev::value_type value_type; + typedef typename t_dev::const_value_type const_value_type; + typedef typename t_dev::non_const_value_type non_const_value_type; + + //@} + //! \name The two View instances. + //@{ + + t_dev d_view; + t_host h_view; + + //@} + //! \name Counters to keep track of changes ("modified" flags) + //@{ + + View<unsigned int,LayoutLeft,host_mirror_device_type> modified_device; + View<unsigned int,LayoutLeft,host_mirror_device_type> modified_host; + + //@} + //! \name Constructors + //@{ + + /// \brief Empty constructor. + /// + /// Both device and host View objects are constructed using their + /// default constructors. The "modified" flags are both initialized + /// to "unmodified." + DualView () : + modified_device (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_device")), + modified_host (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_host")) + {} + + /// \brief Constructor that allocates View objects on both host and device. + /// + /// This constructor works like the analogous constructor of View. + /// The first argument is a string label, which is entirely for your + /// benefit. (Different DualView objects may have the same label if + /// you like.) The arguments that follow are the dimensions of the + /// View objects. For example, if the View has three dimensions, + /// the first three integer arguments will be nonzero, and you may + /// omit the integer arguments that follow. + DualView (const std::string& label, + const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0, + const size_t n7 = 0) + : d_view (label, n0, n1, n2, n3, n4, n5, n6, n7) +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + , h_view (d_view) // with UVM, host View is _always_ a shallow copy +#else + , h_view (create_mirror_view (d_view)) // without UVM, host View mirrors +#endif + , modified_device (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_device")) + , modified_host (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_host")) + {} + + //! Copy constructor (shallow copy) + template<class SS, class LS, class DS, class MS> + DualView (const DualView<SS,LS,DS,MS>& src) : + d_view (src.d_view), + h_view (src.h_view), + modified_device (src.modified_device), + modified_host (src.modified_host) + {} + + /// \brief Create DualView from existing device and host View objects. + /// + /// This constructor assumes that the device and host View objects + /// are synchronized. 
You, the caller, are responsible for making + /// sure this is the case before calling this constructor. After + /// this constructor returns, you may use DualView's sync() and + /// modify() methods to ensure synchronization of the View objects. + /// + /// \param d_view_ Device View + /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror) + DualView (const t_dev& d_view_, const t_host& h_view_) : + d_view (d_view_), + h_view (h_view_), + modified_device (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_device")), + modified_host (View<unsigned int,LayoutLeft,host_mirror_device_type> ("DualView::modified_host")) + { + Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ()); + } + + //@} + //! \name Methods for synchronizing, marking as modified, and getting Views. + //@{ + + /// \brief Return a View on a specific device \c Device. + /// + /// Please don't be afraid of the if_c expression in the return + /// value's type. That just tells the method what the return type + /// should be: t_dev if the \c Device template parameter matches + /// this DualView's device type, else t_host. + /// + /// For example, suppose you create a DualView on Cuda, like this: + /// \code + /// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type; + /// dual_view_type DV ("my dual view", 100); + /// \endcode + /// If you want to get the CUDA device View, do this: + /// \code + /// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> (); + /// \endcode + /// and if you want to get the host mirror of that View, do this: + /// \code + /// typedef typename Kokkos::Cuda::host_mirror_device_type host_device_type; + /// typename dual_view_type::t_host hostView = DV.view<host_device_type> (); + /// \endcode + template< class Device > + const typename Kokkos::Impl::if_c< + Kokkos::Impl::is_same<typename t_dev::memory_space, + typename Device::memory_space>::value, + t_dev, + t_host>::type view () const + { + return Kokkos::Impl::if_c< + Kokkos::Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value, + t_dev, + t_host >::select (d_view , h_view); + } + + /// \brief Update data on device or host only if data in the other + /// space has been marked as modified. + /// + /// If \c Device is the same as this DualView's device type, then + /// copy data from host to device. Otherwise, copy data from device + /// to host. In either case, only copy if the source of the copy + /// has been modified. + /// + /// This is a one-way synchronization only. If the target of the + /// copy has been modified, this operation will discard those + /// modifications. It will also reset both device and host modified + /// flags. + /// + /// \note This method doesn't know on its own whether you modified + /// the data in either View. You must manually mark modified data + /// as modified, by calling the modify() method with the + /// appropriate template parameter. 
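+  ///
+  /// A hedged usage sketch (the element type, device, and variable names
+  /// below are illustrative, not part of this header):
+  /// \code
+  /// typedef Kokkos::DualView<float*, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type;
+  /// typedef typename Kokkos::Cuda::host_mirror_device_type host_device_type;
+  /// dual_view_type dv ("my dual view", 100);
+  /// // ... fill dv.h_view on the host, then mark the host data as modified ...
+  /// dv.modify<host_device_type> ();
+  /// // ... and copy host -> device; only the side marked as modified is copied.
+  /// dv.sync<Kokkos::Cuda> ();
+  /// \endcode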
+ template<class Device> + void sync () { + const unsigned int dev = + Kokkos::Impl::if_c< + Kokkos::Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value , + unsigned int, + unsigned int>::select (1, 0); + + if (dev) { // if Device is the same as DualView's device type + if ((modified_host () > 0) && (modified_host () >= modified_device ())) { + Kokkos::deep_copy (d_view, h_view); + modified_host() = modified_device() = 0; + } + } else { // hopefully Device is the same as DualView's host type + if ((modified_device () > 0) && (modified_device () >= modified_host ())) { + Kokkos::deep_copy (h_view, d_view); + modified_host() = modified_device() = 0; + } + } + } + + /// \brief Mark data as modified on the given device \c Device. + /// + /// If \c Device is the same as this DualView's device type, then + /// mark the device's data as modified. Otherwise, mark the host's + /// data as modified. + template<class Device> + void modify () { + const unsigned int dev = + Kokkos::Impl::if_c< + Kokkos::Impl::is_same< + typename t_dev::memory_space, + typename Device::memory_space>::value, + unsigned int, + unsigned int>::select (1, 0); + + if (dev) { // if Device is the same as DualView's device type + // Increment the device's modified count. + modified_device () = (modified_device () > modified_host () ? + modified_device () : modified_host ()) + 1; + } else { // hopefully Device is the same as DualView's host type + // Increment the host's modified count. + modified_host () = (modified_device () > modified_host () ? + modified_device () : modified_host ()) + 1; + } + } + + //@} + //! \name Methods for reallocating or resizing the View objects. + //@{ + + /// \brief Reallocate both View objects. + /// + /// This discards any existing contents of the objects, and resets + /// their modified flags. It does <i>not</i> copy the old contents + /// of either View into the new View objects. + void realloc( const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) { + Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7); +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + h_view = d_view ; +#else + h_view = create_mirror_view( d_view ); +#endif + /* Reset dirty flags */ + modified_device() = modified_host() = 0; + } + + /// \brief Resize both views, copying old contents into new if necessary. + /// + /// This method only copies the old contents into the new View + /// objects for the device which was last marked as modified. 
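+  ///
+  /// A hedged sketch (the DualView "dv" is illustrative): if the device copy
+  /// was marked as modified last, only the device contents survive a resize.
+  /// \code
+  /// dv.modify<Kokkos::Cuda> ();  // device data is the current one
+  /// dv.resize (200);             // device contents are copied into the new allocation
+  /// \endcode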
+ void resize( const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) { + if(modified_device() >= modified_host()) { + /* Resize on Device */ + Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7); +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + h_view = d_view ; +#else + h_view = create_mirror_view( d_view ); +#endif + + /* Mark Device copy as modified */ + modified_device() = modified_device()+1; + + } else { + /* Realloc on Device */ + + Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7); +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + t_host temp_view = d_view ; +#else + t_host temp_view = create_mirror_view( d_view ); +#endif + + /* Remap on Host */ + Kokkos::Impl::ViewRemap< t_host , t_host >( temp_view , h_view ); + h_view = temp_view; + + /* Mark Host copy as modified */ + modified_host() = modified_host()+1; + } + } + + //@} + //! \name Methods for getting capacity, stride, or dimension(s). + //@{ + + //! The allocation size (same as Kokkos::View::capacity). + size_t capacity() const { + return d_view.capacity(); + } + + //! Get stride(s) for each dimension. + template< typename iType> + void stride(iType* stride_) const { + d_view.stride(stride_); + } + + /* \brief return size of dimension 0 */ + size_t dimension_0() const {return d_view.dimension_0();} + /* \brief return size of dimension 1 */ + size_t dimension_1() const {return d_view.dimension_1();} + /* \brief return size of dimension 2 */ + size_t dimension_2() const {return d_view.dimension_2();} + /* \brief return size of dimension 3 */ + size_t dimension_3() const {return d_view.dimension_3();} + /* \brief return size of dimension 4 */ + size_t dimension_4() const {return d_view.dimension_4();} + /* \brief return size of dimension 5 */ + size_t dimension_5() const {return d_view.dimension_5();} + /* \brief return size of dimension 6 */ + size_t dimension_6() const {return d_view.dimension_6();} + /* \brief return size of dimension 7 */ + size_t dimension_7() const {return d_view.dimension_7();} + + //@} +}; + +// +// Partial specializations of Kokkos::subview() for DualView objects. 
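+// A hedged usage sketch: the destination DualView type must be supplied
+// explicitly as the first template argument, and the returned subview shares
+// the parent's "modified" flags (the type and range below are illustrative):
+//
+//   sub_dual_view_type sv =
+//     Kokkos::subview<sub_dual_view_type> (dv, std::pair<size_t, size_t> (0, 50));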
+// + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 , class ArgType2 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5); + sub_view.modified_device = 
src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 , class ArgType6 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 , + const ArgType6 & arg6 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +template< class DstViewType , + class T , class L , class D , class M , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 > +DstViewType +subview( const DualView<T,L,D,M> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 , + const ArgType6 & arg6 , + const ArgType7 & arg7 ) +{ + DstViewType sub_view; + sub_view.d_view = subview<typename DstViewType::t_dev>(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7); + sub_view.h_view = subview<typename DstViewType::t_host>(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7); + sub_view.modified_device = src.modified_device; + sub_view.modified_host = src.modified_host; + return sub_view; +} + +// +// Partial specialization of Kokkos::deep_copy() for DualView objects. 
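+// A hedged usage sketch (both DualView objects are illustrative and are
+// assumed to have matching dimensions):
+//
+//   Kokkos::deep_copy (dst_dv, src_dv);  // copies whichever side of src_dv was
+//                                        // modified most recently and marks the
+//                                        // corresponding side of dst_dv as modified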
+// + +template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > +void +deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference + const DualView<ST,SL,SD,SM>& src) +{ + if (src.modified_device () >= src.modified_host ()) { + Kokkos::deep_copy (dst.d_view, src.d_view); + dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> (); + } else { + Kokkos::deep_copy (dst.h_view, src.h_view); + dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_device_type> (); + } +} + +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/containers/src/Kokkos_Functional.hpp b/lib/kokkos/containers/src/Kokkos_Functional.hpp new file mode 100644 index 000000000..74c3f7093 --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_Functional.hpp @@ -0,0 +1,132 @@ +#ifndef KOKKOS_FUNCTIONAL_HPP +#define KOKKOS_FUNCTIONAL_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Functional_impl.hpp> + +namespace Kokkos { + +// These should work for most types + +template <typename T> +struct pod_hash +{ + typedef T argument_type; + typedef T first_argument_type; + typedef uint32_t second_argument_type; + typedef uint32_t result_type; + + KOKKOS_FORCEINLINE_FUNCTION + uint32_t operator()(T const & t) const + { return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); } + + KOKKOS_FORCEINLINE_FUNCTION + uint32_t operator()(T const & t, uint32_t seed) const + { return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); } +}; + +template <typename T> +struct pod_equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return Impl::bitwise_equal(&a,&b); } +}; + +template <typename T> +struct pod_not_equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return !Impl::bitwise_equal(&a,&b); } +}; + +template <typename T> +struct equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a == b; } +}; + +template <typename T> +struct not_equal_to +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a != b; } +}; + + +template <typename T> +struct greater +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a > b; } +}; + + +template <typename T> +struct less +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a < b; } +}; + +template <typename T> +struct greater_equal +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a >= b; } +}; + + +template <typename T> +struct less_equal +{ + typedef T first_argument_type; + typedef T second_argument_type; + typedef bool result_type; + + KOKKOS_FORCEINLINE_FUNCTION + bool operator()(T const & a, T const & b) const + { return a <= b; } +}; + +} // namespace 
Kokkos + + +#endif //KOKKOS_FUNCTIONAL_HPP + + diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp new file mode 100644 index 000000000..ff2548379 --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -0,0 +1,227 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_STATICCRSGRAPH_HPP +#define KOKKOS_STATICCRSGRAPH_HPP + +#include <string> +#include <vector> + +#include <Kokkos_View.hpp> +#include <Kokkos_Parallel.hpp> // for parallel_reduce + +namespace Kokkos { + +/// \class StaticCrsGraph +/// \brief Compressed row storage array. +/// +/// \tparam DataType The type of stored entries. If a StaticCrsGraph is +/// used as the graph of a sparse matrix, then this is usually an +/// integer type, the type of the column indices in the sparse +/// matrix. +/// +/// \tparam Arg1Type The second template parameter, corresponding +/// either to the Device type (if there are no more template +/// parameters) or to the Layout type (if there is at least one more +/// template parameter). +/// +/// \tparam Arg2Type The third template parameter, which if provided +/// corresponds to the Device type. +/// +/// \tparam SizeType The type of row offsets. Usually the default +/// parameter suffices. 
However, setting a nondefault value is +/// necessary in some cases, for example, if you want to have a +/// sparse matrices with dimensions (and therefore column indices) +/// that fit in \c int, but want to store more than <tt>INT_MAX</tt> +/// entries in the sparse matrix. +/// +/// A row has a range of entries: +/// <ul> +/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li> +/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li> +/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li> +/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li> +/// </ul> +template< class DataType, + class Arg1Type, + class Arg2Type = void, + typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type> +class StaticCrsGraph { +private: + typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits; + +public: + typedef DataType data_type; + typedef typename traits::array_layout array_layout; + typedef typename traits::device_type device_type; + typedef SizeType size_type; + + typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type; + typedef StaticCrsGraph< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror; + //typedef StaticCrsGraph< DataType , array_layout , Kokkos::Threads , SizeType > HostMirror; + typedef View< const size_type* , array_layout, device_type > row_map_type; + typedef View< DataType* , array_layout, device_type > entries_type; + + entries_type entries; + row_map_type row_map; + + //! Construct an empty view. + StaticCrsGraph () : entries(), row_map() {} + + //! Copy constructor (shallow copy). + StaticCrsGraph (const StaticCrsGraph& rhs) : entries (rhs.entries), row_map (rhs.row_map) + {} + + template<class EntriesType, class RowMapType> + StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_) + {} + + /** \brief Assign to a view of the rhs array. + * If the old view is the last view + * then allocated memory is deallocated. + */ + StaticCrsGraph& operator= (const StaticCrsGraph& rhs) { + entries = rhs.entries; + row_map = rhs.row_map; + return *this; + } + + /** \brief Destroy this view of the array. + * If the last view then allocated memory is deallocated. 
+ */ + ~StaticCrsGraph() {} + + size_t numRows() const { + return row_map.dimension_0()>0?row_map.dimension_0()-1:0; + } + +}; + +//---------------------------------------------------------------------------- + +template< class StaticCrsGraphType , class InputSizeType > +typename StaticCrsGraphType::staticcrsgraph_type +create_staticcrsgraph( const std::string & label , + const std::vector< InputSizeType > & input ); + +template< class StaticCrsGraphType , class InputSizeType > +typename StaticCrsGraphType::staticcrsgraph_type +create_staticcrsgraph( const std::string & label , + const std::vector< std::vector< InputSizeType > > & input ); + +//---------------------------------------------------------------------------- + +template< class DataType , + class Arg1Type , + class Arg2Type , + typename SizeType > +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input ); + +template< class DataType , + class Arg1Type , + class Arg2Type , + typename SizeType > +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input ); + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_StaticCrsGraph_factory.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class GraphType > +struct StaticCrsGraphMaximumEntry { + + typedef typename GraphType::device_type device_type ; + typedef typename GraphType::data_type value_type ; + + const typename GraphType::entries_type entries ; + + StaticCrsGraphMaximumEntry( const GraphType & graph ) : entries( graph.entries ) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const unsigned i , value_type & update ) const + { if ( update < entries(i) ) update = entries(i); } + + KOKKOS_INLINE_FUNCTION + void init( value_type & update ) const + { update = 0 ; } + + KOKKOS_INLINE_FUNCTION + void join( volatile value_type & update , + volatile const value_type & input ) const + { if ( update < input ) update = input ; } +}; + +} + +template< class DataType, class Arg1Type, class Arg2Type, typename SizeType > +DataType maximum_entry( const StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > & graph ) +{ + typedef StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType> GraphType ; + typedef Impl::StaticCrsGraphMaximumEntry< GraphType > FunctorType ; + + DataType result = 0 ; + Kokkos::parallel_reduce( graph.entries.dimension_0(), + FunctorType(graph), result ); + return result ; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_CRSARRAY_HPP */ + diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp new file mode 100644 index 000000000..e98e7b57b --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -0,0 +1,862 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia 
Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_UnorderedMap.hpp +/// \brief Declaration and definition of Kokkos::UnorderedMap. +/// +/// This header file declares and defines Kokkos::UnorderedMap and its +/// related nonmember functions. + +#ifndef KOKKOS_UNORDERED_MAP_HPP +#define KOKKOS_UNORDERED_MAP_HPP + +#include <Kokkos_Macros.hpp> +#include <Kokkos_Functional.hpp> +#include <Kokkos_View.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_HostSpace.hpp> + +#include <Kokkos_Bitset.hpp> + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_UnorderedMap_impl.hpp> + + +#include <iostream> + +#include <stdint.h> +#include <stdexcept> + +#if (defined( __GNUC__ ) || defined( __GNUG__ )) && not defined( __CUDACC__ ) + +#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0) +#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0) + +#else + +#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0) +#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0) + +#endif + +namespace Kokkos { + +enum { UnorderedMapInvalidIndex = ~0u }; + +/// \brief First element of the return value of UnorderedMap::insert(). +/// +/// Inserting an element into an UnorderedMap is not guaranteed to +/// succeed. There are three possible conditions: +/// <ol> +/// <li> <tt>INSERT_FAILED</tt>: The insert failed. This usually +/// means that the UnorderedMap ran out of space. </li> +/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key +/// did <i>not</i> exist in the table before. </li> +/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key +/// <i>did</i> exist in the table before. The new value was +/// ignored and the old value was left in place. 
</li> +/// </ol> + +class UnorderedMapInsertResult +{ +private: + enum Status{ + SUCCESS = 1u << 31 + , EXISTING = 1u << 30 + , FREED_EXISTING = 1u << 29 + , LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING) + }; + +public: + /// Did the map successful insert the key/value pair + KOKKOS_FORCEINLINE_FUNCTION + bool success() const { return (m_status & SUCCESS); } + + /// Was the key already present in the map + KOKKOS_FORCEINLINE_FUNCTION + bool existing() const { return (m_status & EXISTING); } + + /// Did the map fail to insert the key due to insufficent capacity + KOKKOS_FORCEINLINE_FUNCTION + bool failed() const { return m_index == UnorderedMapInvalidIndex; } + + /// Did the map lose a race condition to insert a dupulicate key/value pair + /// where an index was claimed that needed to be released + KOKKOS_FORCEINLINE_FUNCTION + bool freed_existing() const { return (m_status & FREED_EXISTING); } + + /// How many iterations through the insert loop did it take before the + /// map returned + KOKKOS_FORCEINLINE_FUNCTION + uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); } + + /// Index where the key can be found as long as the insert did not fail + KOKKOS_FORCEINLINE_FUNCTION + uint32_t index() const { return m_index; } + + KOKKOS_FORCEINLINE_FUNCTION + UnorderedMapInsertResult() + : m_index(UnorderedMapInvalidIndex) + , m_status(0) + {} + + KOKKOS_FORCEINLINE_FUNCTION + void increment_list_position() + { + m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u; + } + + KOKKOS_FORCEINLINE_FUNCTION + void set_existing(uint32_t i, bool arg_freed_existing) + { + m_index = i; + m_status = EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position(); + } + + KOKKOS_FORCEINLINE_FUNCTION + void set_success(uint32_t i) + { + m_index = i; + m_status = SUCCESS | list_position(); + } + +private: + uint32_t m_index; + uint32_t m_status; +}; + +/// \class UnorderedMap +/// \brief Thread-safe, performance-portable lookup table. +/// +/// This class provides a lookup table. In terms of functionality, +/// this class compares to std::unordered_map (new in C++11). +/// "Unordered" means that keys are not stored in any particular +/// order, unlike (for example) std::map. "Thread-safe" means that +/// lookups, insertion, and deletion are safe to call by multiple +/// threads in parallel. "Performance-portable" means that parallel +/// performance of these operations is reasonable, on multiple +/// hardware platforms. Platforms on which performance has been +/// tested include conventional Intel x86 multicore processors, Intel +/// Xeon Phi ("MIC"), and NVIDIA GPUs. +/// +/// Parallel performance portability entails design decisions that +/// might differ from one's expectation for a sequential interface. +/// This particularly affects insertion of single elements. In an +/// interface intended for sequential use, insertion might reallocate +/// memory if the original allocation did not suffice to hold the new +/// element. In this class, insertion does <i>not</i> reallocate +/// memory. This means that it might fail. insert() returns an enum +/// which indicates whether the insert failed. There are three +/// possible conditions: +/// <ol> +/// <li> <tt>INSERT_FAILED</tt>: The insert failed. This usually +/// means that the UnorderedMap ran out of space. </li> +/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key +/// did <i>not</i> exist in the table before. 
</li> +/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key +/// <i>did</i> exist in the table before. The new value was +/// ignored and the old value was left in place. </li> +/// </ol> +/// +/// \tparam Key Type of keys of the lookup table. If \c const, users +/// are not allowed to add or remove keys, though they are allowed +/// to change values. In that case, the implementation may make +/// optimizations specific to the <tt>Device</tt>. For example, if +/// <tt>Device</tt> is \c Cuda, it may use texture fetches to access +/// keys. +/// +/// \tparam Value Type of values stored in the lookup table. You may use +/// \c void here, in which case the table will be a set of keys. If +/// \c const, users are not allowed to change entries. +/// In that case, the implementation may make +/// optimizations specific to the \c Device, such as using texture +/// fetches to access values. +/// +/// \tparam Device The Kokkos Device type. +/// +/// \tparam Hasher Definition of the hash function for instances of +/// <tt>Key</tt>. The default will calculate a bitwise hash. +/// +/// \tparam EqualTo Definition of the equality function for instances of +/// <tt>Key</tt>. The default will do a bitwise equality comparison. +/// +template < typename Key + , typename Value + , typename Device + , typename Hasher = pod_hash<typename Impl::remove_const<Key>::type> + , typename EqualTo = pod_equal_to<typename Impl::remove_const<Key>::type> + > +class UnorderedMap +{ +public: + //! \name Public types and constants + //@{ + + //key_types + typedef Key declared_key_type; + typedef typename Impl::remove_const<declared_key_type>::type key_type; + typedef typename Impl::add_const<key_type>::type const_key_type; + + //value_types + typedef Value declared_value_type; + typedef typename Impl::remove_const<declared_value_type>::type value_type; + typedef typename Impl::add_const<value_type>::type const_value_type; + + typedef Device device_type; + typedef Hasher hasher_type; + typedef EqualTo equal_to_type; + typedef uint32_t size_type; + + //map_types + typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type; + typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type> insertable_map_type; + typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type> modifiable_map_type; + typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type> const_map_type; + + static const bool is_set = Impl::is_same<void,value_type>::value; + static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value; + static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value; + + static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value); + static const bool is_modifiable_map = has_const_key && !has_const_value; + static const bool is_const_map = has_const_key && has_const_value; + + + typedef UnorderedMapInsertResult insert_result; + + typedef typename Device::host_mirror_device_type host_mirror_device_type; + + typedef UnorderedMap<Key,Value,host_mirror_device_type,Hasher,EqualTo> HostMirror; + + typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type; + + //@} + +private: + enum { invalid_index = ~static_cast<size_type>(0) }; + + typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type; + + typedef typename Impl::if_c< is_insertable_map + , View< key_type *, 
device_type> + , View< const key_type *, device_type, MemoryTraits<RandomAccess> > + >::type key_type_view; + + typedef typename Impl::if_c< is_insertable_map || is_modifiable_map + , View< impl_value_type *, device_type> + , View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> > + >::type value_type_view; + + typedef typename Impl::if_c< is_insertable_map + , View< size_type *, device_type> + , View< const size_type *, device_type, MemoryTraits<RandomAccess> > + >::type size_type_view; + + typedef typename Impl::if_c< is_insertable_map + , Bitset< device_type > + , ConstBitset< device_type> + >::type bitset_type; + + enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 }; + enum { num_scalars = 3 }; + typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view; + +public: + //! \name Public member functions + //@{ + + UnorderedMap() + : m_bounded_insert() + , m_hasher() + , m_equal_to() + , m_size() + , m_available_indexes() + , m_hash_lists() + , m_next_index() + , m_keys() + , m_values() + , m_scalars() + {} + + /// \brief Constructor + /// + /// \param capacity_hint [in] Initial guess of how many unique keys will be inserted into the map + /// \param hash [in] Hasher function for \c Key instances. The + /// default value usually suffices. + UnorderedMap( size_type capacity_hint, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type() ) + : m_bounded_insert(true) + , m_hasher(hasher) + , m_equal_to(equal_to) + , m_size() + , m_available_indexes(calculate_capacity(capacity_hint)) + , m_hash_lists(AllocateWithoutInitializing(), "UnorderedMap hash list", Impl::find_hash_size(capacity())) + , m_next_index(AllocateWithoutInitializing(), "UnorderedMap next index", capacity()+1) // +1 so that the *_at functions can always return a valid reference + , m_keys("UnorderedMap keys",capacity()+1) + , m_values("UnorderedMap values",(is_set? 1 : capacity()+1)) + , m_scalars("UnorderedMap scalars") + { + if (!is_insertable_map) { + throw std::runtime_error("Cannot construct a non-insertable (i.e. const key_type) unordered_map"); + } + + Kokkos::deep_copy(m_hash_lists, invalid_index); + Kokkos::deep_copy(m_next_index, invalid_index); + } + + void reset_failed_insert_flag() + { + reset_flag(failed_insert_idx); + } + + histogram_type get_histogram() + { + return histogram_type(*this); + } + + //! Clear all entries in the table. + void clear() + { + m_bounded_insert = true; + + if (capacity() == 0) return; + + m_available_indexes.clear(); + + Kokkos::deep_copy(m_hash_lists, invalid_index); + Kokkos::deep_copy(m_next_index, invalid_index); + { + const key_type tmp = key_type(); + Kokkos::deep_copy(m_keys,tmp); + } + if (is_set){ + const impl_value_type tmp = impl_value_type(); + Kokkos::deep_copy(m_values,tmp); + } + { + Kokkos::deep_copy(m_scalars, 0); + } + } + + /// \brief Change the capacity of the the map + /// + /// If there are no failed inserts the current size of the map will + /// be used as a lower bound for the input capacity. + /// If the map is not empty and does not have failed inserts + /// and the capacity changes then the current data is copied + /// into the resized / rehashed map. + /// + /// This is <i>not</i> a device function; it may <i>not</i> be + /// called in a parallel kernel. 
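+  ///
+  /// A hedged host-side sketch of the usual grow-and-retry pattern (the map
+  /// name and the growth factor are illustrative):
+  /// \code
+  /// if (map.failed_insert ()) {
+  ///   map.rehash (2 * map.capacity ());  // grow the table on the host ...
+  ///   // ... then re-launch the parallel insert kernel
+  /// }
+  /// \endcode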
+ bool rehash(size_type requested_capacity = 0) + { + const bool bounded_insert = (capacity() == 0) || (size() == 0u); + return rehash(requested_capacity, bounded_insert ); + } + + bool rehash(size_type requested_capacity, bool bounded_insert) + { + if(!is_insertable_map) return false; + + const size_type curr_size = size(); + requested_capacity = (requested_capacity < curr_size) ? curr_size : requested_capacity; + + insertable_map_type tmp(requested_capacity, m_hasher, m_equal_to); + + if (curr_size) { + tmp.m_bounded_insert = false; + Impl::UnorderedMapRehash<insertable_map_type> f(tmp,*this); + f.apply(); + } + tmp.m_bounded_insert = bounded_insert; + + *this = tmp; + + return true; + } + + /// \brief The number of entries in the table. + /// + /// This method has undefined behavior when erasable() is true. + /// + /// Note that this is not a device function; it cannot be called in + /// a parallel kernel. The value is not stored as a variable; it + /// must be computed. + size_type size() const + { + if( capacity() == 0u ) return 0u; + if (modified()) { + m_size = m_available_indexes.count(); + reset_flag(modified_idx); + } + return m_size; + } + + /// \brief The current number of failed insert() calls. + /// + /// This is <i>not</i> a device function; it may <i>not</i> be + /// called in a parallel kernel. The value is not stored as a + /// variable; it must be computed. + bool failed_insert() const + { + return get_flag(failed_insert_idx); + } + + bool erasable() const + { + return is_insertable_map ? get_flag(erasable_idx) : false; + } + + bool begin_erase() + { + bool result = !erasable(); + if (is_insertable_map && result) { + device_type::fence(); + set_flag(erasable_idx); + device_type::fence(); + } + return result; + } + + bool end_erase() + { + bool result = erasable(); + if (is_insertable_map && result) { + device_type::fence(); + Impl::UnorderedMapErase<declared_map_type> f(*this); + f.apply(); + device_type::fence(); + reset_flag(erasable_idx); + } + return result; + } + + /// \brief The maximum number of entries that the table can hold. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_FORCEINLINE_FUNCTION + size_type capacity() const + { return m_available_indexes.size(); } + + /// \brief The number of hash table "buckets." + /// + /// This is different than the number of entries that the table can + /// hold. Each key hashes to an index in [0, hash_capacity() - 1]. + /// That index can hold zero or more entries. This class decides + /// what hash_capacity() should be, given the user's upper bound on + /// the number of entries the table must be able to hold. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + size_type hash_capacity() const + { return m_hash_lists.size(); } + + //--------------------------------------------------------------------------- + //--------------------------------------------------------------------------- + + + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. As discussed in the class documentation, it need not + /// succeed. The return value tells you if it did. + /// + /// \param k [in] The key to attempt to insert. + /// \param v [in] The corresponding value to attempt to insert. If + /// using this class as a set (with Value = void), then you need not + /// provide this value. 
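+  ///
+  /// A hedged sketch of checking the result (the map, key, and value are
+  /// illustrative; insert() is normally called from inside a parallel kernel):
+  /// \code
+  /// UnorderedMapInsertResult r = map.insert (key, value);
+  /// if      (r.success ())  { /* a new entry was created at r.index() */ }
+  /// else if (r.existing ()) { /* the key was already present at r.index() */ }
+  /// else if (r.failed ())   { /* the map ran out of space; rehash() on the host */ }
+  /// \endcode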
+ KOKKOS_INLINE_FUNCTION + insert_result insert(key_type const& k, impl_value_type const&v = impl_value_type()) const + { + insert_result result; + + if ( !is_insertable_map || capacity() == 0u || m_scalars((int)erasable_idx) ) { + return result; + } + + if ( !m_scalars((int)modified_idx) ) { + m_scalars((int)modified_idx) = true; + } + + int volatile & failed_insert_ref = m_scalars((int)failed_insert_idx) ; + + const size_type hash_value = m_hasher(k); + const size_type hash_list = hash_value % m_hash_lists.size(); + + size_type * curr_ptr = & m_hash_lists[ hash_list ]; + size_type new_index = invalid_index ; + + // Force integer multiply to long + size_type index_hint = static_cast<size_type>( (static_cast<double>(hash_list) * capacity()) / m_hash_lists.size()); + + size_type find_attempts = 0; + + enum { bounded_find_attempts = 32u }; + const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ? + bounded_find_attempts : + m_available_indexes.max_hint(); + + bool not_done = true ; + +#if defined( __MIC__ ) + #pragma noprefetch +#endif + while ( not_done ) { + + // Continue searching the unordered list for this key, + // list will only be appended during insert phase. + // Need volatile_load as other threads may be appending. + size_type curr = volatile_load(curr_ptr); + + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); +#if defined( __MIC__ ) + #pragma noprefetch +#endif + while ( curr != invalid_index && ! m_equal_to( volatile_load(&m_keys[curr]), k) ) { + result.increment_list_position(); + index_hint = curr; + curr_ptr = &m_next_index[curr]; + curr = volatile_load(curr_ptr); + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + } + + //------------------------------------------------------------ + // If key already present then return that index. + if ( curr != invalid_index ) { + + const bool free_existing = new_index != invalid_index; + if ( free_existing ) { + // Previously claimed an unused entry that was not inserted. + // Release this unused entry immediately. + if (!m_available_indexes.reset(new_index) ) { + printf("Unable to free existing\n"); + } + + } + + result.set_existing(curr, free_existing); + not_done = false ; + } + //------------------------------------------------------------ + // Key is not currently in the map. + // If the thread has claimed an entry try to insert now. + else { + + //------------------------------------------------------------ + // If have not already claimed an unused entry then do so now. + if (new_index == invalid_index) { + + bool found = false; + // use the hash_list as the flag for the search direction + Kokkos::tie(found, index_hint) = m_available_indexes.find_any_unset_near( index_hint, hash_list ); + + // found and index and this thread set it + if ( !found && ++find_attempts >= max_attempts ) { + failed_insert_ref = true; + not_done = false ; + } + else if (m_available_indexes.set(index_hint) ) { + new_index = index_hint; + // Set key and value + KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]); + m_keys[new_index] = k ; + + if (!is_set) { + KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]); + m_values[new_index] = v ; + } + + // Do not proceed until key and value are updated in global memory + memory_fence(); + } + } + else if (failed_insert_ref) { + not_done = false; + } + + // Attempt to append claimed entry into the list. + // Another thread may also be trying to append the same list so protect with atomic. 
+ if ( new_index != invalid_index && + curr == atomic_compare_exchange(curr_ptr, static_cast<size_type>(invalid_index), new_index) ) { + // Succeeded in appending + result.set_success(new_index); + not_done = false ; + } + } + } // while ( not_done ) + + return result ; + } + + KOKKOS_INLINE_FUNCTION + bool erase(key_type const& k) const + { + bool result = false; + + if(is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) { + + if ( ! m_scalars((int)modified_idx) ) { + m_scalars((int)modified_idx) = true; + } + + size_type index = find(k); + if (valid_at(index)) { + m_available_indexes.reset(index); + result = true; + } + } + + return result; + } + + /// \brief Find the given key \c k, if it exists in the table. + /// + /// \return If the key exists in the table, the index of the + /// value corresponding to that key; otherwise, an invalid index. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + size_type find( const key_type & k) const + { + size_type curr = 0u < capacity() ? m_hash_lists( m_hasher(k) % m_hash_lists.size() ) : invalid_index ; + + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + while (curr != invalid_index && !m_equal_to( m_keys[curr], k) ) { + KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]); + curr = m_next_index[curr]; + } + + return curr; + } + + /// \brief Does the key exist in the map + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_INLINE_FUNCTION + bool exists( const key_type & k) const + { + return valid_at(find(k)); + } + + + /// \brief Get the value with \c i as its direct index. + /// + /// \param i [in] Index directly into the array of entries. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + /// + /// 'const value_type' via Cuda texture fetch must return by value. + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::if_c< (is_set || has_const_value), impl_value_type, impl_value_type &>::type + value_at(size_type i) const + { + return m_values[ is_set ? 0 : (i < capacity() ? i : capacity()) ]; + } + + /// \brief Get the key with \c i as its direct index. + /// + /// \param i [in] Index directly into the array of entries. + /// + /// This <i>is</i> a device function; it may be called in a parallel + /// kernel. + KOKKOS_FORCEINLINE_FUNCTION + key_type key_at(size_type i) const + { + return m_keys[ i < capacity() ? 
i : capacity() ]; + } + + KOKKOS_FORCEINLINE_FUNCTION + bool valid_at(size_type i) const + { + return m_available_indexes.test(i); + } + + template <typename SKey, typename SValue> + UnorderedMap( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src, + typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value,int>::type = 0 + ) + : m_bounded_insert(src.m_bounded_insert) + , m_hasher(src.m_hasher) + , m_equal_to(src.m_equal_to) + , m_size(src.m_size) + , m_available_indexes(src.m_available_indexes) + , m_hash_lists(src.m_hash_lists) + , m_next_index(src.m_next_index) + , m_keys(src.m_keys) + , m_values(src.m_values) + , m_scalars(src.m_scalars) + {} + + + template <typename SKey, typename SValue> + typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value + ,declared_map_type & >::type + operator=( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src) + { + m_bounded_insert = src.m_bounded_insert; + m_hasher = src.m_hasher; + m_equal_to = src.m_equal_to; + m_size = src.m_size; + m_available_indexes = src.m_available_indexes; + m_hash_lists = src.m_hash_lists; + m_next_index = src.m_next_index; + m_keys = src.m_keys; + m_values = src.m_values; + m_scalars = src.m_scalars; + return *this; + } + + template <typename SKey, typename SValue, typename SDevice> + typename Impl::enable_if< Impl::is_same< typename Impl::remove_const<SKey>::type, key_type>::value && + Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value + >::type + create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src) + { + if (m_hash_lists.ptr_on_device() != src.m_hash_lists.ptr_on_device()) { + + insertable_map_type tmp; + + tmp.m_bounded_insert = src.m_bounded_insert; + tmp.m_hasher = src.m_hasher; + tmp.m_equal_to = src.m_equal_to; + tmp.m_size = src.size(); + tmp.m_available_indexes = bitset_type( src.capacity() ); + tmp.m_hash_lists = size_type_view( AllocateWithoutInitializing(), "UnorderedMap hash list", src.m_hash_lists.size() ); + tmp.m_next_index = size_type_view( AllocateWithoutInitializing(), "UnorderedMap next index", src.m_next_index.size() ); + tmp.m_keys = key_type_view( AllocateWithoutInitializing(), "UnorderedMap keys", src.m_keys.size() ); + tmp.m_values = value_type_view( AllocateWithoutInitializing(), "UnorderedMap values", src.m_values.size() ); + tmp.m_scalars = scalars_view("UnorderedMap scalars"); + + Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes); + + typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy; + + raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.size()); + raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.size()); + raw_deep_copy(tmp.m_keys.ptr_on_device(), src.m_keys.ptr_on_device(), sizeof(key_type)*src.m_keys.size()); + if (!is_set) { + raw_deep_copy(tmp.m_values.ptr_on_device(), src.m_values.ptr_on_device(), sizeof(impl_value_type)*src.m_values.size()); + } + raw_deep_copy(tmp.m_scalars.ptr_on_device(), src.m_scalars.ptr_on_device(), sizeof(int)*num_scalars ); + + *this = tmp; + } + } + + //@} +private: // private member functions + + bool modified() const + { + return get_flag(modified_idx); + } + + void set_flag(int flag) const + { + typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, 
Kokkos::HostSpace > raw_deep_copy; + const int true_ = true; + raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int)); + } + + void reset_flag(int flag) const + { + typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy; + const int false_ = false; + raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int)); + } + + bool get_flag(int flag) const + { + typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy; + int result = false; + raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int)); + return result; + } + + static uint32_t calculate_capacity(uint32_t capacity_hint) + { + // increase by 16% and round to nears multiple of 128 + return capacity_hint ? ((static_cast<uint32_t>(7ull*capacity_hint/6u) + 127u)/128u)*128u : 128u; + } + +private: // private members + bool m_bounded_insert; + hasher_type m_hasher; + equal_to_type m_equal_to; + mutable size_type m_size; + bitset_type m_available_indexes; + size_type_view m_hash_lists; + size_type_view m_next_index; + key_type_view m_keys; + value_type_view m_values; + scalars_view m_scalars; + + template <typename KKey, typename VValue, typename DDevice, typename HHash, typename EEqualTo> + friend class UnorderedMap; + + template <typename UMap> + friend struct Impl::UnorderedMapErase; + + template <typename UMap> + friend struct Impl::UnorderedMapHistogram; + + template <typename UMap> + friend struct Impl::UnorderedMapPrint; +}; + +// Specialization of deep_copy for two UnorderedMap objects. +template < typename DKey, typename DT, typename DDevice + , typename SKey, typename ST, typename SDevice + , typename Hasher, typename EqualTo > +inline void deep_copy( UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> & dst + , const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> & src ) +{ + dst.create_copy_view(src); +} + + +} // namespace Kokkos + +#endif //KOKKOS_UNORDERED_MAP_HPP diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp new file mode 100644 index 000000000..d946b8a9c --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -0,0 +1,282 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VECTOR_HPP +#define KOKKOS_VECTOR_HPP + +#include <KokkosCore_config.h> +#include <Kokkos_DualView.hpp> + +/* Drop in replacement for std::vector based on Kokkos::DualView + * Most functions only work on the host (it will not compile if called from device kernel) + * + */ + namespace Kokkos { + +template <typename Scalar, class Device=Impl::DefaultDeviceType> +class vector : public DualView<Scalar*,LayoutLeft,Device> { +public: + typedef Device device_type; + typedef Scalar value_type; + typedef Scalar* pointer; + typedef const Scalar* const_pointer; + typedef Scalar* reference; + typedef const Scalar* const_reference; + typedef Scalar* iterator; + typedef const Scalar* const_iterator; + +private: + size_t _size; + typedef size_t size_type; + float _extra_storage; + typedef DualView<Scalar*,LayoutLeft,Device> DV; + + +public: + inline Scalar& operator() (int i) const {return DV::h_view(i);}; + inline Scalar& operator[] (int i) const {return DV::h_view(i);}; + + + /* Member functions which behave like std::vector functions */ + + vector():DV() { + _size = 0; + _extra_storage = 1.1; + DV::modified_host = 1; + }; + + + vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Device>("Vector",size_t(n*(1.1))) { + _size = n; + _extra_storage = 1.1; + DV::modified_host = 1; + + assign(n,val); + } + + + void resize(size_t n) { + if(n>=capacity()) + DV::resize(size_t (n*_extra_storage)); + _size = n; + } + + void resize(size_t n, const Scalar& val) { + assign(n,val); + } + + void assign (size_t n, const Scalar& val) { + + /* Resize if necessary (behavour of std:vector) */ + + if(n>capacity()) + DV::resize(size_t (n*_extra_storage)); + _size = n; + + /* Assign value either on host or on device */ + + if( DV::modified_host >= DV::modified_device ) { + set_functor_host f(DV::h_view,val); + parallel_for(n,f); + DV::t_host::device_type::fence(); + DV::modified_host++; + } else { + set_functor f(DV::d_view,val); + parallel_for(n,f); + DV::t_dev::device_type::fence(); + DV::modified_device++; + } + } + + void reserve(size_t n) { + DV::resize(size_t (n*_extra_storage)); + } + + void push_back(Scalar val) { + DV::modified_host++; + if(_size == capacity()) { + size_t new_size = _size*_extra_storage; + if(new_size == _size) new_size++; + DV::resize(new_size); + } + + DV::h_view(_size) = val; + _size++; + + }; + + void pop_back() { + _size--; + }; + + void clear() { + _size = 0; + } + + size_type size() const {return _size;}; + size_type max_size() const {return 2000000000;} + size_type capacity() const {return DV::capacity();}; + bool empty() const {return _size==0;}; + + iterator begin() const {return &DV::h_view(0);}; + + iterator end() const {return &DV::h_view(_size);}; + + + /* std::algorithms wich work originally with iterators, here they are implemented as member functions */ + + 
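+  /* Illustrative host-side usage sketch (a minimal example, assuming the
+   * Kokkos runtime has been initialized and the default device is available;
+   * names below refer only to members defined in this class):
+   *
+   *   Kokkos::vector<double> v(10, 1.0); // 10 host entries, all set to 1.0
+   *   v.push_back(2.0);                  // grows the host view if needed
+   *   v.host_to_device();                // explicit deep copy to the device view
+   *   size_t n = v.size();               // n == 11
+   *
+   * The member functions below (lower_bound, is_sorted, find) stand in for
+   * the corresponding std:: algorithms and likewise operate on the host view.
+   */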
size_t + lower_bound (const size_t& start, + const size_t& theEnd, + const Scalar& comp_val) const + { + int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion + int upper = _size > theEnd? theEnd : _size-1; // FIXME (mfh 24 Apr 2014) narrowing conversion + if (upper <= lower) { + return theEnd; + } + + Scalar lower_val = DV::h_view(lower); + Scalar upper_val = DV::h_view(upper); + size_t idx = (upper+lower)/2; + Scalar val = DV::h_view(idx); + if(val>upper_val) return upper; + if(val<lower_val) return start; + + while(upper>lower) { + if(comp_val>val) { + lower = ++idx; + } else { + upper = idx; + } + idx = (upper+lower)/2; + val = DV::h_view(idx); + } + return idx; + } + + bool is_sorted() { + for(int i=0;i<_size-1;i++) { + if(DV::h_view(i)>DV::h_view(i+1)) return false; + } + return true; + } + + iterator find(Scalar val) const { + if(_size == 0) return end(); + + int upper,lower,current; + current = _size/2; + upper = _size-1; + lower = 0; + + if((val<DV::h_view(0)) || (val>DV::h_view(_size-1)) ) return end(); + + while(upper>lower) + { + if(val>DV::h_view(current)) lower = current+1; + else upper = current; + current = (upper+lower)/2; + } + + if(val==DV::h_view(current)) return &DV::h_view(current); + else return end(); + } + + /* Additional functions for data management */ + + void device_to_host(){ + deep_copy(DV::h_view,DV::d_view); + } + void host_to_device() const { + deep_copy(DV::d_view,DV::h_view); + } + + void on_host() { + DV::modified_host = DV::modified_device + 1; + } + void on_device() { + DV::modified_device = DV::modified_host + 1; + } + + void set_overallocation(float extra) { + _extra_storage = 1.0 + extra; + } + + +public: + struct set_functor { + typedef typename DV::t_dev::device_type device_type; + typename DV::t_dev _data; + Scalar _val; + + set_functor(typename DV::t_dev data, Scalar val) : + _data(data),_val(val) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + _data(i) = _val; + } + }; + + struct set_functor_host { + typedef typename DV::t_host::device_type device_type; + typename DV::t_host _data; + Scalar _val; + + set_functor_host(typename DV::t_host data, Scalar val) : + _data(data),_val(val) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + _data(i) = _val; + } + }; + +}; + + +} +#endif diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp new file mode 100644 index 000000000..17ad3599f --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_BITSET_IMPL_HPP +#define KOKKOS_BITSET_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> + +#include <cstdio> +#include <climits> +#include <iostream> +#include <iomanip> + +namespace Kokkos { namespace Impl { + +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_right(unsigned i, int r) +{ + enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) }; + return r ? ((i >> r) | (i << (size-r))) : i ; +} + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_forward(unsigned i) +{ +#if defined( __CUDA_ARCH__ ) + return __ffs(i) - 1; +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return __builtin_ffs(i) - 1; +#elif defined( __INTEL_COMPILER ) + return _bit_scan_forward(i); +#else + + unsigned t = 1u; + int r = 0; + while (i && (i & t == 0)) + { + t = t << 1; + ++r; + } + return r; +#endif +} + + +KOKKOS_FORCEINLINE_FUNCTION +int bit_scan_reverse(unsigned i) +{ + enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) }; +#if defined( __CUDA_ARCH__ ) + return shift - __clz(i); +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return shift - __builtin_clz(i); +#elif defined( __INTEL_COMPILER ) + return _bit_scan_reverse(i); +#else + unsigned t = 1u << shift; + int r = 0; + while (i && (i & t == 0)) + { + t = t >> 1; + ++r; + } + return r; +#endif +} + + +// count the bits set +KOKKOS_FORCEINLINE_FUNCTION +int popcount(unsigned i) +{ +#if defined( __CUDA_ARCH__ ) + return __popc(i); +#elif defined( __GNUC__ ) || defined( __GNUG__ ) + return __builtin_popcount(i); +#elif defined ( __INTEL_COMPILER ) + return _popcnt32(i); +#else + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive + i = i - ((i >> 1) & ~0u/3u); // temp + i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp + i = (i + (i >> 4)) & ~0u/255u*15u; // temp + return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count +#endif +} + + +template <typename Bitset> +struct BitsetCount +{ + typedef Bitset bitset_type; + typedef typename bitset_type::device_type device_type; + typedef typename bitset_type::size_type size_type; + typedef size_type value_type; + + bitset_type m_bitset; + + BitsetCount( bitset_type const& bitset) + : m_bitset(bitset) + {} + + size_type apply() const + { + size_type count = 0u; + parallel_reduce(m_bitset.m_blocks.size(), *this, count); + return count; + } + + KOKKOS_INLINE_FUNCTION + static void init( value_type & count) + { + 
count = 0u; + } + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & count, const volatile size_type & incr ) + { + count += incr; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i, value_type & count) const + { + count += popcount(m_bitset.m_blocks[i]); + } +}; + +}} //Kokkos::Impl + +#endif // KOKKOS_BITSET_IMPL_HPP + diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp new file mode 100644 index 000000000..647024f48 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp @@ -0,0 +1,154 @@ +#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP +#define KOKKOS_FUNCTIONAL_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> + +namespace Kokkos { namespace Impl { + +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. +KOKKOS_FORCEINLINE_FUNCTION +uint32_t getblock32 ( const uint8_t * p, int i ) +{ +// used to avoid aliasing error which could cause errors with +// forced inlining + return ((uint32_t)p[i*4+0]) + | ((uint32_t)p[i*4+1] << 8) + | ((uint32_t)p[i*4+2] << 16) + | ((uint32_t)p[i*4+3] << 24); +} + +KOKKOS_FORCEINLINE_FUNCTION +uint32_t rotl32 ( uint32_t x, int8_t r ) +{ return (x << r) | (x >> (32 - r)); } + +KOKKOS_FORCEINLINE_FUNCTION +uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +KOKKOS_INLINE_FUNCTION +uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + //---------- + // body + + for(int i=0; i<nblocks; ++i) + { + uint32_t k1 = getblock32(data,i); + + k1 *= c1; + k1 = rotl32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + return h1; +} + + +#if defined( __GNUC__ ) /* GNU C */ || \ + defined( __GNUG__ ) /* GNU C++ */ || \ + defined( __clang__ ) + +#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define KOKKOS_MAY_ALIAS + +#endif + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +bool bitwise_equal(T const * const a_ptr, T const * const b_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const * const ptr; + T64 const * const ptr64; + T32 const * const ptr32; + T16 const * const ptr16; + T8 const * const ptr8; + } a = {a_ptr}, b = {b_ptr}; + + bool result = true; + + for (int i=0; i < NUM_64; ++i) { + result = result && a.ptr64[i] == b.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + result = result && a.ptr32[NUM_64*2] == b.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + result = result && a.ptr16[NUM_32*2] == b.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + result = result && a.ptr8[NUM_16*2] == b.ptr8[NUM_16*2]; + } + + return result; 
+} + + + +#undef KOKKOS_MAY_ALIAS + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_FUNCTIONAL_IMPL_HPP diff --git a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp new file mode 100644 index 000000000..ddd091a45 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp @@ -0,0 +1,223 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP +#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view , + typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) +{ + return view ; +} + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ) +{ + // Force copy: + //typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused + typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type ; + + typename staticcrsgraph_type::HostMirror tmp ; + typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map); + + // Allocation to match: + tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const' + tmp.entries = create_mirror( view.entries ); + + + // Deep copy: + deep_copy( tmp_row_map , view.row_map ); + deep_copy( tmp.entries , view.entries ); + + return tmp ; +} + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view , + typename Impl::enable_if< ! 
ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) +{ + return create_mirror( view ); +} +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class StaticCrsGraphType , class InputSizeType > +inline +typename StaticCrsGraphType::staticcrsgraph_type +create_staticcrsgraph( const std::string & label , + const std::vector< InputSizeType > & input ) +{ + typedef StaticCrsGraphType output_type ; + //typedef std::vector< InputSizeType > input_type ; // unused + + typedef typename output_type::entries_type entries_type ; + + typedef View< typename output_type::size_type [] , + typename output_type::array_layout , + typename output_type::device_type > work_type ; + + output_type output ; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work( "tmp" , length + 1 ); + + typename work_type::HostMirror row_work_host = + create_mirror_view( row_work ); + + size_t sum = 0 ; + row_work_host[0] = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + row_work_host[i+1] = sum += input[i]; + } + + deep_copy( row_work , row_work_host ); + + output.entries = entries_type( label , sum ); + output.row_map = row_work ; + } + + return output ; +} + +//---------------------------------------------------------------------------- + +template< class StaticCrsGraphType , class InputSizeType > +inline +typename StaticCrsGraphType::staticcrsgraph_type +create_staticcrsgraph( const std::string & label , + const std::vector< std::vector< InputSizeType > > & input ) +{ + typedef StaticCrsGraphType output_type ; + //typedef std::vector< std::vector< InputSizeType > > input_type ; // unused + typedef typename output_type::entries_type entries_type ; + //typedef typename output_type::size_type size_type ; // unused + + // mfh 14 Feb 2014: This function doesn't actually create instances + // of ok_rank, but it needs to declare the typedef in order to do + // the static "assert" (a compile-time check that the given shape + // has rank 1). In order to avoid a "declared but unused typedef" + // warning, we declare an empty instance of this type, with the + // usual "(void)" marker to avoid a compiler warning for the unused + // variable. 
+ + typedef typename + Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type + ok_rank ; + { + ok_rank thing; + (void) thing; + } + + typedef View< typename output_type::size_type [] , + typename output_type::array_layout , + typename output_type::device_type > work_type ; + + output_type output ; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work( "tmp" , length + 1 ); + + typename work_type::HostMirror row_work_host = + create_mirror_view( row_work ); + + size_t sum = 0 ; + row_work_host[0] = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + row_work_host[i+1] = sum += input[i].size(); + } + + deep_copy( row_work , row_work_host ); + + output.entries = entries_type( label , sum ); + output.row_map = row_work ; + } + + // Fill in the entries: + { + typename entries_type::HostMirror host_entries = + create_mirror_view( output.entries ); + + size_t sum = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) { + host_entries( sum ) = input[i][j] ; + } + } + + deep_copy( output.entries , host_entries ); + } + + return output ; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ + diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp new file mode 100644 index 000000000..150d3d893 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp @@ -0,0 +1,101 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_UnorderedMap.hpp> + +namespace Kokkos { namespace Impl { + +uint32_t find_hash_size(uint32_t size) +{ + if (size == 0u) return 0u; + + // these primes try to preserve randomness of hash + static const uint32_t primes [] = { + 3, 7, 13, 23, 53, 97, 193, 389, 769, 1543 + , 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539 + , 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827 + , 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289 + , 12967 , 13649 , 14341 , 15013 , 15727 + , 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329 + , 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439 + , 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619 + , 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963 + , 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579 + , 201653 , 211741 , 221813 , 231893 , 241979 , 252079 + , 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457 + , 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609 + , 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239 + , 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869 + , 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253 + , 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739 + , 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503 + , 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469 + , 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033 + , 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729 + , 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861 + , 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661 + , 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529 + , 30198989 , 31317469 , 32435981 , 35791397 , 38028379 , 40265327 + , 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099 + , 55924061 , 58161041 , 60397993 , 62634959 , 64871921 + , 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427 + , 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971 + , 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141 + , 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237 + , 232644089 , 241591943 , 250539763 , 259487603 , 268435399 + }; + + const uint32_t num_primes = sizeof(primes)/sizeof(uint32_t); + + uint32_t hsize = primes[num_primes-1] ; + for (uint32_t i = 0; i < num_primes; ++i) { + if (size <= primes[i]) { + hsize = primes[i]; + break; + } + } + return hsize; +} + +}} // namespace Kokkos::Impl + diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp new file mode 100644 index 000000000..c71b364a0 --- /dev/null +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -0,0 +1,297 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP +#define KOKKOS_UNORDERED_MAP_IMPL_HPP + +#include <Kokkos_Macros.hpp> +#include <stdint.h> + +#include <cstdio> +#include <climits> +#include <iostream> +#include <iomanip> + +namespace Kokkos { namespace Impl { + +uint32_t find_hash_size( uint32_t size ); + +template <typename Map> +struct UnorderedMapRehash +{ + typedef Map map_type; + typedef typename map_type::const_map_type const_map_type; + typedef typename map_type::device_type device_type; + typedef typename map_type::size_type size_type; + + map_type m_dst; + const_map_type m_src; + + UnorderedMapRehash( map_type const& dst, const_map_type const& src) + : m_dst(dst), m_src(src) + {} + + void apply() const + { + parallel_for(m_src.capacity(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()(size_type i) const + { + if ( m_src.valid_at(i) ) + m_dst.insert(m_src.key_at(i), m_src.value_at(i)); + } + +}; + +template <typename UMap> +struct UnorderedMapErase +{ + typedef UMap map_type; + typedef typename map_type::device_type device_type; + typedef typename map_type::size_type size_type; + typedef typename map_type::key_type key_type; + typedef typename map_type::impl_value_type value_type; + + map_type m_map; + + UnorderedMapErase( map_type const& map) + : m_map(map) + {} + + void apply() const + { + parallel_for(m_map.m_hash_lists.size(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + const size_type invalid_index = map_type::invalid_index; + + size_type curr = m_map.m_hash_lists(i); + size_type next = invalid_index; + + // remove erased head of the linked-list + while (curr != invalid_index && !m_map.valid_at(curr)) { + next = m_map.m_next_index[curr]; + m_map.m_next_index[curr] = invalid_index; + m_map.m_keys[curr] = key_type(); + if (m_map.is_set) m_map.m_values[curr] = value_type(); + curr = next; + 
m_map.m_hash_lists(i) = next; + } + + // if the list is non-empty and the head is valid + if (curr != invalid_index && m_map.valid_at(curr) ) { + size_type prev = curr; + curr = m_map.m_next_index[prev]; + + while (curr != invalid_index) { + next = m_map.m_next_index[curr]; + if (m_map.valid_at(curr)) { + prev = curr; + } + else { + // remove curr from list + m_map.m_next_index[prev] = next; + m_map.m_next_index[curr] = invalid_index; + m_map.m_keys[curr] = key_type(); + if (map_type::is_set) m_map.m_values[curr] = value_type(); + } + curr = next; + } + } + } +}; + +template <typename UMap> +struct UnorderedMapHistogram +{ + typedef UMap map_type; + typedef typename map_type::device_type device_type; + typedef typename map_type::size_type size_type; + + typedef View<int[100], device_type> histogram_view; + typedef typename histogram_view::HostMirror host_histogram_view; + + map_type m_map; + histogram_view m_length; + histogram_view m_distance; + histogram_view m_block_distance; + + UnorderedMapHistogram( map_type const& map) + : m_map(map) + , m_length("UnorderedMap Histogram") + , m_distance("UnorderedMap Histogram") + , m_block_distance("UnorderedMap Histogram") + {} + + void calculate() + { + parallel_for(m_map.m_hash_lists.size(), *this); + } + + void clear() + { + Kokkos::deep_copy(m_length, 0); + Kokkos::deep_copy(m_distance, 0); + Kokkos::deep_copy(m_block_distance, 0); + } + + void print_length(std::ostream &out) + { + host_histogram_view host_copy = create_mirror_view(m_length); + Kokkos::deep_copy(host_copy, m_length); + + for (int i=0, size = host_copy.size(); i<size; ++i) + { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + void print_distance(std::ostream &out) + { + host_histogram_view host_copy = create_mirror_view(m_distance); + Kokkos::deep_copy(host_copy, m_distance); + + for (int i=0, size = host_copy.size(); i<size; ++i) + { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + void print_block_distance(std::ostream &out) + { + host_histogram_view host_copy = create_mirror_view(m_block_distance); + Kokkos::deep_copy(host_copy, m_block_distance); + + for (int i=0, size = host_copy.size(); i<size; ++i) + { + out << host_copy[i] << " , "; + } + out << "\b\b\b " << std::endl; + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + const size_type invalid_index = map_type::invalid_index; + + uint32_t length = 0; + size_type min_index = ~0u, max_index = 0; + for (size_type curr = m_map.m_hash_lists(i); curr != invalid_index; curr = m_map.m_next_index[curr]) { + ++length; + min_index = (curr < min_index) ? curr : min_index; + max_index = (max_index < curr) ? curr : max_index; + } + + size_type distance = (0u < length) ? max_index - min_index : 0u; + size_type blocks = (0u < length) ? max_index/32u - min_index/32u : 0u; + + // normalize data + length = length < 100u ? length : 99u; + distance = distance < 100u ? distance : 99u; + blocks = blocks < 100u ? 
blocks : 99u; + + if (0u < length) + { + atomic_fetch_add( &m_length(length), 1); + atomic_fetch_add( &m_distance(distance), 1); + atomic_fetch_add( &m_block_distance(blocks), 1); + } + } +}; + +template <typename UMap> +struct UnorderedMapPrint +{ + typedef UMap map_type; + typedef typename map_type::device_type device_type; + typedef typename map_type::size_type size_type; + + map_type m_map; + + UnorderedMapPrint( map_type const& map) + : m_map(map) + {} + + void apply() + { + parallel_for(m_map.m_hash_lists.size(), *this); + } + + KOKKOS_INLINE_FUNCTION + void operator()( size_type i ) const + { + const size_type invalid_index = map_type::invalid_index; + + uint32_t list = m_map.m_hash_lists(i); + for (size_type curr = list, ii=0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) { + printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), m_map.value_at(curr)); + } + } +}; + +template <typename DKey, typename DValue, typename SKey, typename SValue> +struct UnorderedMapCanAssign : public false_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<Key,Value,Key,Value> : public true_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key,Value,Key,Value> : public true_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key,const Value,Key,Value> : public true_ {}; + +template <typename Key, typename Value> +struct UnorderedMapCanAssign<const Key,const Value,const Key,Value> : public true_ {}; + + +}} //Kokkos::Impl + +#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp new file mode 100644 index 000000000..7112cc068 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -0,0 +1,283 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDAEXEC_HPP +#define KOKKOS_CUDAEXEC_HPP + +#include <string> +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_Error.hpp> +#include <Cuda/Kokkos_Cuda_abort.hpp> + +/*--------------------------------------------------------------------------*/ + +#if defined( __CUDACC__ ) + +namespace Kokkos { +namespace Impl { + +class CudaExec { +public: + + __device__ inline + CudaExec( const int shmem_begin , const int shmem_end ) + : m_shmem_end( shmem_end ) + , m_shmem_iter( shmem_begin ) + {} + + __device__ inline + void * get_shmem( const int size ) + { + extern __shared__ int sh[]; + + // m_shmem_iter is in bytes, convert to integer offsets + const int offset = m_shmem_iter >> power_of_two<sizeof(int)>::value ; + + m_shmem_iter += size ; + + if ( m_shmem_end < m_shmem_iter ) { + cuda_abort("Cuda::get_shmem out of memory"); + } + + return sh + offset ; + } + +private: + + const int m_shmem_end ; + int m_shmem_iter ; +}; + +} // namespace Impl +} // namespace Kokkos + +#if defined( __CUDA_ARCH__ ) + +namespace Kokkos { + +inline __device__ +void * Cuda::get_shmem( const int size ) { return m_exec.get_shmem( size ); } + +} // namespace Kokkos + +#endif /* defined( __CUDA_ARCH__ ) */ +#endif /* defined( __CUDACC__ ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct CudaTraits { + enum { WarpSize = 32 /* 0x0020 */ }; + enum { WarpIndexMask = 0x001f /* Mask for warpindex */ }; + enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ }; + + enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ }; + enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ }; + enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ }; + + enum { UpperBoundGridCount = 65535 /* Hard upper bound */ }; + enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ }; + enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ }; + enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ }; + + typedef unsigned long + ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ]; + + enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ }; + + KOKKOS_INLINE_FUNCTION static + CudaSpace::size_type warp_count( CudaSpace::size_type i ) + { return ( i + WarpIndexMask ) >> WarpIndexShift ; } + + KOKKOS_INLINE_FUNCTION static + CudaSpace::size_type warp_align( CudaSpace::size_type i ) + { + enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) }; + return ( i + WarpIndexMask ) & Mask ; + } +}; + +//---------------------------------------------------------------------------- + +CudaSpace::size_type cuda_internal_maximum_warp_count(); +CudaSpace::size_type cuda_internal_maximum_grid_count(); +CudaSpace::size_type 
cuda_internal_maximum_shared_words(); + +CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size ); +CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size ); +CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size ); + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) + +/** \brief Access to constant memory on the device */ +__device__ __constant__ +Kokkos::Impl::CudaTraits::ConstantGlobalBufferType +kokkos_impl_cuda_constant_memory_buffer ; + +template< typename T > +inline +__device__ +T * kokkos_impl_cuda_shared_memory() +{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; } + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// See section B.17 of Cuda C Programming Guide Version 3.2 +// for discussion of +// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) +// function qualifier which could be used to improve performance. +//---------------------------------------------------------------------------- +// Maximize L1 cache and minimize shared memory: +// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); +// For 2.0 capability: 48 KB L1 and 16 KB shared +//---------------------------------------------------------------------------- + +template< class DriverType > +__global__ +static void cuda_parallel_launch_constant_memory() +{ + const DriverType & driver = + *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer ); + + driver(); +} + +template< class DriverType > +__global__ +static void cuda_parallel_launch_local_memory( const DriverType driver ) +{ + driver(); +} + +template < class DriverType , + bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) > +struct CudaParallelLaunch ; + +template < class DriverType > +struct CudaParallelLaunch< DriverType , true > { + + inline + CudaParallelLaunch( const DriverType & driver , + const dim3 & grid , + const dim3 & block , + const int shmem ) + { + if ( grid.x && block.x ) { + + if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) < + sizeof( DriverType ) ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); + } + + if ( CudaTraits::SharedMemoryCapacity < shmem ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); + } + else if ( shmem ) { + cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ); + } else { + cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ); + } + + // Copy functor to constant memory on the device + cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); + + // Invoke the driver function on the device + cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>(); + +#if defined( KOKKOS_EXPRESSION_CHECK ) + Kokkos::Cuda::fence(); +#endif + } + } +}; + +template < class DriverType > +struct CudaParallelLaunch< DriverType , false > { + + inline + CudaParallelLaunch( const DriverType & driver , + const dim3 & grid , + const dim3 & block , + const int shmem ) + { + if ( grid.x && block.x ) { + + if ( 
CudaTraits::SharedMemoryCapacity < shmem ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); + } + else if ( shmem ) { + cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ); + } else { + cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ); + } + + cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem >>>( driver ); + +#if defined( KOKKOS_EXPRESSION_CHECK ) + Kokkos::Cuda::fence(); +#endif + } + } +}; + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +#endif /* defined( __CUDACC__ ) */ + +#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu new file mode 100644 index 000000000..129395465 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu @@ -0,0 +1,329 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdlib.h> +#include <iostream> +#include <sstream> +#include <stdexcept> + +#include <Kokkos_Cuda.hpp> +#include <Kokkos_CudaSpace.hpp> + +#include <Cuda/Kokkos_Cuda_Internal.hpp> +#include <impl/Kokkos_MemoryTracking.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +DeepCopy<HostSpace,CudaSpace> + ::DeepCopy( void * dst , const void * src , size_t n ) +{ + CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); +} + +DeepCopy<CudaSpace,HostSpace> + ::DeepCopy( void * dst , const void * src , size_t n ) +{ + CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); +} + +DeepCopy<CudaSpace,CudaSpace> + ::DeepCopy( void * dst , const void * src , size_t n ) +{ + CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); +} + +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace { + +class CudaMemoryTrackingEntry : public Impl::MemoryTrackingEntry +{ +public: + + void * const ptr_alloc ; + const unsigned size ; + const unsigned count ; + Impl::cuda_texture_object_type tex_obj ; + + CudaMemoryTrackingEntry( const std::string & arg_label , + const std::type_info & arg_info , + void * const arg_ptr , + const unsigned arg_size , + const unsigned arg_count ) + : Impl::MemoryTrackingEntry( arg_label , arg_info , arg_ptr , arg_size * arg_count ) + , ptr_alloc( arg_ptr ) + , size( arg_size ) + , count( arg_count ) + , tex_obj( 0 ) + {} + + ~CudaMemoryTrackingEntry(); +}; + +CudaMemoryTrackingEntry::~CudaMemoryTrackingEntry() +{ + std::ostringstream oss; + bool error = false; + try { + Kokkos::Impl::cuda_device_synchronize(); + } + catch(std::runtime_error & err) { + error = true; + oss << err.what() << std::endl; + } + + if ( tex_obj ) { + + } + + try { + CUDA_SAFE_CALL( cudaFree( ptr_alloc ) ); + } + catch(std::runtime_error & err) { + error = true; + oss << err.what() << std::endl; + } + + if ( error ) { + std::cerr << "cudaFree( " << ptr_alloc << " ) FAILED for " ; + Impl::MemoryTrackingEntry::print( std::cerr ); + std::cerr << oss.str() << std::endl; + } +} + +Impl::MemoryTracking & cuda_space_singleton() +{ + static Impl::MemoryTracking self("Kokkos::CudaSpace"); + return self ; +} + +bool cuda_space_verify_modifiable( const char * const label ) +{ + static const char error_in_parallel[] = "Called with HostSpace::in_parallel()" ; + static const char error_not_exists[] = "Called after return from main()" ; + + const char * const error_msg = + HostSpace::in_parallel() ? error_in_parallel : ( + ! cuda_space_singleton().exists() ? 
error_not_exists : (const char *) 0 ); + + if ( error_msg ) { + std::cerr << "Kokkos::CudaSpace::" << label << " ERROR : " << error_msg << std::endl ; + } + + return error_msg == 0 ; +} + +} + +/*--------------------------------------------------------------------------*/ + +/*--------------------------------------------------------------------------*/ + +void * CudaSpace::allocate( + const std::string & label , + const std::type_info & scalar_type , + const size_t scalar_size , + const size_t scalar_count ) +{ + void * ptr = 0 ; + + const size_t size = scalar_size * scalar_count ; + + if ( cuda_space_verify_modifiable("allocate") && size ) { + + try { + Kokkos::Impl::cuda_device_synchronize(); + +#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && defined(KOKKOS_USE_UVM) + CUDA_SAFE_CALL( cudaMallocManaged( (void**) &ptr, size, cudaMemAttachGlobal) ); +#else + CUDA_SAFE_CALL( cudaMalloc( (void**) &ptr, size) ); +#endif + + Kokkos::Impl::cuda_device_synchronize(); + } + catch( std::runtime_error & err ) { + std::ostringstream msg ; + msg << "Kokkos::Impl::CudaSpace::allocate( " + << label + << " , " << scalar_type.name() + << " , " << scalar_size + << " , " << scalar_count + << " ) FAILED memory allocation\n" + << err.what(); + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + cuda_space_singleton().insert( + new CudaMemoryTrackingEntry( label , scalar_type , ptr , scalar_size , scalar_count ) ); + } + + return ptr ; +} + +void CudaSpace::increment( const void * ptr ) +{ + if ( cuda_space_verify_modifiable("increment") ) { + cuda_space_singleton().increment( ptr ); + } +} + +void CudaSpace::decrement( const void * ptr ) +{ + if ( cuda_space_verify_modifiable("decrement") ) { + cuda_space_singleton().decrement( ptr ); + } +} + +void CudaSpace::print_memory_view( std::ostream & o ) +{ + cuda_space_singleton().print( o , std::string(" ") ); +} + +//---------------------------------------------------------------------------- + +std::string CudaSpace::query_label( const void * p ) +{ + const Impl::MemoryTrackingEntry * entry = + cuda_space_singleton().query( p ); + + return entry ? entry->label : std::string("ERROR NOT FOUND"); +} + +void CudaSpace::access_error() +{ + const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ); + + Kokkos::Impl::throw_runtime_exception( msg ); +} + +void CudaSpace::access_error( const void * const ptr ) +{ + std::ostringstream msg ; + msg << "Kokkos::CudaSpace::access_error:" ; + msg << " attempt to access Cuda-data labeled(" ; + msg << query_label( ptr ) ; + msg << ") from non-Cuda execution" ; + Kokkos::Impl::throw_runtime_exception( msg.str() ); +} + +/*--------------------------------------------------------------------------*/ + +} // namespace Kokkos + +#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION ) + +namespace Kokkos { +namespace Impl { + +::cudaTextureObject_t +cuda_texture_object_attach( + const cudaChannelFormatDesc & desc , + const void * const ptr ) +{ + if ( 0 == ptr || ! 
cuda_space_verify_modifiable("texture_object_attach") ) return 0 ; + + const unsigned max_count = 1 << 28 ; + + CudaMemoryTrackingEntry * entry = + dynamic_cast<CudaMemoryTrackingEntry *>( cuda_space_singleton().query( ptr ) ); + + const bool ok_found = 0 != entry ; + const bool ok_ptr = ok_found && ptr == entry->ptr_alloc ; + const bool ok_count = ok_found && entry->count < max_count ; + + if ( ok_found && ok_ptr && ok_count ) { + + // Can only create texture object on device architure 3.0 or better + + if ( 0 == entry->tex_obj && 300 <= Cuda::device_arch() ) { + + struct cudaResourceDesc resDesc ; + struct cudaTextureDesc texDesc ; + + memset( & resDesc , 0 , sizeof(resDesc) ); + memset( & texDesc , 0 , sizeof(texDesc) ); + + resDesc.resType = cudaResourceTypeLinear ; + resDesc.res.linear.desc = desc ; + resDesc.res.linear.sizeInBytes = entry->size * entry->count ; + resDesc.res.linear.devPtr = entry->ptr_alloc ; + + cudaCreateTextureObject( & entry->tex_obj, & resDesc, & texDesc, NULL); + } + } + else { + std::ostringstream msg ; + msg << "CudaSpace::texture_object_attach( " << ptr << " ) FAILED: " ; + + if ( ! ok_found ) { + msg << "Not View allocated" ; + } + else if ( ! ok_ptr ) { + msg << "Not the originally allocated View \"" << entry->label << "\"" ; + } + else if ( ! ok_count ) { + msg << "Cuda texture object limit exceeded " + << max_count << " <= " << entry->count ; + } + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + return entry->tex_obj ; +} + +} // namespace Impl +} // namespace Kokkos + +#endif + + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu new file mode 100644 index 000000000..49ad14091 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu @@ -0,0 +1,609 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/*--------------------------------------------------------------------------*/ +/* Kokkos interfaces */ + +#include <Kokkos_Cuda.hpp> +#include <Cuda/Kokkos_Cuda_Internal.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/* Standard 'C' libraries */ +#include <stdlib.h> + +/* Standard 'C++' libraries */ +#include <vector> +#include <iostream> +#include <sstream> +#include <string> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +namespace { + +bool cuda_launch_blocking() +{ + const char * env = getenv("CUDA_LAUNCH_BLOCKING"); + if (env == 0) return false; + + return atoi(env); +} + + +} + +void cuda_device_synchronize() +{ + static const bool launch_blocking = cuda_launch_blocking(); + + if (!launch_blocking) { + CUDA_SAFE_CALL( cudaDeviceSynchronize() ); + } +} + +void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line ) +{ + std::ostringstream out ; + out << name << " error: " << cudaGetErrorString(e); + if (file) { + out << " " << file << ":" << line; + } + throw_runtime_exception( out.str() ); +} + +//---------------------------------------------------------------------------- +// Some significant cuda device properties: +// +// cudaDeviceProp::name : Text label for device +// cudaDeviceProp::major : Device major number +// cudaDeviceProp::minor : Device minor number +// cudaDeviceProp::warpSize : number of threads per warp +// cudaDeviceProp::multiProcessorCount : number of multiprocessors +// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block +// cudaDeviceProp::totalConstMem : capacity of constant memory +// cudaDeviceProp::totalGlobalMem : capacity of global memory +// cudaDeviceProp::maxGridSize[3] : maximum grid size + +// +// Section 4.4.2.4 of the CUDA Toolkit Reference Manual +// +// struct cudaDeviceProp { +// char name[256]; +// size_t totalGlobalMem; +// size_t sharedMemPerBlock; +// int regsPerBlock; +// int warpSize; +// size_t memPitch; +// int maxThreadsPerBlock; +// int maxThreadsDim[3]; +// int maxGridSize[3]; +// size_t totalConstMem; +// int major; +// int minor; +// int clockRate; +// size_t textureAlignment; +// int deviceOverlap; +// int multiProcessorCount; +// int kernelExecTimeoutEnabled; +// int integrated; +// int canMapHostMemory; +// int computeMode; +// int concurrentKernels; +// int ECCEnabled; +// int pciBusID; +// int pciDeviceID; +// int tccDriver; +// int asyncEngineCount; +// int unifiedAddressing; +// int memoryClockRate; +// int memoryBusWidth; +// int l2CacheSize; +// int maxThreadsPerMultiProcessor; +// }; + + +namespace { + + + +class CudaInternalDevices { +public: + enum { MAXIMUM_DEVICE_COUNT = 8 }; + struct cudaDeviceProp m_cudaProp[ 
MAXIMUM_DEVICE_COUNT ] ; + int m_cudaDevCount ; + + CudaInternalDevices(); + + static const CudaInternalDevices & singleton(); +}; + +CudaInternalDevices::CudaInternalDevices() +{ + // See 'cudaSetDeviceFlags' for host-device thread interaction + // Section 4.4.2.6 of the CUDA Toolkit Reference Manual + + CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) ); + + for ( int i = 0 ; i < m_cudaDevCount ; ++i ) { + CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) ); + } +} + +const CudaInternalDevices & CudaInternalDevices::singleton() +{ + static CudaInternalDevices self ; return self ; +} + +} + +//---------------------------------------------------------------------------- + +class CudaInternal { +private: + + CudaInternal( const CudaInternal & ); + CudaInternal & operator = ( const CudaInternal & ); + +public: + + typedef Cuda::size_type size_type ; + + int m_cudaDev ; + unsigned m_maxWarpCount ; + unsigned m_maxBlock ; + unsigned m_maxSharedWords ; + size_type m_scratchSpaceCount ; + size_type m_scratchFlagsCount ; + size_type m_scratchUnifiedCount ; + size_type m_scratchUnifiedSupported ; + size_type * m_scratchSpace ; + size_type * m_scratchFlags ; + size_type * m_scratchUnified ; + + static CudaInternal & singleton(); + + int verify_is_initialized( const char * const label ) const ; + + int is_initialized() const + { return 0 != m_scratchSpace && 0 != m_scratchFlags ; } + + void initialize( int cuda_device_id ); + void finalize(); + + void print_configuration( std::ostream & ) const ; + + ~CudaInternal(); + + CudaInternal() + : m_cudaDev( -1 ) + , m_maxWarpCount( 0 ) + , m_maxBlock( 0 ) + , m_maxSharedWords( 0 ) + , m_scratchSpaceCount( 0 ) + , m_scratchFlagsCount( 0 ) + , m_scratchUnifiedCount( 0 ) + , m_scratchUnifiedSupported( 0 ) + , m_scratchSpace( 0 ) + , m_scratchFlags( 0 ) + , m_scratchUnified( 0 ) + {} + + size_type * scratch_space( const size_type size ); + size_type * scratch_flags( const size_type size ); + size_type * scratch_unified( const size_type size ); +}; + +//---------------------------------------------------------------------------- + + +void CudaInternal::print_configuration( std::ostream & s ) const +{ + const CudaInternalDevices & dev_info = CudaInternalDevices::singleton(); + +#if defined( KOKKOS_HAVE_CUDA ) + s << "macro KOKKOS_HAVE_CUDA : defined" << std::endl ; +#endif +#if defined( CUDA_VERSION ) + s << "macro CUDA_VERSION = " << CUDA_VERSION + << " = version " << CUDA_VERSION / 1000 + << "." << ( CUDA_VERSION % 1000 ) / 10 + << std::endl ; +#endif + + for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) { + s << "Kokkos::Cuda[ " << i << " ] " + << dev_info.m_cudaProp[i].name + << " capability " << dev_info.m_cudaProp[i].major << "." 
<< dev_info.m_cudaProp[i].minor + << ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + << ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + if ( m_cudaDev == i ) s << " : Selected" ; + s << std::endl ; + } +} + +//---------------------------------------------------------------------------- + +CudaInternal::~CudaInternal() +{ + if ( m_scratchSpace || + m_scratchFlags || + m_scratchUnified ) { + std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" + << std::endl ; + std::cerr.flush(); + } + + m_cudaDev = -1 ; + m_maxWarpCount = 0 ; + m_maxBlock = 0 ; + m_maxSharedWords = 0 ; + m_scratchSpaceCount = 0 ; + m_scratchFlagsCount = 0 ; + m_scratchUnifiedCount = 0 ; + m_scratchUnifiedSupported = 0 ; + m_scratchSpace = 0 ; + m_scratchFlags = 0 ; + m_scratchUnified = 0 ; +} + +int CudaInternal::verify_is_initialized( const char * const label ) const +{ + if ( m_cudaDev < 0 ) { + std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ; + } + return 0 <= m_cudaDev ; +} + +CudaInternal & CudaInternal::singleton() +{ + static CudaInternal self ; + return self ; +} + +void CudaInternal::initialize( int cuda_device_id ) +{ + enum { WordSize = sizeof(size_type) }; + + if ( ! Cuda::host_mirror_device_type::is_initialized() ) { + const std::string msg("Cuda::initialize ERROR : Cuda::host_mirror_device_type is not initialized"); + throw_runtime_exception( msg ); + } + + const CudaInternalDevices & dev_info = CudaInternalDevices::singleton(); + + const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ; + + const bool ok_id = 0 <= cuda_device_id && + cuda_device_id < dev_info.m_cudaDevCount ; + + // Need device capability 2.0 or better + + const bool ok_dev = ok_id && + ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major && + 0 <= dev_info.m_cudaProp[ cuda_device_id ].minor ); + + if ( ok_init && ok_dev ) { + + const struct cudaDeviceProp & cudaProp = + dev_info.m_cudaProp[ cuda_device_id ]; + + m_cudaDev = cuda_device_id ; + + CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) ); + CUDA_SAFE_CALL( cudaDeviceReset() ); + Kokkos::Impl::cuda_device_synchronize(); + + //---------------------------------- + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + + // HCE 2012-February : + // Found bug in CUDA 4.1 that sometimes a kernel launch would fail + // if the thread count == 1024 and a functor is passed to the kernel. + // Copying the kernel to constant memory and then launching with + // thread count == 1024 would work fine. + // + // HCE 2012-October : + // All compute capabilities support at least 16 warps (512 threads). + // However, we have found that 8 warps typically gives better performance. + + m_maxWarpCount = 8 ; + + // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ; + + if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) { + m_maxWarpCount = Impl::CudaTraits::WarpSize ; + } + + m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ; + + //---------------------------------- + + m_maxBlock = cudaProp.maxGridSize[0] ; + + //---------------------------------- + + m_scratchUnifiedSupported = cudaProp.unifiedAddressing ; + + if ( ! m_scratchUnifiedSupported ) { + std::cout << "Kokkos::Cuda device " + << cudaProp.name << " capability " + << cudaProp.major << "." 
<< cudaProp.minor + << " does not support unified virtual address space" + << std::endl ; + } + + //---------------------------------- + // Multiblock reduction uses scratch flags for counters + // and scratch space for partial reduction values. + // Allocate some initial space. This will grow as needed. + + { + const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ; + + (void) scratch_unified( 16 * sizeof(size_type) ); + (void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) ); + (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) ); + } + } + else { + + std::ostringstream msg ; + msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ; + + if ( ! ok_init ) { + msg << " : Already initialized" ; + } + if ( ! ok_id ) { + msg << " : Device identifier out of range " + << "[0.." << dev_info.m_cudaDevCount << "]" ; + } + else if ( ! ok_dev ) { + msg << " : Device " ; + msg << dev_info.m_cudaProp[ cuda_device_id ].major ; + msg << "." ; + msg << dev_info.m_cudaProp[ cuda_device_id ].minor ; + msg << " has insufficient capability, required 2.0 or better" ; + } + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } +} + +//---------------------------------------------------------------------------- + +typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ; +enum { sizeScratchGrain = sizeof(ScratchGrain) }; + + +Cuda::size_type * +CudaInternal::scratch_flags( const Cuda::size_type size ) +{ + if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) { + + Cuda::memory_space::decrement( m_scratchFlags ); + + m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; + + m_scratchFlags = (size_type *) + Cuda::memory_space::allocate( + std::string("InternalScratchFlags") , + typeid( ScratchGrain ), + sizeof( ScratchGrain ), + m_scratchFlagsCount ); + + CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) ); + } + + return m_scratchFlags ; +} + +Cuda::size_type * +CudaInternal::scratch_space( const Cuda::size_type size ) +{ + if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) { + + Cuda::memory_space::decrement( m_scratchSpace ); + + m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; + + m_scratchSpace = (size_type *) + Cuda::memory_space::allocate( + std::string("InternalScratchSpace") , + typeid( ScratchGrain ), + sizeof( ScratchGrain ), + m_scratchSpaceCount ); + } + + return m_scratchSpace ; +} + +Cuda::size_type * +CudaInternal::scratch_unified( const Cuda::size_type size ) +{ + + if ( verify_is_initialized("scratch_unified") && m_scratchUnifiedSupported ) { + + const bool allocate = m_scratchUnifiedCount * sizeScratchGrain < size ; + const bool deallocate = m_scratchUnified && ( 0 == size || allocate ); + + if ( allocate || deallocate ) { + Kokkos::Impl::cuda_device_synchronize(); + } + + if ( deallocate ) { + + CUDA_SAFE_CALL( cudaFreeHost( m_scratchUnified ) ); + + m_scratchUnified = 0 ; + m_scratchUnifiedCount = 0 ; + } + + if ( allocate ) { + + m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; + + CUDA_SAFE_CALL( cudaHostAlloc( (void **)( & m_scratchUnified ) , + m_scratchUnifiedCount * sizeScratchGrain , + cudaHostAllocDefault ) ); + } + } + + return m_scratchUnified ; +} + +//---------------------------------------------------------------------------- + +void CudaInternal::finalize() +{ + if ( 0 != m_scratchSpace || 0 != 
m_scratchFlags ) { + + Cuda::memory_space::decrement( m_scratchSpace ); + Cuda::memory_space::decrement( m_scratchFlags ); + (void) scratch_unified( 0 ); + + m_cudaDev = -1 ; + m_maxWarpCount = 0 ; + m_maxBlock = 0 ; + m_maxSharedWords = 0 ; + m_scratchSpaceCount = 0 ; + m_scratchFlagsCount = 0 ; + m_scratchSpace = 0 ; + m_scratchFlags = 0 ; + } +} + +//---------------------------------------------------------------------------- + +Cuda::size_type cuda_internal_maximum_warp_count() +{ return CudaInternal::singleton().m_maxWarpCount ; } + +Cuda::size_type cuda_internal_maximum_grid_count() +{ return CudaInternal::singleton().m_maxBlock ; } + +Cuda::size_type cuda_internal_maximum_shared_words() +{ return CudaInternal::singleton().m_maxSharedWords ; } + +Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size ) +{ return CudaInternal::singleton().scratch_space( size ); } + +Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size ) +{ return CudaInternal::singleton().scratch_flags( size ); } + +Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size ) +{ return CudaInternal::singleton().scratch_unified( size ); } + + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +Cuda::size_type Cuda::detect_device_count() +{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; } + +int Cuda::is_initialized() +{ return Impl::CudaInternal::singleton().is_initialized(); } + +void Cuda::initialize( const Cuda::SelectDevice config ) +{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id ); } + +std::vector<unsigned> +Cuda::detect_device_arch() +{ + const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton(); + + std::vector<unsigned> output( s.m_cudaDevCount ); + + for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) { + output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ; + } + + return output ; +} + +Cuda::size_type Cuda::device_arch() +{ + const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ; + + int dev_arch = 0 ; + + if ( 0 <= dev_id ) { + const struct cudaDeviceProp & cudaProp = + Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ; + + dev_arch = cudaProp.major * 100 + cudaProp.minor ; + } + + return dev_arch ; +} + +void Cuda::finalize() +{ Impl::CudaInternal::singleton().finalize(); } + +void Cuda::print_configuration( std::ostream & s , const bool ) +{ Impl::CudaInternal::singleton().print_configuration( s ); } + +bool Cuda::sleep() { return false ; } + +bool Cuda::wake() { return true ; } + +void Cuda::fence() +{ + Kokkos::Impl::cuda_device_synchronize(); +} + +unsigned Cuda::team_max() +{ + return Impl::CudaInternal::singleton().m_maxWarpCount << Impl::CudaTraits::WarpIndexShift ; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp new file mode 100644 index 000000000..0604eb8eb --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp @@ -0,0 +1,69 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_INTERNAL_HPP +#define KOKKOS_CUDA_INTERNAL_HPP + +namespace Kokkos { +namespace Impl { + +void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 ); + +void cuda_device_synchronize(); + +inline +void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0) +{ + if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); } +} + + + +} +} + +#define CUDA_SAFE_CALL( call ) \ + Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ ) + +#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp new file mode 100644 index 000000000..cf4cfb11c --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -0,0 +1,829 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_PARALLEL_HPP +#define KOKKOS_CUDA_PARALLEL_HPP + +#include <iostream> +#include <stdio.h> + +#if defined( __CUDACC__ ) + +#include <utility> +#include <Kokkos_Parallel.hpp> + +#include <Cuda/Kokkos_CudaExec.hpp> +#include <Cuda/Kokkos_Cuda_ReduceScan.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class WorkSpec > +class ParallelFor< FunctorType , WorkSpec /* size_t */ , Cuda > { +private: + + const FunctorType m_functor ; + const Cuda::size_type m_work ; + + ParallelFor(); + ParallelFor & operator = ( const ParallelFor & ); + +public: + + inline + __device__ + void operator()(void) const + { + const Cuda::size_type work_stride = blockDim.x * gridDim.x ; + + for ( Cuda::size_type + iwork = threadIdx.x + blockDim.x * blockIdx.x ; + iwork < m_work ; + iwork += work_stride ) { + m_functor( iwork ); + } + } + + ParallelFor( const FunctorType & functor , + const size_t work ) + : m_functor( functor ) + , m_work( work ) + { + const dim3 block( CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1, 1); + const dim3 grid( std::min( ( m_work + block.x - 1 ) / block.x , cuda_internal_maximum_grid_count() ) , 1 , 1 ); + + CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 ); + } +}; + +template< class FunctorType > +class ParallelFor< FunctorType , ParallelWorkRequest , Cuda > { +private: + + const FunctorType m_functor ; + const ParallelWorkRequest m_work ; + const int m_shmem ; + + ParallelFor(); + ParallelFor & operator = ( const ParallelFor & ); + +public: + + inline + __device__ + void operator()(void) const + { + CudaExec exec( 0 , m_shmem ); + m_functor( Cuda( exec ) ); + } + + ParallelFor( const FunctorType & functor , + const ParallelWorkRequest & work ) + : m_functor( functor ) + , m_work( std::min( work.league_size , size_t(cuda_internal_maximum_grid_count()) ) , + std::min( work.team_size , size_t(CudaTraits::WarpSize * cuda_internal_maximum_warp_count()) ) ) + , m_shmem( FunctorShmemSize< FunctorType >::value( functor ) ) + { + const dim3 grid( m_work.league_size , 1 , 1 ); + const dim3 block( m_work.team_size , 1, 1 ); + + CudaParallelLaunch< ParallelFor >( *this , grid , block , m_shmem ); + } +}; + +} // namespace Impl +} // namespace Kokkos 
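
A minimal usage sketch (illustration only, not part of this patch): how the range-policy specialization Impl::ParallelFor< FunctorType , size_t , Cuda > defined above is typically reached through the public Kokkos::parallel_for() dispatch. The functor name "Scale" and the raw device pointer are hypothetical; the functor interface (a device_type typedef plus a const operator()(int)) follows the conventions this version of Kokkos appears to expect, and the code must be compiled with nvcc since the specializations above are guarded by __CUDACC__.

#include <Kokkos_Cuda.hpp>
#include <Kokkos_Parallel.hpp>

// Hypothetical functor: scales n entries of a device array in parallel.
struct Scale {
  typedef Kokkos::Cuda device_type ;   // selects the Cuda ParallelFor specialization
  double * x ;                         // device pointer, e.g. obtained via CudaSpace::allocate()
  double   alpha ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i ) const { x[i] *= alpha ; }
};

void run_scale( double * x_device , const size_t n )
{
  // Assumes Kokkos::Cuda (and its host mirror device) were initialized beforehand,
  // e.g. via Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) ).
  Scale f = { x_device , 2.0 };

  Kokkos::parallel_for( n , f );   // instantiates Impl::ParallelFor< Scale , size_t , Cuda >
  Kokkos::Cuda::fence();           // the kernel launch is asynchronous
}
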
+ +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType > +class ParallelFor< FunctorType , CudaWorkConfig , Cuda > { +public: + + const FunctorType m_work_functor ; + + inline + __device__ + void operator()(void) const + { + Cuda::size_type iwork = threadIdx.x + blockDim.x * ( + threadIdx.y + blockDim.y * ( + threadIdx.z + blockDim.z * ( + blockIdx.x + gridDim.x * ( + blockIdx.y + gridDim.y * ( + blockIdx.z ))))); + + m_work_functor( iwork ); + } + + ParallelFor( const FunctorType & functor , + const CudaWorkConfig & work_config ) + : m_work_functor( functor ) + { + const dim3 grid( work_config.grid[0] , + work_config.grid[1] , + work_config.grid[2] ); + + const dim3 block( work_config.block[0] , + work_config.block[1] , + work_config.block[2] ); + + CudaParallelLaunch< ParallelFor >( *this , grid , block , work_config.shared ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class WorkSpec > +class ParallelReduce< FunctorType , WorkSpec , Cuda > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + typedef typename Reduce::reference_type reference_type ; + typedef Cuda::size_type size_type ; + + // Algorithmic constraints: + // (a) blockSize is a power of two + // (b) blockDim.x == BlockSize == 1 << BlockSizeShift + // (c) blockDim.y == blockDim.z == 1 + + enum { WarpCount = 8 }; + enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value }; + enum { BlockSizeShift = power_of_two< BlockSize >::value }; + enum { BlockSizeMask = BlockSize - 1 }; + + enum { GridMaxComputeCapability_2x = 0x0ffff }; + enum { GridMax = BlockSize }; + + const FunctorType m_functor ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + size_type * m_unified_space ; + pointer_type m_host_pointer ; + size_type m_work ; + size_type m_work_per_block ; + size_type m_local_block_count ; + size_type m_global_block_begin ; + size_type m_global_block_count ; + + + __device__ inline + void operator()(void) const + { + extern __shared__ size_type shared_data[]; + + const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > + word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); + + { + reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value ); + + m_functor.init( value ); + + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + const size_type iwork_beg = blockIdx.x * m_work_per_block ; + const size_type iwork_end = iwork_beg + m_work_per_block < m_work + ? iwork_beg + m_work_per_block : m_work ; + + for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) { + m_functor( iwork , value ); + } + } + + // Reduce with final value at BlockSize - 1 location. 
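+    // Two-pass inter-block reduction: each block writes its partial result to
+    // global scratch space, then atomically increments a counter in the scratch
+    // flags.  Only the last block to finish sees the full count; it re-reads the
+    // per-block partials, reduces them, and the call below returns true so that
+    // block can publish the final value (to unified memory when available,
+    // otherwise to device scratch space).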
+ if ( cuda_single_inter_block_reduce_scan<false,BlockSize>( + m_functor , m_global_block_begin + blockIdx.x , m_global_block_count , + shared_data , m_scratch_space , m_scratch_flags ) ) { + + // This is the final block with the final result at the final threads' location + + size_type * const shared = shared_data + BlockSizeMask * word_count.value ; + size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; + + if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); } + + if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } + + for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; } + } + } + + ParallelReduce( const FunctorType & functor , + const size_t nwork , + const pointer_type result = 0 , + const bool execute_immediately = true ) + : m_functor( functor ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + , m_host_pointer( result ) + , m_work( nwork ) + , m_work_per_block( 0 ) + , m_local_block_count( 0 ) + , m_global_block_begin( 0 ) + , m_global_block_count( 0 ) + { + // At most 'max_grid' blocks: + const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize )); + + // How much work per block: + m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ; + + // How many block are really needed for this much work: + m_local_block_count = ( nwork + m_work_per_block - 1 ) / m_work_per_block ; + m_global_block_count = m_local_block_count ; + + m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); + m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) ); + + if ( execute_immediately ) { execute(); } + } + + inline + void execute() const + { + const dim3 grid( m_local_block_count , 1 , 1 ); + const dim3 block( BlockSize , 1 , 1 ); + const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( m_functor ); + + CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute + } + + void wait() const + { + Cuda::fence(); + + if ( m_host_pointer ) { + if ( m_unified_space ) { + const int count = Reduce::value_count( m_functor ); + for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; } + } + else { + const int size = Reduce::value_size( m_functor ); + DeepCopy<HostSpace,CudaSpace>( m_host_pointer , m_scratch_space , size ); + } + } + } +}; + + +template< class FunctorType > +class ParallelReduce< FunctorType , ParallelWorkRequest , Cuda > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + typedef typename Reduce::reference_type reference_type ; + typedef Cuda::size_type size_type ; + + // Algorithmic constraints: + // (a) blockSize is a power of two + // (b) blockDim.x == BlockSize == 1 << BlockSizeShift + // (b) blockDim.y == blockDim.z == 1 + + enum { WarpCount = 8 }; + enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value }; + enum { BlockSizeShift = power_of_two< BlockSize >::value }; + enum { BlockSizeMask = BlockSize - 1 }; + + enum { GridMaxComputeCapability_2x = 0x0ffff }; + enum { GridMax = BlockSize }; + + const FunctorType m_functor ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + size_type * m_unified_space ; + pointer_type m_host_pointer ; + size_type m_shmem_begin ; + size_type m_shmem_end ; + size_type 
m_local_block_count ; + size_type m_global_block_begin ; + size_type m_global_block_count ; + + __device__ inline + void operator()(void) const + { + extern __shared__ size_type shared_data[]; + + const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > + word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); + + { + reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value ); + + m_functor.init( value ); + + CudaExec exec( m_shmem_begin , m_shmem_end ); + + m_functor( Cuda( exec ) , value ); + } + + // Reduce with final value at BlockSize - 1 location. + if ( cuda_single_inter_block_reduce_scan<false,BlockSize>( + m_functor , m_global_block_begin + blockIdx.x , m_global_block_count , + shared_data , m_scratch_space , m_scratch_flags ) ) { + + // This is the final block with the final result at the final threads' location + + size_type * const shared = shared_data + BlockSizeMask * word_count.value ; + size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; + + if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); } + + if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } + + for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; } + } + } + + + ParallelReduce( const FunctorType & functor , + const ParallelWorkRequest & work , + const pointer_type result = 0 , + const bool execute_immediately = true ) + : m_functor( functor ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_unified_space( 0 ) + , m_host_pointer( result ) + , m_shmem_begin( cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( functor ) ) + , m_shmem_end( cuda_single_inter_block_reduce_scan_shmem<false,BlockSize>( functor ) + + FunctorShmemSize< FunctorType >::value( functor ) ) + , m_local_block_count( 0 ) + , m_global_block_begin( 0 ) + , m_global_block_count( 0 ) + { + m_local_block_count = std::min( int(GridMax) , int(work.league_size) ); + m_global_block_count = std::min( int(GridMax) , int(work.league_size) ); + m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); + m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) ); + + if ( execute_immediately ) { execute(); } + } + + inline + void execute() const + { + const dim3 grid( m_local_block_count , 1 , 1 ); + const dim3 block( BlockSize , 1 , 1 ); + + CudaParallelLaunch< ParallelReduce >( *this, grid, block, m_shmem_end ); // copy to device and execute + } + + void wait() const + { + Cuda::fence(); + + if ( m_host_pointer ) { + if ( m_unified_space ) { + const int count = Reduce::value_count( m_functor ); + for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; } + } + else { + const int size = Reduce::value_size( m_functor ); + DeepCopy<HostSpace,CudaSpace>( m_host_pointer , m_scratch_space , size ); + } + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class Functor > +class MultiFunctorParallelReduceMember ; + +template<> +class MultiFunctorParallelReduceMember<void> +{ +private: + + MultiFunctorParallelReduceMember( const MultiFunctorParallelReduceMember & ); + 
MultiFunctorParallelReduceMember & operator = ( const MultiFunctorParallelReduceMember & ); + +protected: + + MultiFunctorParallelReduceMember() {} + +public: + + virtual unsigned block_count() const = 0 ; + + virtual ~MultiFunctorParallelReduceMember() {} + + virtual void execute( void * const host_pointer , + const unsigned global_block_begin , + const unsigned global_block_count ) = 0 ; + + virtual void wait() const = 0 ; +}; + +template< class Functor > +class MultiFunctorParallelReduceMember : public MultiFunctorParallelReduceMember<void> { +public: + ParallelReduce< Functor , size_t , Cuda > m_functor ; + + MultiFunctorParallelReduceMember( const Functor & f , size_t nwork ) + : MultiFunctorParallelReduceMember<void>() + , m_functor( f , nwork , 0 , false ) + {} + + virtual unsigned block_count() const { return m_functor.m_local_block_count ; } + + virtual void execute( void * const host_pointer , + const unsigned global_block_begin , + const unsigned global_block_count ) + { + m_functor.m_host_pointer = typename ReduceAdapter< Functor >::pointer_type(host_pointer); + m_functor.m_global_block_begin = global_block_begin ; + m_functor.m_global_block_count = global_block_count ; + m_functor.execute(); + } + + virtual void wait() const { m_functor.wait(); } +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +template<> +class MultiFunctorParallelReduce< Cuda > +{ +private: + + typedef std::vector< Impl::MultiFunctorParallelReduceMember<void> * > MemberVector ; + + MemberVector m_functors ; + +public: + + MultiFunctorParallelReduce() + : m_functors() + {} + + ~MultiFunctorParallelReduce() + { + while ( ! m_functors.empty() ) { + delete m_functors.back(); + m_functors.pop_back(); + } + } + + template< class FunctorType > + void push_back( const size_t work_count , const FunctorType & f ) + { + m_functors.push_back( new Impl::MultiFunctorParallelReduceMember<FunctorType>( f , work_count ) ); + } + + void execute( void * host_pointer ) + { + typename MemberVector::iterator m ; + + Cuda::size_type block_count = 0 ; + + for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) { + block_count += (*m)->block_count(); + } + + Cuda::size_type block_offset = 0 ; + + for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) { + (*m)->execute( host_pointer , block_offset , block_count ); + block_offset += (*m)->block_count(); + } + } + + void wait() const + { + if ( ! 
m_functors.empty() ) { (m_functors.back())->wait(); } + } +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class WorkSpec > +class ParallelScan< FunctorType , WorkSpec , Cuda > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + typedef typename Reduce::reference_type reference_type ; + typedef Cuda::size_type size_type ; + + // Algorithmic constraints: + // (a) blockSize is a power of two + // (b) blockDim.x == BlockSize == 1 << BlockSizeShift + // (b) blockDim.y == blockDim.z == 1 + // (c) gridDim.x <= blockDim.x * blockDim.x + // (d) gridDim.y == gridDim.z == 1 + + // blockDim.x must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps) + // gridDim.x <= blockDim.x * blockDim.x + // + // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing + + enum { WarpCount = 4 }; + enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value }; + enum { BlockSizeShift = power_of_two< BlockSize >::value }; + enum { BlockSizeMask = BlockSize - 1 }; + + enum { GridMaxComputeCapability_2x = 0x0ffff }; + enum { GridMax = ( BlockSize * BlockSize ) < GridMaxComputeCapability_2x + ? ( BlockSize * BlockSize ) : GridMaxComputeCapability_2x }; + + const FunctorType m_functor ; + size_type * m_scratch_space ; + size_type * m_scratch_flags ; + const size_type m_work ; + size_type m_work_per_block ; + size_type m_final ; + + //---------------------------------------- + + __device__ inline + void initial(void) const + { + extern __shared__ size_type shared_data[]; + + const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > + word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); + + size_type * const shared_value = shared_data + word_count.value * threadIdx.x ; + + m_functor.init( Reduce::reference( shared_value ) ); + + // Number of blocks is bounded so that the reduction can be limited to two passes. + // Each thread block is given an approximately equal amount of work to perform. + // Accumulate the values for this block. + // The accumulation ordering does not match the final pass, but is arithmatically equivalent. + + const size_type iwork_beg = blockIdx.x * m_work_per_block ; + const size_type iwork_end = iwork_beg + m_work_per_block < m_work + ? iwork_beg + m_work_per_block : m_work ; + + for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) { + m_functor( iwork , Reduce::reference( shared_value ) , false ); + } + + // Reduce and scan, writing out scan of blocks' totals and block-groups' totals. + // Blocks' scan values are written to 'blockIdx.x' location. + // Block-groups' scan values are at: i = ( j * BlockSize - 1 ) for i < gridDim.x + cuda_single_inter_block_reduce_scan<true,BlockSize>( m_functor , blockIdx.x , gridDim.x , shared_data , m_scratch_space , m_scratch_flags ); + } + + //---------------------------------------- + + __device__ inline + void final(void) const + { + extern __shared__ size_type shared_data[]; + + const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > + word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); + + // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... 
} + size_type * const shared_prefix = shared_data + word_count.value * threadIdx.x ; + size_type * const shared_accum = shared_data + word_count.value * ( BlockSize + 1 ); + + // Starting value for this thread block is the previous block's total. + if ( blockIdx.x ) { + size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 ); + for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; } + } + else if ( 0 == threadIdx.x ) { + m_functor.init( Reduce::reference( shared_accum ) ); + } + + unsigned iwork_beg = blockIdx.x * m_work_per_block ; + const unsigned iwork_end = iwork_beg + m_work_per_block ; + + for ( ; iwork_beg < iwork_end ; iwork_beg += BlockSize ) { + + const unsigned iwork = threadIdx.x + iwork_beg ; + + __syncthreads(); // Don't overwrite previous iteration values until they are used + + m_functor.init( Reduce::reference( shared_prefix + word_count.value ) ); + + // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block + for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { + shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ; + } + + if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values. + + // Call functor to accumulate inclusive scan value for this work item + if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix + word_count.value ) , false ); } + + // Scan block values into locations shared_data[1..BlockSize] + cuda_intra_block_reduce_scan<true>( m_functor , Reduce::pointer_type(shared_data+word_count.value) ); + + { + size_type * const block_total = shared_data + word_count.value * blockDim.x ; + for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; } + } + + // Call functor with exclusive scan value + if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix ) , true ); } + } + } + + //---------------------------------------- + + __device__ inline + void operator()(void) const + { + if ( ! 
m_final ) { + initial(); + } + else { + final(); + } + } + + ParallelScan( const FunctorType & functor , + const size_t nwork ) + : m_functor( functor ) + , m_scratch_space( 0 ) + , m_scratch_flags( 0 ) + , m_work( nwork ) + , m_work_per_block( 0 ) + , m_final( false ) + { + // At most 'max_grid' blocks: + const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize )); + + // How much work per block: + m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ; + + // How many block are really needed for this much work: + const dim3 grid( ( nwork + m_work_per_block - 1 ) / m_work_per_block , 1 , 1 ); + const dim3 block( BlockSize , 1 , 1 ); + const int shmem = Reduce::value_size( functor ) * ( BlockSize + 2 ); + + m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * grid.x ); + m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 ); + + m_final = false ; + CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute + + m_final = true ; + CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute + } + + void wait() const { Cuda::fence(); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __CUDA_ARCH__ ) + +namespace Kokkos { +namespace Impl { + +template< typename Type > +struct CudaJoinFunctor { + typedef Type value_type ; + + KOKKOS_INLINE_FUNCTION + static void join( volatile value_type & update , + volatile const value_type & input ) + { update += input ; } +}; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +template< typename TypeLocal , typename TypeGlobal > +__device__ inline TypeGlobal Cuda::team_scan( const TypeLocal & value , TypeGlobal * const global_accum ) +{ + enum { BlockSizeMax = 512 }; + + __shared__ TypeGlobal base_data[ BlockSizeMax + 1 ]; + + __syncthreads(); // Don't write in to shared data until all threads have entered this function + + if ( 0 == threadIdx.x ) { base_data[0] = 0 ; } + + base_data[ threadIdx.x + 1 ] = value ; + + Impl::cuda_intra_block_reduce_scan<true>( Impl::CudaJoinFunctor<TypeGlobal>() , base_data + 1 ); + + if ( global_accum ) { + if ( blockDim.x == threadIdx.x + 1 ) { + base_data[ blockDim.x ] = atomic_fetch_add( global_accum , base_data[ blockDim.x ] ); + } + __syncthreads(); // Wait for atomic + base_data[ threadIdx.x ] += base_data[ blockDim.x ] ; + } + + return base_data[ threadIdx.x ]; +} + +template< typename Type > +__device__ inline Type Cuda::team_scan( const Type & value ) +{ return team_scan( value , (Type*) 0 ); } + +} // namespace Kokkos + +#else /* ! defined( __CUDA_ARCH__ ) */ + +namespace Kokkos { + +template< typename Type > inline Type Cuda::team_scan( const Type & ) { return 0 ; } + +template< typename TypeLocal , typename TypeGlobal > +inline TypeGlobal Cuda::team_scan( const TypeLocal & , TypeGlobal * const ) { return 0 ; } + +} // namespace Kokkos + +#endif /* ! 
defined( __CUDA_ARCH__ ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* defined( __CUDACC__ ) */ + +#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp new file mode 100644 index 000000000..d9f2d8f16 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -0,0 +1,267 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_REDUCESCAN_HPP +#define KOKKOS_CUDA_REDUCESCAN_HPP + +#if defined( __CUDACC__ ) + +#include <utility> + +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +// See section B.17 of Cuda C Programming Guide Version 3.2 +// for discussion of +// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) +// function qualifier which could be used to improve performance. 
+//---------------------------------------------------------------------------- +// Maximize shared memory and minimize L1 cache: +// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); +// For 2.0 capability: 48 KB shared and 16 KB L1 +//---------------------------------------------------------------------------- +// Must have consistent '__shared__' statement across all device kernels. +// Since there may be more than one kernel in a file then have to make this +// a simple array of words. +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +/* + * Algorithmic constraints: + * (a) blockDim.x is a power of two + * (b) blockDim.x <= 512 + * (c) blockDim.y == blockDim.z == 1 + */ +template< bool DoScan , class FunctorType > +__device__ +void cuda_intra_block_reduce_scan( const FunctorType & functor , + const typename ReduceAdapter< FunctorType >::pointer_type base_data ) +{ + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + const unsigned value_count = Reduce::value_count( functor ); + const unsigned BlockSizeMask = blockDim.x - 1 ; + + // Must have power of two thread count + + if ( BlockSizeMask & blockDim.x ) { cuda_abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); } + +#define BLOCK_REDUCE_STEP( R , TD , S ) \ + if ( ! ( R & ((1<<(S+1))-1) ) ) \ + { functor.join( Reduce::reference(TD) , Reduce::reference(TD - (value_count<<S))); } + +#define BLOCK_SCAN_STEP( TD , N , S ) \ + if ( N == (1<<S) ) \ + { functor.join( Reduce::reference(TD) , Reduce::reference(TD - (value_count<<S))); } + + const unsigned rtid_intra = threadIdx.x ^ BlockSizeMask ; + const pointer_type tdata_intra = base_data + value_count * threadIdx.x ; + + { // Intra-warp reduction: + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3) + BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4) + } + + __syncthreads(); // Wait for all warps to reduce + + { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations + const unsigned rtid_inter = ( threadIdx.x ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ; + + if ( rtid_inter < blockDim.x ) { + + const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask ); + + if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) } + if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) } + if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) } + if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) } + + if ( DoScan ) { + + int n = ( rtid_inter & 32 ) ? 32 : ( + ( rtid_inter & 64 ) ? 64 : ( + ( rtid_inter & 128 ) ? 128 : ( + ( rtid_inter & 256 ) ? 256 : 0 ))); + + if ( ! ( rtid_inter + n < blockDim.x ) ) n = 0 ; + + BLOCK_SCAN_STEP(tdata_inter,n,8) + BLOCK_SCAN_STEP(tdata_inter,n,7) + BLOCK_SCAN_STEP(tdata_inter,n,6) + BLOCK_SCAN_STEP(tdata_inter,n,5) + } + } + } + + __syncthreads(); // Wait for inter-warp reduce-scan to complete + + if ( DoScan ) { + int n = ( rtid_intra & 1 ) ? 1 : ( + ( rtid_intra & 2 ) ? 2 : ( + ( rtid_intra & 4 ) ? 4 : ( + ( rtid_intra & 8 ) ? 8 : ( + ( rtid_intra & 16 ) ? 16 : 0 )))); + + if ( ! 
( rtid_intra + n < blockDim.x ) ) n = 0 ; + + BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block(); + BLOCK_SCAN_STEP(tdata_intra,n,0) + } + +#undef BLOCK_SCAN_STEP +#undef BLOCK_REDUCE_STEP +} + +//---------------------------------------------------------------------------- +/**\brief Input value-per-thread starting at 'shared_data'. + * Reduction value at last thread's location. + * + * If 'DoScan' then write blocks' scan values and block-groups' scan values. + * + * Global reduce result is in the last threads' 'shared_data' location. + */ +template< bool DoScan , unsigned ArgBlockSize , class FunctorType > +__device__ +bool cuda_single_inter_block_reduce_scan( const FunctorType & functor , + const Cuda::size_type block_id , + const Cuda::size_type block_count , + Cuda::size_type * const shared_data , + Cuda::size_type * const global_data , + Cuda::size_type * const global_flags ) +{ + typedef Cuda::size_type size_type ; + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + typedef typename Reduce::reference_type reference_type ; + + enum { BlockSize = ArgBlockSize }; + enum { BlockSizeMask = BlockSize - 1 }; + enum { BlockSizeShift = power_of_two< BlockSize >::value }; + + const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > + word_count( Reduce::value_size( functor ) / sizeof(size_type) ); + + // Must have power of two thread count + if ( BlockSize != blockDim.x ) { cuda_abort("Cuda::cuda_inter_block_scan wrong blockDim.x"); } + + // Reduce the accumulation for the entire block. + cuda_intra_block_reduce_scan<false>( functor , pointer_type(shared_data) ); + + { + // Write accumulation total to global scratch space. + // Accumulation total is the last thread's data. + size_type * const shared = shared_data + word_count.value * BlockSizeMask ; + size_type * const global = global_data + word_count.value * block_id ; + + for ( size_type i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i] ; } + } + + // Contributing blocks note that their contribution has been completed via an atomic-increment flag + // If this block is not the last block to contribute to this group then the block is done. + const bool is_last_block = + ! __syncthreads_or( threadIdx.x ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ); + + if ( is_last_block ) { + + const size_type b = ( long(block_count) * long(threadIdx.x) ) >> BlockSizeShift ; + const size_type e = ( long(block_count) * long( threadIdx.x + 1 ) ) >> BlockSizeShift ; + + { + reference_type shared_value = Reduce::reference( shared_data + word_count.value * threadIdx.x ); + + functor.init( shared_value ); + + for ( size_type i = b ; i < e ; ++i ) { + functor.join( shared_value , Reduce::reference( global_data + word_count.value * i ) ); + } + } + + cuda_intra_block_reduce_scan<DoScan>( functor , pointer_type(shared_data) ); + + if ( DoScan ) { + + size_type * const shared_value = shared_data + word_count.value * ( threadIdx.x ? threadIdx.x - 1 : BlockSize ); + + if ( ! 
threadIdx.x ) { functor.init( Reduce::reference( shared_value ) ); } + + // Join previous inclusive scan value to each member + for ( size_type i = b ; i < e ; ++i ) { + size_type * const global_value = global_data + word_count.value * i ; + functor.join( Reduce::reference( shared_value ) , Reduce::reference( global_value ) ); + Reduce::copy( functor , global_value , shared_value ); + } + } + } + + return is_last_block ; +} + +template< bool DoScan , unsigned ArgBlockSize , class FunctorType > +inline +unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor ) +{ + return ( ArgBlockSize + 2 ) * ReduceAdapter< FunctorType >::value_size( functor ); +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #if defined( __CUDACC__ ) */ +#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp new file mode 100644 index 000000000..69120fdda --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -0,0 +1,323 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Cuda.hpp> + +namespace Kokkos { + + +// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs +// or other GPUs. 
We provide a generic definition (which is trivial +// and doesn't do what it claims to do) because we don't actually use +// this function unless we are on a suitable GPU, with a suitable +// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow" +// internal parameter depends both on the Device and the Scalar type, +// and it controls whether shfl_down() gets called.) +template<typename Scalar> +KOKKOS_INLINE_FUNCTION +Scalar shfl_down(const Scalar &val, const int& delta, const int& width){ + return val; +} + +template<> +KOKKOS_INLINE_FUNCTION +unsigned int shfl_down<unsigned int>(const unsigned int &val, const int& delta, const int& width){ +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + unsigned int tmp1 = val; + int tmp = *reinterpret_cast<int*>(&tmp1); + tmp = __shfl_down(tmp,delta,width); + return *reinterpret_cast<unsigned int*>(&tmp); + #else + return val; + #endif +#else + return val; +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +int shfl_down<int>(const int &val, const int& delta, const int& width){ +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + return __shfl_down(val,delta,width); + #else + return val; + #endif +#else + return val; +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +float shfl_down<float>(const float &val, const int& delta, const int& width){ +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + return __shfl_down(val,delta,width); + #else + return val; + #endif +#else + return val; +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +double shfl_down<double>(const double &val, const int& delta, const int& width){ +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + int lo = __double2loint(val); + int hi = __double2hiint(val); + lo = __shfl_down(lo,delta,width); + hi = __shfl_down(hi,delta,width); + return __hiloint2double(hi,lo); + #else + return val; + #endif +#else + return val; +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +long int shfl_down<long int>(const long int &val, const int& delta, const int& width){ +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + int lo = __double2loint(*reinterpret_cast<const double*>(&val)); + int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); + lo = __shfl_down(lo,delta,width); + hi = __shfl_down(hi,delta,width); + const double tmp = __hiloint2double(hi,lo); + return *(reinterpret_cast<const long int*>(&tmp)); + #else + return val; + #endif +#else + return val; +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +unsigned long shfl_down<unsigned long>(const unsigned long &val, const int& delta, const int& width){ +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + int lo = __double2loint(*reinterpret_cast<const double*>(&val)); + int hi = __double2hiint(*reinterpret_cast<const double*>(&val)); + lo = __shfl_down(lo,delta,width); + hi = __shfl_down(hi,delta,width); + const double tmp = __hiloint2double(hi,lo); + return *(reinterpret_cast<const unsigned long*>(&tmp)); + #else + return val; + #endif +#else + return val; +#endif +} + +template<int N> +struct Vectorization<Cuda,N> { + enum {increment = N}; + +#ifdef __CUDA_ARCH__ + KOKKOS_FORCEINLINE_FUNCTION + static int begin() { return threadIdx.x%N;} +#else + KOKKOS_FORCEINLINE_FUNCTION + static int begin() { return 0;} +#endif + + KOKKOS_FORCEINLINE_FUNCTION + static int thread_rank(const Cuda &dev) { + return dev.team_rank()/increment; + } + + KOKKOS_FORCEINLINE_FUNCTION + static int global_thread_rank(const Cuda &dev) { + return (dev.league_rank()*dev.team_size()+dev.team_rank())/increment; + } + + KOKKOS_FORCEINLINE_FUNCTION + static bool 
is_lane_0(const Cuda &dev) { + return (dev.team_rank()%increment)==0; + } + + template<class Scalar> + KOKKOS_INLINE_FUNCTION + static Scalar reduce(const Scalar& val) { + #ifdef __CUDA_ARCH__ + __shared__ Scalar result[256]; + Scalar myresult; + for(int k=0;k<blockDim.x;k+=256) { + const int tid = threadIdx.x - k; + if(tid > 0 && tid<256) { + result[tid] = val; + if ( (N > 1) && (tid%2==0) ) + result[tid] += result[tid+1]; + if ( (N > 2) && (tid%4==0) ) + result[tid] += result[tid+2]; + if ( (N > 4) && (tid%8==0) ) + result[tid] += result[tid+4]; + if ( (N > 8) && (tid%16==0) ) + result[tid] += result[tid+8]; + if ( (N > 16) && (tid%32==0) ) + result[tid] += result[tid+16]; + myresult = result[tid]; + } + if(blockDim.x>256) + __syncthreads(); + } + return myresult; + #else + return val; + #endif + } + +#ifdef __CUDA_ARCH__ + #if (__CUDA_ARCH__ >= 300) + KOKKOS_INLINE_FUNCTION + static int reduce(const int& val) { + int result = val; + if (N > 1) + result += shfl_down(result, 1,N); + if (N > 2) + result += shfl_down(result, 2,N); + if (N > 4) + result += shfl_down(result, 4,N); + if (N > 8) + result += shfl_down(result, 8,N); + if (N > 16) + result += shfl_down(result, 16,N); + return result; + } + + KOKKOS_INLINE_FUNCTION + static unsigned int reduce(const unsigned int& val) { + unsigned int result = val; + if (N > 1) + result += shfl_down(result, 1,N); + if (N > 2) + result += shfl_down(result, 2,N); + if (N > 4) + result += shfl_down(result, 4,N); + if (N > 8) + result += shfl_down(result, 8,N); + if (N > 16) + result += shfl_down(result, 16,N); + return result; + } + + KOKKOS_INLINE_FUNCTION + static long int reduce(const long int& val) { + long int result = val; + if (N > 1) + result += shfl_down(result, 1,N); + if (N > 2) + result += shfl_down(result, 2,N); + if (N > 4) + result += shfl_down(result, 4,N); + if (N > 8) + result += shfl_down(result, 8,N); + if (N > 16) + result += shfl_down(result, 16,N); + return result; + } + + KOKKOS_INLINE_FUNCTION + static unsigned long int reduce(const unsigned long int& val) { + unsigned long int result = val; + if (N > 1) + result += shfl_down(result, 1,N); + if (N > 2) + result += shfl_down(result, 2,N); + if (N > 4) + result += shfl_down(result, 4,N); + if (N > 8) + result += shfl_down(result, 8,N); + if (N > 16) + result += shfl_down(result, 16,N); + return result; + } + + KOKKOS_INLINE_FUNCTION + static float reduce(const float& val) { + float result = val; + if (N > 1) + result += shfl_down(result, 1,N); + if (N > 2) + result += shfl_down(result, 2,N); + if (N > 4) + result += shfl_down(result, 4,N); + if (N > 8) + result += shfl_down(result, 8,N); + if (N > 16) + result += shfl_down(result, 16,N); + return result; + } + + KOKKOS_INLINE_FUNCTION + static double reduce(const double& val) { + double result = val; + if (N > 1) + result += shfl_down(result, 1,N); + if (N > 2) + result += shfl_down(result, 2,N); + if (N > 4) + result += shfl_down(result, 4,N); + if (N > 8) + result += shfl_down(result, 8,N); + if (N > 16) + result += shfl_down(result, 16,N); + return result; + } + #endif +#endif + +}; +} + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp new file mode 100644 index 000000000..326f97587 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -0,0 +1,594 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_VIEW_HPP +#define KOKKOS_CUDA_VIEW_HPP + +#include <cstring> + +#if defined( __CUDACC__ ) +#include <cuda_runtime.h> +#endif + +#include <Kokkos_View.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_CudaSpace.hpp> +#include <Kokkos_CudaTypes.hpp> +#include <Cuda/Kokkos_Cuda_abort.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +struct AssertShapeBoundsAbort< CudaSpace > +{ + KOKKOS_INLINE_FUNCTION + static void apply( const size_t /* rank */ , + const size_t /* n0 */ , const size_t /* n1 */ , + const size_t /* n2 */ , const size_t /* n3 */ , + const size_t /* n4 */ , const size_t /* n5 */ , + const size_t /* n6 */ , const size_t /* n7 */ , + + const size_t /* arg_rank */ , + const size_t /* i0 */ , const size_t /* i1 */ , + const size_t /* i2 */ , const size_t /* i3 */ , + const size_t /* i4 */ , const size_t /* i5 */ , + const size_t /* i6 */ , const size_t /* i7 */ ) + { + Kokkos::cuda_abort("Kokkos::View array bounds violation"); + } +}; + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t' +// to be an 'unsigned long long'. This chould change with +// future version of Cuda and this typedef would have to +// change accordingly. 
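+//
+// Illustrative usage sketch (not part of the original Kokkos source),
+// showing how the texture-object helpers declared below are intended to be
+// used.  Here 'view_ptr' is a hypothetical device allocation backing a View,
+// and the 4-byte alias type 'int' stands in for any 4-byte scalar such as
+// float:
+//
+//   const float * view_ptr = /* device allocation */ 0 ;
+//   cuda_texture_object_type tex =
+//     cuda_texture_object_attach< int >( (const void *) view_ptr );
+//
+// With CUDA >= 5.0, 'tex' is a ::cudaTextureObject_t suitable for
+// tex1Dfetch<int>( tex , i ) as done in CudaTextureFetch::operator[] below;
+// with older CUDA versions the fallback overload simply returns 0 and reads
+// go through ordinary global memory instead.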
+ +#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION ) + +typedef enable_if< + sizeof(::cudaTextureObject_t) == sizeof(const void *) , + ::cudaTextureObject_t >::type cuda_texture_object_type ; + +cuda_texture_object_type +cuda_texture_object_attach( + const cudaChannelFormatDesc & , + const void * const ); + +template< typename TextureType > +inline +cuda_texture_object_type +cuda_texture_object_attach( const void * const base_view_ptr ) +{ + return cuda_texture_object_attach( cudaCreateChannelDesc<TextureType>() , base_view_ptr ); +} + +#else + +typedef const void * cuda_texture_object_type ; + +template< typename TextureType > +inline +cuda_texture_object_type +cuda_texture_object_attach( const void * const ) +{ return 0 ; } + +#endif + +//---------------------------------------------------------------------------- + +// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4) +// Via reinterpret_case this can be used to support all scalar types of those sizes. +// Any other scalar type falls back to either normal reads out of global memory, +// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0) + +template< typename T, size_t size = sizeof(T) > +struct alias_type { + typedef void type; +}; + +template< typename T > +struct alias_type<T,4> { + typedef int type; +}; + +template< typename T > +struct alias_type<T,8> { + typedef int2 type; +}; + +template< typename T > +struct alias_type<T,16> { + typedef int4 type; +}; + +template< typename ValueType, typename AliasType = typename alias_type<ValueType>::type > +struct CudaTextureFetch { + private: + + cuda_texture_object_type obj ; + + public: + + const ValueType * ptr ; + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} + + KOKKOS_INLINE_FUNCTION + ~CudaTextureFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs ) + : obj( rhs.obj ) , ptr( rhs.ptr ) {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) + { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } + + explicit + CudaTextureFetch( const ValueType * const base_view_ptr ) + : obj( cuda_texture_object_attach<AliasType>( base_view_ptr ) ) + , ptr( base_view_ptr ) {} + + template< typename iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { + #if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) + // Enable the usage of the _ldg intrinsic even in cases where texture fetches work + // Currently texture fetches are faster, but that might change in the future + #ifdef KOKKOS_USE_LDG_INTRINSIC + return _ldg(&ptr[i]); + #else + AliasType v = tex1Dfetch<AliasType>( obj , i ); + + return *(reinterpret_cast<ValueType*> (&v)); + #endif + #else + return ptr[ i ]; + #endif + } +}; + +template< typename ValueType > +struct CudaTextureFetch< const ValueType, void > { +private: + + cuda_texture_object_type obj ; + +public: + + const ValueType * ptr ; + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} + + KOKKOS_INLINE_FUNCTION + ~CudaTextureFetch() {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch( const CudaTextureFetch & rhs ) + : obj( rhs.obj ) , ptr( rhs.ptr ) {} + + KOKKOS_INLINE_FUNCTION + CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) + { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } + + explicit + CudaTextureFetch( ValueType * const base_view_ptr ) + : obj( cuda_texture_object_attach<ValueType>( base_view_ptr ) ) + , ptr( base_view_ptr ) {} + + template< typename 
iType > + KOKKOS_INLINE_FUNCTION + ValueType operator[]( const iType & i ) const + { + #if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) + return _ldg(&ptr[i]); + #else + return ptr[ i ]; + #endif + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct ViewCudaTexture {}; + +#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION ) + +/** \brief Replace ViewDefault specialization with Cuda texture fetch specialization + * if 'const' value type and random access. + */ +template< class ValueType , class MemoryTraits > +struct ViewSpecialize< const ValueType , void , LayoutLeft , CudaSpace , MemoryTraits > +{ + typedef typename if_c< MemoryTraits::RandomAccess , ViewCudaTexture , ViewDefault >::type type ; +}; + +template< class ValueType , class MemoryTraits > +struct ViewSpecialize< const ValueType , void , LayoutRight , CudaSpace , MemoryTraits > +{ + typedef typename if_c< MemoryTraits::RandomAccess , ViewCudaTexture , ViewDefault >::type type ; +}; + +#endif + +//---------------------------------------------------------------------------- + +template<> +struct ViewAssignment< ViewCudaTexture , ViewCudaTexture , void > +{ + /** \brief Assign compatible views */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,ViewCudaTexture> & dst , + const View<ST,SL,SD,SM,ViewCudaTexture> & src , + const typename enable_if<( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value + ) >::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_texture.ptr ); + + dst.m_texture = src.m_texture ; + dst.m_offset_map.assign( src.m_offset_map ); + dst.m_tracking = src.m_tracking ; + + dst.m_tracking.increment( dst.m_texture.ptr ); + } +}; + + +template<> +struct ViewAssignment< ViewCudaTexture , ViewDefault , void > +{ + /** \brief Assign compatible views */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + inline + ViewAssignment( View<DT,DL,DD,DM,ViewCudaTexture> & dst , + const View<ST,SL,SD,SM,ViewDefault> & src , + const typename enable_if<( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::value + )>::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_texture.ptr ); + + dst.m_texture = CudaTextureFetch< typename ViewTraits<DT,DL,DD,DM>::value_type >( src.m_ptr_on_device ); + + dst.m_offset_map.assign( src.m_offset_map ); + + dst.m_tracking = src.m_tracking ; + + dst.m_tracking.increment( dst.m_texture.ptr ); + } +}; + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +template< class T , class L, class D , class M > +class View< T , L , D , M , Impl::ViewCudaTexture > + : public ViewTraits< T , L , D , M > +{ +public: + + typedef ViewTraits< T , L , D , M > traits ; + +private: + + template< class , class , class > friend struct Impl::ViewAssignment ; + + typedef Impl::ViewOffset< typename traits::shape_type + , typename traits::array_layout + > offset_map_type ; + + Impl::CudaTextureFetch<typename traits::value_type > m_texture ; + 
offset_map_type m_offset_map ; + Impl::ViewTracking< traits > m_tracking ; + +public: + + typedef Impl::ViewCudaTexture specialize ; + + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type::host_mirror_device_type , + void > HostMirror ; + + enum { Rank = traits::rank }; + + KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type size() const + { + return m_offset_map.N0 + * m_offset_map.N1 + * m_offset_map.N2 + * m_offset_map.N3 + * m_offset_map.N4 + * m_offset_map.N5 + * m_offset_map.N6 + * m_offset_map.N7 + ; + } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + typename traits::size_type dimension( const iType & i ) const + { return Impl::dimension( m_offset_map , i ); } + + //------------------------------------ + + View() : m_texture() + { m_offset_map.assign(0,0,0,0,0,0,0,0); } + + KOKKOS_INLINE_FUNCTION + ~View() { m_tracking.decrement( m_texture.ptr ); } + + View( const View & rhs ) + : m_texture( rhs.m_texture ) + { + m_offset_map.assign( rhs.m_offset_map ); + m_tracking = rhs.m_tracking ; + m_tracking.increment( m_texture.ptr ); + } + + View & operator = ( const View & rhs ) + { + (void)Impl::ViewAssignment< Impl::ViewCudaTexture , Impl::ViewCudaTexture >( *this , rhs ); + return *this ; + } + + template< class RT , class RL, class RD , class RM , class RS > + View( const View<RT,RL,RD,RM,RS> & rhs ) + : m_texture(0) + { + Impl::ViewAssignment< Impl::ViewCudaTexture , RS >( *this , rhs ); + } + + template< class RT , class RL, class RD, class RM , class RS > + View & operator = ( const View<RT,RL,RD,RM,RS> & rhs ) + { + Impl::ViewAssignment< Impl::ViewCudaTexture , RS >( *this , rhs ); + return *this ; + } + + template< typename TT > + explicit inline + View( TT * ptr , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + typename Impl::enable_if<( + Impl::is_same<TT,typename traits::value_type>::value + ), const size_t >::type n7 = 0 ) + : m_texture( Impl::CudaTextureFetch< typename traits::value_type >(ptr)) + { + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + m_tracking = false ; + } + + //------------------------------------ + + KOKKOS_FORCEINLINE_FUNCTION + bool is_null() const { return 0 == m_texture.ptr ; } + + //------------------------------------ + // Rank = 1 access operators: + + template < typename iType0 > + 
KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , traits, typename traits::array_layout, 1 , iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + return m_texture[ i0 ]; + } + + template < typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , traits , typename traits::array_layout, 1 , iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + return m_texture[ i0 ]; + } + + template< typename iType0 , typename iType1 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , traits, typename traits::array_layout, 2, iType0, iType1 >::type + operator() ( const iType0 & i0 , const iType1 & i1 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , + traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1,i2) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , + traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1,i2,i3) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , + traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3, iType4 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1,i2,i3,i4) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , + traits, typename traits::array_layout, 6, iType0, iType1, iType2, iType3, iType4, iType5 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename 
traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , + traits, typename traits::array_layout, 7, iType0, iType1, iType2, iType3, iType4, iType5, iType6 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 , typename iType7 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type , + traits, typename traits::array_layout, 8, iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); + + return m_texture[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + } + + //------------------------------------ + + KOKKOS_FORCEINLINE_FUNCTION + typename traits::value_type * ptr_on_device() const { return m_texture.ptr ; } + + // Stride of physical storage, dimensioned to at least Rank + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const { m_offset_map.stride(s); } +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ + diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp new file mode 100644 index 000000000..14d63b0e4 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp @@ -0,0 +1,101 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ABORT_HPP
+#define KOKKOS_CUDA_ABORT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ )
+
+#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
+#error "Cuda version 4.1 or greater required"
+#endif
+
+#if ( __CUDA_ARCH__ < 200 )
+#error "Cuda device capability 2.0 or greater required"
+#endif
+
+extern "C" {
+/* Cuda runtime function, declared in <crt/device_runtime.h>
+ * Requires capability 2.x or better.
+ */
+extern __device__ void __assertfail(
+  const void *message,
+  const void *file,
+  unsigned int line,
+  const void *function,
+  size_t charsize);
+}
+
+namespace Kokkos {
+
+__device__ inline
+void cuda_abort( const char * const message )
+{
+  const char empty[] = "" ;
+
+  __assertfail( (const void *) message ,
+                (const void *) empty ,
+                (unsigned int) 0 ,
+                (const void *) empty ,
+                sizeof(char) );
+}
+
+} // namespace Kokkos
+
+#else
+
+namespace Kokkos {
+KOKKOS_INLINE_FUNCTION
+void cuda_abort( const char * const ) {}
+}
+
+#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
+
diff --git a/lib/kokkos/core/src/KokkosCore_config.h b/lib/kokkos/core/src/KokkosCore_config.h
new file mode 100644
index 000000000..e69de29bb
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
new file mode 100644
index 000000000..77813ada9
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -0,0 +1,195 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos: Manycore Performance-Portable Multidimensional Arrays
+// Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Atomic.hpp +/// \brief Atomic functions +/// +/// This header file defines prototypes for the following atomic functions: +/// - exchange +/// - compare and exchange +/// - add +/// +/// Supported types include: +/// - signed and unsigned 4 and 8 byte integers +/// - float +/// - double +/// +/// They are implemented through GCC compatible intrinsics, OpenMP +/// directives and native CUDA intrinsics. +/// +/// Including this header file requires one of the following +/// compilers: +/// - NVCC (for CUDA device code only) +/// - GCC (for host code only) +/// - Intel (for host code only) +/// - A compiler that supports OpenMP 3.1 (for host code only) + +#ifndef KOKKOS_ATOMIC_HPP +#define KOKKOS_ATOMIC_HPP + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_Traits.hpp> + +//---------------------------------------------------------------------------- + +#if defined( __CUDA_ARCH__ ) + +// Compiling NVIDIA device code, must use Cuda atomics: + +#define KOKKOS_ATOMICS_USE_CUDA + +#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \ + ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \ + ! defined( KOKKOS_ATOMICS_USE_OMP31 ) + +// Compiling for non-Cuda atomic implementation has not been pre-selected. +// Choose the best implementation for the detected compiler. 
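+// (Illustrative example, not part of the original header: a host-only
+//  translation unit compiled as
+//      g++ -c my_file.cpp
+//  defines __GNUC__ and therefore selects the KOKKOS_ATOMICS_USE_GCC branch
+//  below, whereas adding an explicit -DKOKKOS_ATOMICS_USE_OMP31 to the
+//  compile line satisfies the guard above and bypasses this auto-detection.
+//  The file name 'my_file.cpp' is hypothetical.)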
+// Preference: GCC, INTEL, OMP31
+
+#if defined( __GNUC__ ) || defined( __GNUG__ ) || defined( __clang__ )
+
+#define KOKKOS_ATOMICS_USE_GCC
+
+#elif defined( __INTEL_COMPILER ) || defined( _CRAYC)
+
+#define KOKKOS_ATOMICS_USE_INTEL
+
+#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
+
+#define KOKKOS_ATOMICS_USE_OMP31
+
+#else
+
+#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
+
+#endif
+
+#endif /* Not pre-selected atomic implementation */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+
+inline
+const char * atomic_query_version()
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  return "KOKKOS_ATOMICS_USE_CUDA" ;
+#elif defined( KOKKOS_ATOMICS_USE_GCC )
+  return "KOKKOS_ATOMICS_USE_GCC" ;
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  return "KOKKOS_ATOMICS_USE_INTEL" ;
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  return "KOKKOS_ATOMICS_USE_OMP31" ;
+#endif
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+// Atomic exchange
+//
+// template< typename T >
+// T atomic_exchange( volatile T* const dest , const T val )
+// { T tmp = *dest ; *dest = val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Exchange.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic compare-and-exchange
+//
+// template<class T>
+// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
+
+#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and add
+//
+// template<class T>
+// T atomic_fetch_add(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest += val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and or
+//
+// template<class T>
+// T atomic_fetch_or(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and and
+//
+// template<class T>
+// T atomic_fetch_and(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_And.hpp"
+
+//----------------------------------------------------------------------------
+// Memory fence
+//
+// All loads and stores from this thread will be globally consistent before continuing
+//
+// void memory_fence() {...};
+#include "impl/Kokkos_Memory_Fence.hpp"
+
+//----------------------------------------------------------------------------
+// Provide volatile_load and safe_load
+//
+// T volatile_load(T const volatile * const ptr);
+//
+// T const& safe_load(T const * const ptr);
+// XEON PHI
+// T safe_load(T const * const ptr);
+
+#include "impl/Kokkos_Volatile_Load.hpp"
+
+#endif /* KOKKOS_ATOMIC_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_CrsArray.hpp b/lib/kokkos/core/src/Kokkos_CrsArray.hpp
new file mode 100644
index 000000000..8f1b83812
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_CrsArray.hpp
@@ -0,0 +1,170 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos
+// Manycore Performance-Portable Multidimensional Arrays
+//
+// Copyright (2012) Sandia 
Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CRSARRAY_HPP +#define KOKKOS_CRSARRAY_HPP + +#include <string> +#include <vector> + +#include <Kokkos_View.hpp> + +namespace Kokkos { + +/// \class CrsArray +/// \brief Compressed row storage array. +/// +/// \tparam DataType The type of stored entries. If a CrsArray is +/// used as the graph of a sparse matrix, then this is usually an +/// integer type, the type of the column indices in the sparse +/// matrix. +/// +/// \tparam Arg1Type The second template parameter, corresponding +/// either to the Device type (if there are no more template +/// parameters) or to the Layout type (if there is at least one more +/// template parameter). +/// +/// \tparam Arg2Type The third template parameter, which if provided +/// corresponds to the Device type. +/// +/// \tparam SizeType The type of row offsets. Usually the default +/// parameter suffices. However, setting a nondefault value is +/// necessary in some cases, for example, if you want to have a +/// sparse matrices with dimensions (and therefore column indices) +/// that fit in \c int, but want to store more than <tt>INT_MAX</tt> +/// entries in the sparse matrix. +/// +/// A row has a range of entries: +/// <ul> +/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li> +/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li> +/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li> +/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... 
); </tt> </li> +/// </ul> +template< class DataType, + class Arg1Type, + class Arg2Type = void, + typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type> +class CrsArray { +private: + typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits; + +public: + typedef DataType data_type; + typedef typename traits::array_layout array_layout; + typedef typename traits::device_type device_type; + typedef SizeType size_type; + + typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type; + typedef CrsArray< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror; + typedef View< const size_type* , array_layout, device_type > row_map_type; + typedef View< DataType* , array_layout, device_type > entries_type; + + entries_type entries; + row_map_type row_map; + + //! Construct an empty view. + CrsArray () : entries(), row_map() {} + + //! Copy constructor (shallow copy). + CrsArray (const CrsArray& rhs) : entries (rhs.entries), row_map (rhs.row_map) + {} + + /** \brief Assign to a view of the rhs array. + * If the old view is the last view + * then allocated memory is deallocated. + */ + CrsArray& operator= (const CrsArray& rhs) { + entries = rhs.entries; + row_map = rhs.row_map; + return *this; + } + + /** \brief Destroy this view of the array. + * If the last view then allocated memory is deallocated. + */ + ~CrsArray() {} +}; + +//---------------------------------------------------------------------------- + +template< class CrsArrayType , class InputSizeType > +typename CrsArrayType::crsarray_type +create_crsarray( const std::string & label , + const std::vector< InputSizeType > & input ); + +template< class CrsArrayType , class InputSizeType > +typename CrsArrayType::crsarray_type +create_crsarray( const std::string & label , + const std::vector< std::vector< InputSizeType > > & input ); + +//---------------------------------------------------------------------------- + +template< class DataType , + class Arg1Type , + class Arg2Type , + typename SizeType > +typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input ); + +template< class DataType , + class Arg1Type , + class Arg2Type , + typename SizeType > +typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & input ); + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_CrsArray_factory.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_CRSARRAY_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp new file mode 100644 index 000000000..d42601104 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -0,0 +1,323 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDA_HPP +#define KOKKOS_CUDA_HPP + +#include <iosfwd> +#include <vector> + +#include <Kokkos_Macros.hpp> +#ifdef KOKKOS_HAVE_OPENMP +#include <Kokkos_OpenMP.hpp> +#else +#ifdef KOKKOS_HAVE_PTHREAD +#include <Kokkos_Threads.hpp> +#else +#include <Kokkos_Serial.hpp> +#endif +#endif +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_CudaSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class CudaExec ; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/// \class Cuda +/// \brief Kokkos device that uses CUDA to run on GPUs. +/// +/// A "device" represents a parallel execution model. It tells Kokkos +/// how to parallelize the execution of kernels in a parallel_for or +/// parallel_reduce. For example, the Threads device uses Pthreads or +/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language +/// extensions, and the Serial device executes "parallel" kernels +/// sequentially. The Cuda device uses NVIDIA's CUDA programming +/// model to execute kernels in parallel on GPUs. +class Cuda { +public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! The device type (same as this class). + typedef Cuda device_type ; + //! This device's preferred memory space. + typedef CudaSpace memory_space ; + //! The size_type typedef best suited for this device. + typedef CudaSpace::size_type size_type ; + //! This device's preferred array layout. + typedef LayoutLeft array_layout ; + //! This device's host mirror type. 
+#ifdef KOKKOS_HAVE_OPENMP + typedef Kokkos::OpenMP host_mirror_device_type ; +#else +#ifdef KOKKOS_HAVE_PTHREAD + typedef Kokkos::Threads host_mirror_device_type ; +#else + typedef Kokkos::Serial host_mirror_device_type ; +#endif +#endif + //@} + //! \name Functions that all Kokkos devices must implement. + //@{ + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + KOKKOS_INLINE_FUNCTION static int in_parallel() { +#if defined( __CUDA_ARCH__ ) + return true; +#else + return false; +#endif + } + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. + * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). + static bool wake(); + + /// \brief Wait until all dispatched functors complete. + /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void fence(); + + //! Free any resources being consumed by the device. + static void finalize(); + + //! Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + //@} + //-------------------------------------------------------------------------- + //! \name Device-specific functions + //@{ + + struct SelectDevice { + int cuda_device_id ; + SelectDevice() : cuda_device_id(0) {} + explicit SelectDevice( int id ) : cuda_device_id( id ) {} + }; + + //! Initialize, telling the CUDA run-time library which device to use. + static void initialize( const SelectDevice = SelectDevice() ); + + static int is_initialized(); + + /// \brief Cuda device architecture of the selected device. + /// + /// This matches the __CUDA_ARCH__ specification. + static size_type device_arch(); + + //! Query device count. + static size_type detect_device_count(); + + /** \brief Detect the available devices and their architecture + * as defined by the __CUDA_ARCH__ specification. + */ + static std::vector<unsigned> detect_device_arch(); + + static unsigned team_max(); + + //@} + //-------------------------------------------------------------------------- +#if defined( __CUDA_ARCH__ ) + //! \name Functions for the functor device interface + //@{ + + __device__ inline int league_size() const { return gridDim.x ; } + __device__ inline int league_rank() const { return blockIdx.x ; } + + __device__ inline int team_size() const { return blockDim.x ; } + __device__ inline int team_rank() const { return threadIdx.x ; } + + __device__ inline void team_barrier() const { __syncthreads(); } + __device__ inline unsigned int team_barrier_count(bool value) const + { return __syncthreads_count(value); } + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
+ * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + __device__ inline Type team_scan( const Type & value ); + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename TypeLocal , typename TypeGlobal > + __device__ inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); + + + //! Get a pointer to shared memory for this team. + __device__ inline void * get_shmem( const int size ); + + __device__ inline Cuda( Impl::CudaExec & exec ) : m_exec(exec) {} + __device__ inline Cuda( const Cuda & rhs ) : m_exec(rhs.m_exec) {} + + //@} + //-------------------------------------------------------------------------- + +private: + + Impl::CudaExec & m_exec ; + + //-------------------------------------------------------------------------- +#else + + int league_size() const ; + int league_rank() const ; + + int team_size() const ; + int team_rank() const ; + + void team_barrier() const ; + unsigned int team_barrier_count(bool) const ; + + template< typename T > + inline T team_scan(const T& value); + + template< typename TypeLocal , typename TypeGlobal > + inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); + + void * get_shmem( const int size ); + + Cuda( Impl::CudaExec & ); + +#endif + +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Cuda-specific parallel work configuration */ + +struct CudaWorkConfig { + Cuda::size_type grid[3] ; //< Grid dimensions + Cuda::size_type block[3] ; //< Block dimensions + Cuda::size_type shared ; //< Shared memory size + + CudaWorkConfig() + { + enum { WarpSize = 32 }; + grid[0] = grid[1] = grid[2] = 1 ; + block[1] = block[2] = 1 ; + block[0] = 8 * WarpSize ; + shared = 0 ; + } +}; + +template< class FunctorType > +inline +void parallel_for( const CudaWorkConfig & work_config , + const FunctorType & functor ) +{ + Impl::ParallelFor< FunctorType , CudaWorkConfig , Cuda > + ( work_config , functor ); +} + +template< class FunctorType , class FinalizeType > +inline +void parallel_reduce( const CudaWorkConfig & work_config , + const FunctorType & functor , + const FinalizeType & finalize ); + +template< class FunctorType > +inline +typename FunctorType::value_type +parallel_reduce( const CudaWorkConfig & work_config , + const FunctorType & functor ); + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +#include <Cuda/Kokkos_CudaExec.hpp> +#include <Cuda/Kokkos_Cuda_View.hpp> +#include <Cuda/Kokkos_Cuda_Parallel.hpp> + +#endif /* #ifndef KOKKOS_CUDA_HPP */ + +//---------------------------------------------------------------------------- + + diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp new file mode 100644 index 000000000..ff6238551 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -0,0 +1,184 @@ +/* +//@HEADER +// ************************************************************************ +// +// 
Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDASPACE_HPP +#define KOKKOS_CUDASPACE_HPP + +#if defined( __CUDACC__ ) +#include <cuda_runtime.h> +#endif + +#include <iosfwd> +#include <typeinfo> +#include <string> + +#include <Kokkos_Macros.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Cuda/Kokkos_Cuda_abort.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Cuda memory management */ + +class CudaSpace { +public: + + typedef CudaSpace memory_space ; + typedef unsigned int size_type ; + + /** \brief Allocate a contiguous block of memory on the Cuda device + * with size = scalar_size * scalar_count. + * + * The input label is associated with the block of memory. + * The block of memory is tracked via reference counting where + * allocation gives it a reference count of one. + * + * Allocation may only occur on the master thread of the process. + */ + static void * allocate( const std::string & label , + const std::type_info & scalar_type , + const size_t scalar_size , + const size_t scalar_count ); + + /** \brief Increment the reference count of the block of memory + * in which the input pointer resides. + * + * Reference counting only occurs on the master thread. + */ + static void increment( const void * ); + + /** \brief Decrement the reference count of the block of memory + * in which the input pointer resides. If the reference + * count falls to zero the memory is deallocated. + * + * Reference counting only occurs on the master thread. + */ + static void decrement( const void * ); + + /** \brief Print all tracked memory to the output stream. 
*/ + static void print_memory_view( std::ostream & ); + + /** \brief Retrieve label associated with the input pointer */ + static std::string query_label( const void * ); + + /*--------------------------------*/ + + static void access_error(); + static void access_error( const void * const ); + + /*--------------------------------*/ +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +struct DeepCopy<HostSpace,CudaSpace> { + DeepCopy( void * dst , const void * src , size_t ); +}; + +template<> +struct DeepCopy<CudaSpace,HostSpace> { + DeepCopy( void * dst , const void * src , size_t ); +}; + +template<> +struct DeepCopy<CudaSpace,CudaSpace> { + DeepCopy( void * dst , const void * src , size_t ); +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Cuda code accessing Cuda data is good. */ +template<> +struct VerifyExecutionSpaceCanAccessDataSpace< CudaSpace , CudaSpace > +{ + KOKKOS_INLINE_FUNCTION static void verify( void ) {} + KOKKOS_INLINE_FUNCTION static void verify( const void * ) {} +}; + +/** \brief Cuda code accessing non-Cuda data is bad. */ +template<> +struct VerifyExecutionSpaceCanAccessDataSpace< CudaSpace , HostSpace > +{ + KOKKOS_INLINE_FUNCTION static void verify(void) + { Kokkos::cuda_abort("Cuda code called function restricted to HostSpace"); } + + KOKKOS_INLINE_FUNCTION static void verify( const void * ) + { Kokkos::cuda_abort("Cuda code attempted to access HostSpace memory"); } +}; + +/** \brief Produce error message when trying to access Cuda + * memory on the host. + */ +template<> +struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , CudaSpace > +{ +#ifdef KOKKOS_USE_UVM + inline static void verify( void ) { } + inline static void verify( const void * p ) { } +#else + inline static void verify( void ) { CudaSpace::access_error(); } + inline static void verify( const void * p ) { CudaSpace::access_error(p); } +#endif +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_CUDASPACE_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_CudaTypes.hpp b/lib/kokkos/core/src/Kokkos_CudaTypes.hpp new file mode 100644 index 000000000..899e7e1fa --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_CudaTypes.hpp @@ -0,0 +1,139 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CUDATYPES_HPP +#define KOKKOS_CUDATYPES_HPP + +#include <Kokkos_Macros.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) + +namespace Kokkos { + +typedef ::int2 int2 ; +typedef ::int3 int3 ; +typedef ::int4 int4 ; + +typedef ::float2 float2 ; +typedef ::float3 float3 ; +typedef ::float4 float4 ; + +typedef ::double2 double2 ; +typedef ::double3 double3 ; +typedef ::double4 double4 ; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#else /* NOT #if defined( __CUDACC__ ) */ + +namespace Kokkos { + +struct int2 { + int x; + int y; +}; + +struct int3 { + int x; + int y; + int z; +}; + +struct int4 { + int x; + int y; + int z; + int w; +}; + +struct float2 { + float x; + float y; +}; + +struct float3 { + float x; + float y; + float z; +}; + +struct float4 { + float x; + float y; + float z; + float w; +}; + +struct double2 { + double x; + double y; +}; + +struct double3 { + double x; + double y; + double z; +}; + +struct double4 { + double x; + double y; + double z; + double w; +}; + +} // namespace Kokkos + +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_CUDATYPES_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp new file mode 100644 index 000000000..c42ad0a4e --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -0,0 +1,144 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOSTSPACE_HPP +#define KOKKOS_HOSTSPACE_HPP + +#include <iosfwd> +#include <typeinfo> +#include <string> + +#include <Kokkos_Macros.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_MemoryTracking.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Memory management on the host for devices */ + +class HostSpace { +public: + + typedef HostSpace memory_space ; + typedef size_t size_type ; + + /** \brief Allocate a contiguous block of memory on the Cuda device + * with size = scalar_size * scalar_count. + * + * The input label is associated with the block of memory. + * The block of memory is tracked via reference counting where + * allocation gives it a reference count of one. + * + * Allocation may only occur on the master thread of the process. + */ + static void * allocate( const std::string & label , + const std::type_info & scalar_type , + const size_t scalar_size , + const size_t scalar_count ); + + /** \brief Increment the reference count of the block of memory + * in which the input pointer resides. + * + * Reference counting only occurs on the master thread. + */ + static void increment( const void * ); + + /** \brief Decrement the reference count of the block of memory + * in which the input pointer resides. If the reference + * count falls to zero the memory is deallocated. + * + * Reference counting only occurs on the master thread. + */ + static void decrement( const void * ); + + /*--------------------------------*/ + + /** \brief Print all tracked memory to the output stream. 
*/ + static void print_memory_view( std::ostream & ); + + /** \brief Retrieve label associated with the input pointer */ + static std::string query_label( const void * ); + + /*--------------------------------*/ + /* Functions unique to the HostSpace */ + + static int in_parallel(); + + static void register_in_parallel( int (*)() ); +}; + +//---------------------------------------------------------------------------- + +template< class ExecutionSpace , class DataSpace > +struct VerifyExecutionSpaceCanAccessDataSpace ; + +template<> +struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , HostSpace > +{ + inline static void verify(void) {} + inline static void verify(const void *) {} +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class , class > struct DeepCopy ; + +template<> +struct DeepCopy<HostSpace,HostSpace> { + DeepCopy( void * dst , const void * src , size_t n ); +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_HOSTSPACE_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp new file mode 100644 index 000000000..99f8056ea --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -0,0 +1,164 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Layout.hpp +/// \brief Declaration of various \c MemoryLayout options. 
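+///
+/// As an added illustration (not part of the original header, and
+/// ignoring any padding of allocations), the two basic layouts declared
+/// below map a rank-2 multi-index (i0,i1) of a hypothetical N0 x N1
+/// view to a flat offset as follows:
+/// \code
+/// // LayoutLeft  (Fortran / "column major"): i0 is the contiguous index.
+/// //   offset = i0 + N0 * i1
+/// // LayoutRight (C / "row major"): i1 is the contiguous index.
+/// //   offset = i1 + N1 * i0
+/// \endcode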
+ +#ifndef KOKKOS_LAYOUT_HPP +#define KOKKOS_LAYOUT_HPP + +#include <stddef.h> +#include <impl/Kokkos_Traits.hpp> + +namespace Kokkos { + +//---------------------------------------------------------------------------- +/// \struct LayoutLeft +/// \brief Memory layout tag indicating left-to-right (Fortran scheme) +/// striding of multi-indices. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. +/// +/// "Layout left" indicates a mapping where the leftmost index i0 +/// refers to contiguous access, and strides increase for dimensions +/// going right from there (i1, i2, ...). This layout imitates how +/// Fortran stores multi-dimensional arrays. For the special case of +/// a two-dimensional array, "layout left" is also called "column +/// major." +struct LayoutLeft { typedef LayoutLeft array_layout ; }; + +//---------------------------------------------------------------------------- +/// \struct LayoutRight +/// \brief Memory layout tag indicating right-to-left (C or +/// lexigraphical scheme) striding of multi-indices. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. +/// +/// "Right layout" indicates a mapping where the rightmost index ik +/// refers to contiguous access, and strides increase for dimensions +/// going left from there. This layout imitates how C stores +/// multi-dimensional arrays. For the special case of a +/// two-dimensional array, "layout right" is also called "row major." +struct LayoutRight { typedef LayoutRight array_layout ; }; + +//---------------------------------------------------------------------------- +/// \struct LayoutStride +/// \brief Memory layout tag indicated arbitrarily strided +/// multi-index mapping into contiguous memory. +struct LayoutStride { + typedef LayoutStride array_layout ; + + enum { MAX_RANK = 8 }; + + size_t dimension[ MAX_RANK ] ; + size_t stride[ MAX_RANK ] ; + + /** \brief Compute strides from ordered dimensions. + * + * Values of order uniquely form the set [0..rank) + * and specify ordering of the dimensions. + * Order = {0,1,2,...} is LayoutLeft + * Order = {...,2,1,0} is LayoutRight + */ + template< typename iTypeOrder , typename iTypeDimen > + KOKKOS_INLINE_FUNCTION static + LayoutStride order_dimensions( int const rank + , iTypeOrder const * const order + , iTypeDimen const * const dimen ) + { + LayoutStride tmp ; + // Verify valid rank order: + int check_input = MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ; + for ( int r = 0 ; r < MAX_RANK ; ++r ) { + tmp.dimension[r] = 0 ; + tmp.stride[r] = 0 ; + check_input &= ~int( 1 << order[r] ); + } + if ( 0 == check_input ) { + size_t n = 1 ; + for ( int r = 0 ; r < rank ; ++r ) { + tmp.stride[ order[r] ] = n ; + n *= ( tmp.dimension[r] = dimen[r] ); + } + } + return tmp ; + } +}; + +//---------------------------------------------------------------------------- +/// \struct LayoutTileLeft +/// \brief Memory layout tag indicating left-to-right (Fortran scheme) +/// striding of multi-indices by tiles. +/// +/// This is an example of a \c MemoryLayout template parameter of +/// View. The memory layout describes how View maps from a +/// multi-index (i0, i1, ..., ik) to a memory location. 
+/// +/// "Tiled layout" indicates a mapping to contiguously stored +/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two +/// dimensions. Indices are LayoutLeft within each tile, and the +/// tiles themselves are arranged using LayoutLeft. Note that the +/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be +/// compile-time constants. This speeds up index calculations. If +/// both tile dimensions are powers of two, Kokkos can optimize +/// further. +template < unsigned ArgN0 , unsigned ArgN1 , + bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value && + Impl::is_power_of_two<ArgN1>::value ) + > +struct LayoutTileLeft { + typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ; + enum { N0 = ArgN0 }; + enum { N1 = ArgN1 }; +}; + +} // namespace Kokkos + +#endif // #ifndef KOKKOS_LAYOUT_HPP + diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp new file mode 100644 index 000000000..11faf98ce --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -0,0 +1,227 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MACROS_HPP +#define KOKKOS_MACROS_HPP + +#include <KokkosCore_config.h> +#include <impl/Kokkos_Compiler_Macros.hpp> + +namespace Kokkos { +class HostSpace ; +class CudaSpace ; +} // namespace Kokkos + + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) + +// Compiling with CUDA compiler. + +#if ! 
defined( KOKKOS_HAVE_CUDA ) +#error "Compiling Kokkos with Cuda compiler but KOKKOS_HAVE_CUDA is undefined" +#endif + +#include <cuda.h> + +/* Compiling with a CUDA compiler for device code. + * + * Include <cuda.h> to pick up the CUDA_VERSION macro defined as: + * CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 ) + * + * When generating device code the __CUDA_ARCH__ macro is defined as: + * __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 ) + */ +#if ! defined( CUDA_VERSION ) +#error "#include <cuda.h> did not define CUDA_VERSION" +#endif + +#if ( CUDA_VERSION < 4010 ) +#error "Cuda version 4.1 or greater required" +#endif + +#endif /* #if defined( __CUDACC__ ) */ + +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) + +/* Compiling with CUDA compiler for device code. */ + +#if ( __CUDA_ARCH__ < 200 ) +#error "Cuda device capability >= 2.0 is required" +#endif + +#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ +#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline +#define KOKKOS_FUNCTION __device__ __host__ + +#endif /* #if defined( __CUDACC__ ) && #if defined( __CUDA_ARCH__ ) */ + +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ ) + +/* Compiling with CUDA compiler for host code. */ + +#define KOKKOS_FORCEINLINE_FUNCTION __forceinline__ + +#endif /* #if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __INTEL_COMPILER ) + +#if (__INTEL_COMPILER < 1200) +#define KOKKOS_DISABLE_ASM true; +#endif + +/* Compiling with Intel compiler */ +/* TBD: Version testing */ + +#ifndef KOKKOS_FORCEINLINE_FUNCTION +#define KOKKOS_FORCEINLINE_FUNCTION __forceinline +#endif + +#if defined( __MIC__ ) + +/* Compiling with Intel compiler for execution on an Intel MIC device. + * These devices are used in no-offload mode so the HostSpace is the MIC space. + */ + +#else + +#ifndef KOKKOS_USE_PRAGMA_SIMD +#define KOKKOS_USE_PRAGMA_SIMD +#endif + +/* + #pragma simd vectorlength(N) + #pragma ivdep +*/ + +#endif /* #if defined( __MIC__ ) */ + +#endif /* #if defined( __INTEL_COMPILER ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __GNUC__ ) /* GNU C */ || \ + defined( __GNUG__ ) /* GNU C++ */ + +/* Compiling with GNU compiler */ + +#ifndef KOKKOS_FORCEINLINE_FUNCTION +#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) +#endif + +/* Compiling with GNU compatible compiler. */ + +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( _OPENMP ) + +#if ! defined( KOKKOS_HAVE_OPENMP ) +#error "Compiling Kokkos for OpenMP but KOKKOS_HAVE_OPENMP is undefined" +#endif + +/* Compiling with OpenMP. + * The value of _OPENMP is an integer value YYYYMM + * where YYYY and MM are the year and month designation + * of the supported OpenMP API version. 
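+ *
+ * For example (illustration added here): a compiler reporting
+ * _OPENMP == 201107 supports the OpenMP 3.1 API (July 2011).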
+ */ + +#endif /* END: #if defined( _OPENMP ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#ifndef KOKKOS_FUNCTION +#define KOKKOS_FUNCTION /* */ +#endif + +#ifndef KOKKOS_INLINE_FUNCTION +#define KOKKOS_INLINE_FUNCTION inline +#endif + +#ifndef KOKKOS_FORCEINLINE_FUNCTION +#define KOKKOS_FORCEINLINE_FUNCTION inline +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) + +namespace Kokkos { typedef CudaSpace ExecutionSpace ; } + +#else + +namespace Kokkos { typedef HostSpace ExecutionSpace ; } + +#endif + +#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \ + Kokkos::VerifyExecutionSpaceCanAccessDataSpace< \ + Kokkos::ExecutionSpace , DATA_SPACE >::verify( DATA_PTR ) + +#define KOKKOS_RESTRICT_EXECUTION_TO( DATA_SPACE ) \ + Kokkos::VerifyExecutionSpaceCanAccessDataSpace< \ + Kokkos::ExecutionSpace , DATA_SPACE >::verify() + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + + +#endif /* #ifndef KOKKOS_MACROS_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp new file mode 100644 index 000000000..4da3884d4 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp @@ -0,0 +1,111 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MEMORYTRAITS_HPP +#define KOKKOS_MEMORYTRAITS_HPP + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Memory access traits for views, an extension point. + * + * These traits should be orthogonal. If there are dependencies then + * the MemoryTraits template must detect and enforce dependencies. + * + * A zero value is the default for a View, indicating that none of + * these traits are present. + */ +enum MemoryTraitsFlags + { Unmanaged = 0x01 + , RandomAccess = 0x02 + }; + +template < unsigned T > +struct MemoryTraits { + enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) }; + enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) }; + + typedef MemoryTraits memory_traits ; +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { + +typedef Kokkos::MemoryTraits<0> MemoryManaged ; +typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ; +typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Memory alignment settings + * + * Sets global value for memory alignment. + * Enable compatibility of views from different devices with static stride. + * Use compiler flag to enable overwrites. + */ +enum { MEMORY_ALIGNMENT = +#if defined( KOKKOS_MEMORY_ALIGNMENT ) + KOKKOS_MEMORY_ALIGNMENT +#else + 128 +#endif + }; + +enum { MEMORY_ALIGNMENT_THRESHOLD = 4 }; + +} //namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp new file mode 100644 index 000000000..d9326e712 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_HPP +#define KOKKOS_OPENMP_HPP + +#include <Kokkos_Macros.hpp> + +#if defined(KOKKOS_HAVE_OPENMP) + +#include <omp.h> +#include <cstddef> +#include <iosfwd> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Layout.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class OpenMPexec ; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/// \class OpenMP +/// \brief Kokkos device for multicore processors in the host memory space. +class OpenMP { +public: + //------------------------------------ + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + typedef OpenMP device_type ; + typedef HostSpace::size_type size_type ; + typedef HostSpace memory_space ; + typedef LayoutRight array_layout ; + typedef OpenMP host_mirror_device_type ; + + //@} + //------------------------------------ + //! \name Functions that all Kokkos devices must implement. + //@{ + + inline static bool in_parallel() { return omp_in_parallel(); } + + /** \brief Set the device in a "sleep" state. A noop for OpenMP. */ + static bool sleep(); + + /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ + static bool wake(); + + /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ + static void fence() {} + + /// \brief Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + /// \brief Free any resources being consumed by the device. + static void finalize(); + + /** \brief Initialize the device. + * + * 1) If the hardware locality library is enabled and OpenMP has not + * already bound threads then bind OpenMP threads to maximize + * core utilization and group for memory hierarchy locality. + * + * 2) Allocate a HostThread for each OpenMP thread to hold its + * topology and fan in/out data. + */ +#if 0 + static void initialize( const unsigned team_count = 1 , + const unsigned threads_per_team = 1 , + const unsigned use_numa_count = 0 , + const unsigned use_cores_per_numa = 0 ); +#endif + + static void initialize( unsigned thread_count = 0 , + unsigned use_numa_count = 0 , + unsigned use_cores_per_numa = 0 ); + + static int is_initialized(); + + KOKKOS_FUNCTION static unsigned league_max(); + KOKKOS_FUNCTION static unsigned team_max(); + //@} + //------------------------------------ + //! 
\name Function for the functor device interface */ + //@{ + + KOKKOS_INLINE_FUNCTION int league_rank() const ; + KOKKOS_INLINE_FUNCTION int league_size() const ; + KOKKOS_INLINE_FUNCTION int team_rank() const ; + KOKKOS_INLINE_FUNCTION int team_size() const ; + + KOKKOS_INLINE_FUNCTION void team_barrier(); + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ); + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename TypeLocal , typename TypeGlobal > + KOKKOS_INLINE_FUNCTION TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); + + + KOKKOS_INLINE_FUNCTION void * get_shmem( const int size ); + + explicit inline OpenMP( Impl::OpenMPexec & ); + + //------------------------------------ + +private: + + Impl::OpenMPexec & m_exec ; + +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +#include <OpenMP/Kokkos_OpenMPexec.hpp> +#include <OpenMP/Kokkos_OpenMP_Parallel.hpp> + +/*--------------------------------------------------------------------------*/ + +#endif /* #if defined(KOKKOS_HAVE_OPENMP) */ +#endif /* #ifndef KOKKOS_OPENMP_HPP */ + + diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp new file mode 100644 index 000000000..c69273cb8 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -0,0 +1,457 @@ +/// \file Kokkos_Pair.hpp +/// \brief Declaration and definition of Kokkos::pair. +/// +/// This header file declares and defines Kokkos::pair and its related +/// nonmember functions. + +#ifndef KOKKOS_PAIR_HPP +#define KOKKOS_PAIR_HPP + +#include <Kokkos_Macros.hpp> +#include <utility> + +namespace Kokkos { +/// \struct pair +/// \brief Replacement for std::pair that works on CUDA devices. +/// +/// The instance methods of std::pair, including its constructors, are +/// not marked as <tt>__device__</tt> functions. Thus, they cannot be +/// called on a CUDA device, such as an NVIDIA GPU. This struct +/// implements the same interface as std::pair, but can be used on a +/// CUDA device as well as on the host. +template <class T1, class T2> +struct pair +{ + //! The first template parameter of this class. + typedef T1 first_type; + //! The second template parameter of this class. + typedef T2 second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Default constructor. + /// + /// This calls the default constructors of T1 and T2. It won't + /// compile if those default constructors are not defined and + /// public. + KOKKOS_FORCEINLINE_FUNCTION + pair() + : first(), second() + {} + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. 
+ KOKKOS_FORCEINLINE_FUNCTION + pair(first_type const& f, second_type const& s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<T1, T2> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +template <class T1, class T2> +struct pair<T1&, T2&> +{ + //! The first template parameter of this class. + typedef T1& first_type; + //! The second template parameter of this class. + typedef T2& second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type f, second_type s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<first_type, second_type> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +template <class T1, class T2> +struct pair<T1, T2&> +{ + //! The first template parameter of this class. + typedef T1 first_type; + //! The second template parameter of this class. + typedef T2& second_type; + + //! 
The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type const& f, second_type s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<first_type, second_type> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. + /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +template <class T1, class T2> +struct pair<T1&, T2> +{ + //! The first template parameter of this class. + typedef T1& first_type; + //! The second template parameter of this class. + typedef T2 second_type; + + //! The first element of the pair. + first_type first; + //! The second element of the pair. + second_type second; + + /// \brief Constructor that takes both elements of the pair. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + KOKKOS_FORCEINLINE_FUNCTION + pair(first_type f, second_type const& s) + : first(f), second(s) + {} + + /// \brief Copy constructor. + /// + /// This calls the copy constructors of T1 and T2. It won't compile + /// if those copy constructors are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,V> &p) + : first(p.first), second(p.second) + {} + + // from std::pair<U,V> + template <class U, class V> + pair( const std::pair<U,V> &p) + : first(p.first), second(p.second) + {} + + /// \brief Assignment operator. + /// + /// This calls the assignment operators of T1 and T2. It won't + /// compile if the assignment operators are not defined and public. + template <class U, class V> + KOKKOS_FORCEINLINE_FUNCTION + pair<first_type, second_type> & operator=(const pair<U,V> &p) + { + first = p.first; + second = p.second; + return *this; + } + + /// \brief Return the std::pair version of this object. + /// + /// This is <i>not</i> a device function; you may not call it on a + /// CUDA device. It is meant to be called on the host, if the user + /// wants an std::pair instead of a Kokkos::pair. 
+ /// + /// \note This is not a conversion operator, since defining a + /// conversion operator made the relational operators have + /// ambiguous definitions. + std::pair<T1,T2> to_std_pair() const + { return std::make_pair(first,second); } +}; + +//! Equality operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return lhs.first==rhs.first && lhs.second==rhs.second; } + +//! Inequality operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return !(lhs==rhs); } + +//! Less-than operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); } + +//! Less-than-or-equal-to operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return !(rhs<lhs); } + +//! Greater-than operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return rhs<lhs; } + +//! Greater-than-or-equal-to operator for Kokkos::pair. +template <class T1, class T2> +KOKKOS_FORCEINLINE_FUNCTION +bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs) +{ return !(lhs<rhs); } + +/// \brief Return a new pair. +/// +/// This is a "nonmember constructor" for Kokkos::pair. It works just +/// like std::make_pair. +template <class T1,class T2> +KOKKOS_FORCEINLINE_FUNCTION +pair<T1,T2> make_pair (T1 x, T2 y) +{ return ( pair<T1,T2>(x,y) ); } + +/// \brief Return a pair of references to the input arguments. +/// +/// This compares to std::tie (new in C++11). You can use it to +/// assign to two variables at once, from the result of a function +/// that returns a pair. For example (<tt>__device__</tt> and +/// <tt>__host__</tt> attributes omitted for brevity): +/// \code +/// // Declaration of the function to call. +/// // First return value: operation count. +/// // Second return value: whether all operations succeeded. +/// Kokkos::pair<int, bool> someFunction (); +/// +/// // Code that uses Kokkos::tie. +/// int myFunction () { +/// int count = 0; +/// bool success = false; +/// +/// // This assigns to both count and success. +/// Kokkos::tie (count, success) = someFunction (); +/// +/// if (! success) { +/// // ... Some operation failed; +/// // take corrective action ... +/// } +/// return count; +/// } +/// \endcode +/// +/// The line that uses tie() could have been written like this: +/// \code +/// Kokkos::pair<int, bool> result = someFunction (); +/// count = result.first; +/// success = result.second; +/// \endcode +/// +/// Using tie() saves two lines of code and avoids a copy of each +/// element of the pair. The latter could be significant if one or +/// both elements of the pair are more substantial objects than \c int +/// or \c bool. +template <class T1,class T2> +KOKKOS_FORCEINLINE_FUNCTION +pair<T1 &,T2 &> tie (T1 & x, T2 & y) +{ return ( pair<T1 &,T2 &>(x,y) ); } + +// +// Specialization of Kokkos::pair for a \c void second argument. This +// is not actually a "pair"; it only contains one element, the first. 
+// +template <class T1> +struct pair<T1,void> +{ + typedef T1 first_type; + typedef void second_type; + + first_type first; + enum { second = 0 }; + + KOKKOS_FORCEINLINE_FUNCTION + pair() + : first() + {} + + KOKKOS_FORCEINLINE_FUNCTION + pair(const first_type & f) + : first(f) + {} + + KOKKOS_FORCEINLINE_FUNCTION + pair(const first_type & f, int) + : first(f) + {} + + template <class U> + KOKKOS_FORCEINLINE_FUNCTION + pair( const pair<U,void> &p) + : first(p.first) + {} + + template <class U> + KOKKOS_FORCEINLINE_FUNCTION + pair<T1, void> & operator=(const pair<U,void> &p) + { + first = p.first; + return *this; + } +}; + +// +// Specialization of relational operators for Kokkos::pair<T1,void>. +// + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return lhs.first==rhs.first; } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return !(lhs==rhs); } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return lhs.first<rhs.first; } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return !(rhs<lhs); } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return rhs<lhs; } + +template <class T1> +KOKKOS_FORCEINLINE_FUNCTION +bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs) +{ return !(lhs<rhs); } + +} // namespace Kokkos + + +#endif //KOKKOS_PAIR_HPP diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp new file mode 100644 index 000000000..95a1a87f6 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -0,0 +1,765 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Parallel.hpp +/// \brief Declaration of parallel operators + +#ifndef KOKKOS_PARALLEL_HPP +#define KOKKOS_PARALLEL_HPP + +#include <cstddef> +#include <Kokkos_Macros.hpp> +#include <Kokkos_View.hpp> +#include <impl/Kokkos_Traits.hpp> + +namespace Kokkos { +#if defined ( KOKKOS_HAVE_CUDA ) +class Cuda ; +#endif +#if defined ( KOKKOS_HAVE_OPENMP ) +class OpenMP ; +#endif +#if defined ( KOKKOS_HAVE_PTHREAD ) +class Threads ; +#endif +#if defined ( KOKKOS_HAVE_SERIAL ) +class Serial ; +#endif +} // namespace Kokkos + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + #if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) + typedef Cuda DefaultDeviceType; + #elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) + typedef OpenMP DefaultDeviceType; + #elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) + typedef Threads DefaultDeviceType; + #elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) && \ + !defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) + typedef Serial DefaultDeviceType; + #else + #if defined ( KOKKOS_HAVE_CUDA ) + typedef Kokkos::Cuda DefaultDeviceType; + #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA + #elif defined ( KOKKOS_HAVE_OPENMP ) + typedef OpenMP DefaultDeviceType; + #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP + #elif defined ( KOKKOS_HAVE_PTHREAD ) + typedef Threads DefaultDeviceType; + #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS + #else + typedef Serial DefaultDeviceType; + #define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL + #endif + #endif +} +} + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Enable = void > +struct FunctorHasDeviceType : public false_type {}; + +template< class FunctorType > +struct FunctorHasDeviceType< FunctorType , typename + enable_if< ! 
is_same<typename FunctorType::device_type,int>::value >::type > + : public true_type {}; +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/// \class ParallelFor +/// \brief Implementation of the ParallelFor operator that has a +/// partial specialization for the device. +/// +/// This is an implementation detail of parallel_for. Users should +/// skip this and go directly to the nonmember function parallel_for. +template< class FunctorType , + class WorkSpec , + class DeviceType = typename FunctorType::device_type > +class ParallelFor ; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +/// \class VectorParallel +/// \brief Request for parallel_for to attempt thread+vector parallelism. +struct VectorParallel +{ + const size_t nwork ; + VectorParallel( const size_t n ) : nwork(n) {} + operator size_t () const { return nwork ; } +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Execute \c functor \c work_count times in parallel. + * + * A "functor" is a class containing the function to execute in + * parallel, any data needed for that execution, and a \c device_type + * typedef. Here is an example functor for parallel_for: + * + * \code + * class FunctorType { + * public: + * typedef ... device_type ; + * void operator() (IntType iwork) const ; + * }; + * \endcode + * + * In the above example, \c IntType is any integer type for which a + * valid conversion from \c size_t to \c IntType exists. Its + * <tt>operator()</tt> method defines the operation to parallelize, + * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>. + * This compares to a single iteration \c iwork of a \c for loop. + */ +template< class FunctorType > +inline +void parallel_for( const size_t work_count , + const FunctorType & functor , + typename Impl::enable_if<Impl::FunctorHasDeviceType<FunctorType>::value,int>::type = 0 ) +{ + Impl::ParallelFor< FunctorType , size_t > tmp( functor , work_count ); +} + +template< class FunctorType > +inline +void parallel_for( const size_t work_count , + const FunctorType & functor , + typename Impl::enable_if<!Impl::FunctorHasDeviceType<FunctorType>::value,int>::type = 0 ) +{ + Impl::ParallelFor< FunctorType , size_t, Impl::DefaultDeviceType > + tmp( functor , work_count ); +} + +/** \brief Execute \c functor \c work_count times in parallel, with vectorization. + * + * This is like parallel_for, except that it <i>mandates</i> + * vectorization as well as parallelization of the given functor. We + * emphasize "mandates": this means that the user asserts that + * vectorization is correct, and insists that the compiler vectorize. + * Mandating vectorization is not always desirable, for example if the + * body of the functor is complicated. In some cases, users might + * want to parallelize over threads, and use vectorization inside the + * parallel operation. Furthermore, the compiler might still be able + * to vectorize through a parallel_for. Thus, users should take care + * not to use this execution option arbitrarily. 
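+ *
+ * As an illustrative sketch only (the functor name Scale, its data, and
+ * the choice of the OpenMP device are assumptions, not part of this
+ * header): a body with no loop-carried dependence is the kind of
+ * candidate for which mandating vectorization is safe.
+ *
+ * \code
+ * struct Scale {
+ *   typedef Kokkos::OpenMP device_type ;        // any host device will do
+ *   Kokkos::View<double*,device_type> x ;
+ *   double alpha ;
+ *   Scale( Kokkos::View<double*,device_type> x_ , double a )
+ *     : x(x_), alpha(a) {}
+ *   KOKKOS_INLINE_FUNCTION
+ *   void operator()( const size_t i ) const { x(i) *= alpha ; }
+ * };
+ *
+ * Kokkos::View<double*,Kokkos::OpenMP> x ("x", 1000);
+ * // Each iteration touches only x(i), so the vectorization assertion holds:
+ * Kokkos::vector_parallel_for( x.dimension_0() , Scale( x , 2.0 ) );
+ * \endcode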
+ */ +template< class FunctorType > +inline +void vector_parallel_for( const size_t work_count , + const FunctorType & functor ) +{ + Impl::ParallelFor< FunctorType , VectorParallel > tmp( functor , work_count ); +} + +template< class DeviceType > +class MultiFunctorParallelFor ; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/// \class ParallelReduce +/// \brief Implementation detail of parallel_reduce. +/// +/// This is an implementation detail of parallel_reduce. Users should +/// skip this and go directly to the nonmember function parallel_reduce. +template< class FunctorType , + class WorkSpec , + class DeviceType = typename FunctorType::device_type > +class ParallelReduce ; + +/// \class ReduceAdapter +/// \brief Implementation detail of parallel_reduce. +/// +/// This is an implementation detail of parallel_reduce. Users should +/// skip this and go directly to the nonmember function parallel_reduce. +template< class FunctorType , + class ValueType = typename FunctorType::value_type > +struct ReduceAdapter ; + +} // namespace Impl +} // namespace Kokkos + + +namespace Kokkos { + +/** \brief Parallel reduction + * + * Example of a parallel_reduce functor for a POD (plain old data) value type: + * \code + * class FunctorType { // For POD value type + * public: + * typedef ... device_type ; + * typedef <podType> value_type ; + * void operator()( <intType> iwork , <podType> & update ) const ; + * void init( <podType> & update ) const ; + * void join( volatile <podType> & update , + * volatile const <podType> & input ) const ; + * + * typedef true_type has_final ; + * void final( <podType> & update ) const ; + * }; + * \endcode + * + * Example of a parallel_reduce functor for an array of POD (plain old data) values: + * \code + * class FunctorType { // For array of POD value + * public: + * typedef ... device_type ; + * typedef <podType> value_type[] ; + * void operator()( <intType> , <podType> update[] ) const ; + * void init( <podType> update[] ) const ; + * void join( volatile <podType> update[] , + * volatile const <podType> input[] ) const ; + * + * typedef true_type has_final ; + * void final( <podType> update[] ) const ; + * }; + * \endcode + */ +template< class FunctorType > +inline +void parallel_reduce( const size_t work_count , + const FunctorType & functor ) +{ + Impl::ParallelReduce< FunctorType , size_t > reduce( functor , work_count ); +} + +/** \brief Parallel reduction and output to host. + * + * If FunctorType::value_type is + * - \c PodType, then \c reference_type is <tt>PodType & </tt>. + * - <tt>PodType[]</tt>, then \c reference_type is <tt>PodType * </tt>. 
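+ *
+ * A minimal sketch for the \c PodType case (the functor name Sum, the
+ * data, and the OpenMP device are illustrative assumptions):
+ *
+ * \code
+ * struct Sum {
+ *   typedef Kokkos::OpenMP device_type ;
+ *   typedef double value_type ;
+ *   Kokkos::View<const double*,device_type> x ;
+ *   Sum( Kokkos::View<const double*,device_type> x_ ) : x(x_) {}
+ *   KOKKOS_INLINE_FUNCTION
+ *   void operator()( const size_t i , value_type & update ) const { update += x(i); }
+ *   KOKKOS_INLINE_FUNCTION
+ *   void init( value_type & update ) const { update = 0 ; }
+ *   KOKKOS_INLINE_FUNCTION
+ *   void join( volatile value_type & update ,
+ *              volatile const value_type & input ) const { update += input ; }
+ * };
+ *
+ * Kokkos::View<double*,Kokkos::OpenMP> x ("x", 1000);
+ * double total = 0 ;
+ * Kokkos::parallel_reduce( x.dimension_0() , Sum(x) , total );
+ * // 'total' now holds the reduction result; reference_type here is double&.
+ * \endcode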
+ */ +template< class FunctorType > +inline +void parallel_reduce( const size_t work_count , + const FunctorType & functor , + typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result ) +{ + Impl::ParallelReduce< FunctorType, size_t > + reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); + + reduce.wait(); +} + +template< class FunctorType > +inline +void parallel_reduce( const VectorParallel & work_count , + const FunctorType & functor , + typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result ) +{ + Impl::ParallelReduce< FunctorType, VectorParallel > + reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); + + reduce.wait(); +} + +template< class DeviceType > +class MultiFunctorParallelReduce ; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/// \class ParallelScan +/// \brief Implementation detail of parallel_scan. +/// +/// This is an implementation detail of parallel_scan. Users should +/// skip this and go directly to the documentation of the nonmember +/// template function Kokkos::parallel_scan. +template< class FunctorType , + class WorkSpec , + class DeviceType = typename FunctorType::device_type > +class ParallelScan ; + +} // namespace Impl +} // namespace Kokkos + +namespace Kokkos { + +/// \fn parallel_scan +/// \tparam FunctorType Type of the scan functor. +/// +/// \param work_count [in] Number of work items. +/// \param functor [in] The scan functor. +/// +/// This function implements a parallel scan operation. The scan can +/// be either inclusive or exclusive, depending on how you implement +/// the scan functor. +/// +/// A scan functor looks almost exactly like a reduce functor, except +/// that its operator() takes a third \c bool argument, \c final_pass, +/// which indicates whether this is the last pass of the scan +/// operation. We will show below how to use the \c final_pass +/// argument to control whether the scan is inclusive or exclusive. +/// +/// Here is the minimum required interface of a scan functor for a POD +/// (plain old data) value type \c PodType. That is, the result is a +/// View of zero or more PodType. It is also possible for the result +/// to be an array of (same-sized) arrays of PodType, but we do not +/// show the required interface for that here. +/// \code +/// class ScanFunctor { +/// public: +/// // The Kokkos device type +/// typedef ... device_type; +/// // Type of an entry of the array containing the result; +/// // also the type of each of the entries combined using +/// // operator() or join(). +/// typedef PodType value_type; +/// typedef typename DeviceType::size_type size_type; +/// +/// void operator () (const size_type i, value_type& update, const bool final_pass) const; +/// void init (value_type& update) const; +/// void join (volatile value_type& update, volatile const value_type& input) const +/// }; +/// \endcode +/// +/// Here is an example of a functor which computes an inclusive plus-scan +/// of an array of \c int, in place. If given an array [1, 2, 3, 4], this +/// scan will overwrite that array with [1, 3, 6, 10]. 
+/// +/// \code +/// template<class DeviceType> +/// class InclScanFunctor { +/// public: +/// typedef DeviceType device_type; +/// typedef int value_type; +/// typedef typename DeviceType::size_type size_type; +/// +/// InclScanFunctor (Kokkos::View<value_type*, device_type> x) : x_ (x) {} +/// +/// void operator () (const size_type i, value_type& update, const bool final_pass) const { +/// update += x_(i); +/// if (final_pass) { +/// x_(i) = update; +/// } +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, device_type> x_; +/// }; +/// \endcode +/// +/// Here is an example of a functor which computes an <i>exclusive</i> +/// scan of an array of \c int, in place. In operator(), note both +/// that the final_pass test and the update have switched places, and +/// the use of a temporary. If given an array [1, 2, 3, 4], this scan +/// will overwrite that array with [0, 1, 3, 6]. +/// +/// \code +/// template<class DeviceType> +/// class ExclScanFunctor { +/// public: +/// typedef DeviceType device_type; +/// typedef int value_type; +/// typedef typename DeviceType::size_type size_type; +/// +/// ExclScanFunctor (Kokkos::View<value_type*, device_type> x) : x_ (x) {} +/// +/// void operator () (const size_type i, value_type& update, const bool final_pass) const { +/// const value_type x_i = x_(i); +/// if (final_pass) { +/// x_(i) = update; +/// } +/// update += x_i; +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, device_type> x_; +/// }; +/// \endcode +/// +/// Here is an example of a functor which builds on the above +/// exclusive scan example, to compute an offsets array from a +/// population count array, in place. We assume that the pop count +/// array has an extra entry at the end to store the final count. If +/// given an array [1, 2, 3, 4, 0], this scan will overwrite that +/// array with [0, 1, 3, 6, 10]. +/// +/// \code +/// template<class DeviceType> +/// class OffsetScanFunctor { +/// public: +/// typedef DeviceType device_type; +/// typedef int value_type; +/// typedef typename DeviceType::size_type size_type; +/// +/// // lastIndex_ is the last valid index (zero-based) of x. +/// // If x has length zero, then lastIndex_ won't be used anyway. +/// ExclScanFunctor (Kokkos::View<value_type*, device_type> x) : +/// x_ (x), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1) +/// {} +/// +/// void operator () (const size_type i, int& update, const bool final_pass) const { +/// const value_type x_i = x_(i); +/// if (final_pass) { +/// x_(i) = update; +/// } +/// update += x_i; +/// // The last entry of x_ gets the final sum. 
+/// if (final_pass && i == last_index_) { +/// x_(i) = update; +/// } +/// } +/// void init (value_type& update) const { +/// update = 0; +/// } +/// void join (volatile value_type& update, volatile const value_type& input) const { +/// update += input; +/// } +/// +/// private: +/// Kokkos::View<value_type*, device_type> x_; +/// const size_type last_index_; +/// }; +/// \endcode +/// +template< class FunctorType > +inline +void parallel_scan( const size_t work_count , + const FunctorType & functor ) +{ + Impl::ParallelScan< FunctorType , size_t > scan( functor , work_count ); +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \brief Parallel work request for shared memory, league size, and team size. + * + * If the shared size is too large then slow (global) memory will be used. + * If the league or team size are too large then they will be reduced. + */ +struct ParallelWorkRequest { + size_t league_size ; ///< Size of league (number of teams in a league) + size_t team_size ; ///< Size of team (number of threads in a team) + + KOKKOS_INLINE_FUNCTION + ParallelWorkRequest() : league_size(0), team_size(0) {} + + KOKKOS_INLINE_FUNCTION + ParallelWorkRequest( size_t s0 , size_t s1 ) : league_size(s0), team_size(s1) {} +}; + +/** \brief Execute functor in parallel with work request, + * the actual league_size and team_size may be smaller. + * + * class FunctorType { + * public: + * typedef ... device_type ; + * void operator()( device_type ) const ; + * }; + */ +template< class FunctorType > +inline +void parallel_for( const ParallelWorkRequest & request , + const FunctorType & functor ) +{ + Kokkos::Impl::ParallelFor< FunctorType , ParallelWorkRequest >( functor , request ); +} + +} // namespace Kokkos + +namespace Kokkos { + +/** \brief Parallel reduction. + * + * class FunctorType { + * public: + * typedef ... device_type ; + * typedef <podType> value_type ; // POD type + * void operator()( device_type , <podType> & ) const ; + * void init( <podType> & ) const ; + * void join( volatile <podType> & update , + * volatile const <podType> & input ) const ; + * + * typedef true_type has_final ; + * void final( <podType> & update ) const ; + * }; + * + * class FunctorType { // For array of POD value + * public: + * typedef ... 
device_type ; + * typedef <podType> value_type[] ; + * void operator()( device_type , <podType> update[] ) const ; + * void init( <podType> update[] ) const ; + * void join( volatile <podType> update[] , + * volatile const <podType> input[] ) const ; + * + * typedef true_type has_final ; + * void final( <podType> update[] ) const ; + * }; + */ +template< class FunctorType > +inline +void parallel_reduce( const Kokkos::ParallelWorkRequest & request , + const FunctorType & functor ) +{ + Impl::ParallelReduce< FunctorType , Kokkos::ParallelWorkRequest > reduce( functor , request ); +} + +template< class FunctorType > +inline +void parallel_reduce( const Kokkos::ParallelWorkRequest & request , + const FunctorType & functor , + typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result ) +{ + Impl::ParallelReduce< FunctorType , Kokkos::ParallelWorkRequest > + reduce( functor , request , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); + + reduce.wait(); // Wait for reduce to complete and output result +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class Enable = void > +struct FunctorHasJoin : public false_type {}; + +template< class FunctorType > +struct FunctorHasJoin< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::join ) >::type > + : public true_type {}; + +template< class FunctorType , class Enable = void > +struct FunctorHasFinal : public false_type {}; + +template< class FunctorType > +struct FunctorHasFinal< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::final ) >::type > + : public true_type {}; + +template< class FunctorType , class Enable = void > +struct FunctorShmemSize +{ + static inline size_t value( const FunctorType & ) { return 0 ; } +}; + +template< class FunctorType > +struct FunctorShmemSize< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type > +{ + static inline size_t value( const FunctorType & f ) { return f.shmem_size() ; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class ScalarType > +struct ReduceAdapter +{ + enum { StaticValueSize = sizeof(ScalarType) }; + + typedef ScalarType & reference_type ; + typedef ScalarType * pointer_type ; + typedef ScalarType scalar_type ; + + KOKKOS_INLINE_FUNCTION static + reference_type reference( void * p ) { return *((ScalarType*) p); } + + KOKKOS_INLINE_FUNCTION static + reference_type reference( void * p , unsigned i ) { return ((ScalarType*) p)[i]; } + + KOKKOS_INLINE_FUNCTION static + pointer_type pointer( reference_type p ) { return & p ; } + + KOKKOS_INLINE_FUNCTION static + unsigned value_count( const FunctorType & ) { return 1 ; } + + KOKKOS_INLINE_FUNCTION static + unsigned value_size( const FunctorType & ) { return sizeof(ScalarType); } + + KOKKOS_INLINE_FUNCTION static + void copy( const FunctorType & , void * const dst , const void * const src ) + { *((scalar_type*)dst) = *((const scalar_type*)src); } + + KOKKOS_INLINE_FUNCTION static + void join( const FunctorType & f , volatile void * update , volatile const void * input ) + { f.join( *((volatile ScalarType*)update) , *((volatile 
const ScalarType*)input) ); } + + template< class F > + KOKKOS_INLINE_FUNCTION static + void final( const F & f , + typename enable_if< ( is_same<F,FunctorType>::value && + FunctorHasFinal<F>::value ) + >::type * p ) + { f.final( *((ScalarType *) p ) ); } + + template< class F > + KOKKOS_INLINE_FUNCTION static + void final( const F & , + typename enable_if< ( is_same<F,FunctorType>::value && + ! FunctorHasFinal<F>::value ) + >::type * ) + {} +}; + +template< class FunctorType , class ScalarType > +struct ReduceAdapter< FunctorType , ScalarType[] > +{ + enum { StaticValueSize = 0 }; + + typedef ScalarType * reference_type ; + typedef ScalarType * pointer_type ; + typedef ScalarType scalar_type ; + + KOKKOS_INLINE_FUNCTION static + ScalarType * reference( void * p ) { return (ScalarType*) p ; } + + KOKKOS_INLINE_FUNCTION static + reference_type reference( void * p , unsigned i ) { return ((ScalarType*) p)+i; } + + KOKKOS_INLINE_FUNCTION static + pointer_type pointer( reference_type p ) { return p ; } + + KOKKOS_INLINE_FUNCTION static + unsigned value_count( const FunctorType & f ) { return f.value_count ; } + + KOKKOS_INLINE_FUNCTION static + unsigned value_size( const FunctorType & f ) { return f.value_count * sizeof(ScalarType); } + + KOKKOS_INLINE_FUNCTION static + void copy( const FunctorType & f , void * const dst , const void * const src ) + { + for ( int i = 0 ; i < int(f.value_count) ; ++i ) { + ((scalar_type*)dst)[i] = ((const scalar_type*)src)[i]; + } + } + + KOKKOS_INLINE_FUNCTION static + void join( const FunctorType & f , volatile void * update , volatile const void * input ) + { f.join( ((volatile ScalarType*)update) , ((volatile const ScalarType*)input) ); } + + template< class F > + KOKKOS_INLINE_FUNCTION static + void final( const F & f , + typename enable_if< ( is_same<F,FunctorType>::value && + FunctorHasFinal<F>::value ) + >::type * p ) + { f.final( ((ScalarType *) p ) ); } + + template< class F > + KOKKOS_INLINE_FUNCTION static + void final( const F & , + typename enable_if< ( is_same<F,FunctorType>::value && + ! FunctorHasFinal<F>::value ) + >::type * ) + {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_ParallelReduce.hpp b/lib/kokkos/core/src/Kokkos_ParallelReduce.hpp new file mode 100644 index 000000000..c6d929eb2 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_ParallelReduce.hpp @@ -0,0 +1,75 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_PARALLELREDUCE_HPP +#define KOKKOS_PARALLELREDUCE_HPP + +#include <cstddef> +#include <sstream> +#include <Kokkos_Parallel.hpp> +#include <impl/Kokkos_Error.hpp> + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +template< class FunctorType > +void vector_parallel_reduce( const size_t work_count , + const FunctorType & functor , + typename Impl::ReduceAdapter< FunctorType >::reference_type result ) + +{ + Impl::ParallelReduce< FunctorType, VectorParallel > + reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); + + reduce.wait(); +} + +//---------------------------------------------------------------------------- + +} // namespace Kokkos + +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_PARALLELREDUCE_HPP */ + diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp new file mode 100644 index 000000000..e346045ee --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -0,0 +1,240 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Serial.hpp +/// \brief Declaration and definition of Kokkos::Serial device. + +#ifndef KOKKOS_SERIAL_HPP +#define KOKKOS_SERIAL_HPP + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_Parallel.hpp> +#include <Kokkos_Layout.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/// \class Serial +/// \brief Kokkos device for non-parallel execution +/// +/// A "device" represents a parallel execution model. It tells Kokkos +/// how to parallelize the execution of kernels in a parallel_for or +/// parallel_reduce. For example, the Threads device uses Pthreads or +/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language +/// extensions, and the Cuda device uses NVIDIA's CUDA programming +/// model. The Serial device executes "parallel" kernels +/// sequentially. This is useful if you really do not want to use +/// threads, or if you want to explore different combinations of MPI +/// and shared-memory parallel programming models. +class Serial { +public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + //! The device type (same as this class). + typedef Serial device_type ; + //! The size_type typedef best suited for this device. + typedef HostSpace::size_type size_type ; + //! This device's preferred memory space. + typedef HostSpace memory_space ; + //! This device's preferred array layout. + typedef LayoutRight array_layout ; + /// \brief This device's host mirror type. + /// + /// Serial is a host device, so the host mirror type is the same as + /// the device type itself. + typedef Serial host_mirror_device_type ; + + //@} + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + /// + /// For the Serial device, this method <i>always</i> returns false, + /// because parallel_for or parallel_reduce with the Serial device + /// always execute sequentially. + inline static int in_parallel() { return false ; } + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. + * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). 
+ static bool wake(); + + /// \brief Wait until all dispatched functors complete. + /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void fence() {} + + static void initialize( unsigned threads_count = 1 , + unsigned use_numa_count = 0 , + unsigned use_cores_per_numa = 0 , + bool allow_asynchronous_threadpool = false) {} + + static int is_initialized() { return 1 ; } + + //! Free any resources being consumed by the device. + static void finalize() {} + + //! Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + inline int league_rank() const { return 0 ; } + inline int league_size() const { return 1 ; } + inline int team_rank() const { return 0 ; } + inline int team_size() const { return 1 ; } + + inline void team_barrier() {} + + inline std::pair<size_t,size_t> work_range( size_t n ) const + { return std::pair<size_t,size_t>(0,n); } + + template< typename T > + inline T * get_shmem( const int count ); + + static void * resize_reduce_scratch( const unsigned ); +}; + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +//TODO: Needs constructor for Kokkos::ParallelWorkRequest CRT + +template< class FunctorType , class WorkSpec > +class ParallelFor< FunctorType , WorkSpec , Serial > { +public: + + ParallelFor( const FunctorType & functor , const size_t work_count ) + { + for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) { + functor( iwork ); + } + } +}; + +template< class FunctorType , class WorkSpec > +class ParallelReduce< FunctorType , WorkSpec , Serial > { +public: + + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + ParallelReduce( const FunctorType & functor , + const size_t work_count , + pointer_type result = 0 ) + { + if ( 0 == result ) { + result = (pointer_type ) Serial::resize_reduce_scratch( Reduce::value_size( functor ) ); + } + + functor.init( Reduce::reference( result ) ); + + for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) { + functor( iwork , Reduce::reference( result ) ); + } + + Reduce::final( functor , result ); + } + + void wait() {} +}; + +template< class FunctorType , class WorkSpec > +class ParallelScan< FunctorType , WorkSpec , Kokkos::Serial > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + inline + ParallelScan( const FunctorType & functor , const size_t work_count ) + { + pointer_type result = (pointer_type ) Serial::resize_reduce_scratch( Reduce::value_size( functor ) ); + + functor.init( Reduce::reference( result ) ); + + for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) { + functor( iwork , Reduce::reference( result ) , true ); + } + } + + void wait() {} +}; + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +#endif /* #define KOKKOS_SERIAL_HPP */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + 
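+
+// Illustrative sketch only (the Axpy functor and its data are assumptions,
+// not part of the upstream header): since the Serial specializations above
+// are plain sequential loops, any functor whose device_type is
+// Kokkos::Serial executes in order on the calling thread.
+//
+//   struct Axpy {
+//     typedef Kokkos::Serial device_type ;
+//     Kokkos::View<double*,device_type>       y ;
+//     Kokkos::View<const double*,device_type> x ;
+//     double a ;
+//     Axpy( Kokkos::View<double*,device_type> y_ ,
+//           Kokkos::View<const double*,device_type> x_ ,
+//           double a_ ) : y(y_), x(x_), a(a_) {}
+//     KOKKOS_INLINE_FUNCTION
+//     void operator()( const size_t i ) const { y(i) += a * x(i); }
+//   };
+//
+//   Kokkos::parallel_for( y.dimension_0() , Axpy( y , x , 2.0 ) );  // body runs once per index, in order
+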
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp new file mode 100644 index 000000000..240ae28be --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Threads.hpp @@ -0,0 +1,218 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_HPP +#define KOKKOS_THREADS_HPP + +#include <cstddef> +#include <iosfwd> +#include <Kokkos_Layout.hpp> +#include <Kokkos_MemoryTraits.hpp> +#include <Kokkos_HostSpace.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { +class ThreadsExec ; +} // namespace Impl +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +/** \brief Device for a pool of Pthreads or C11 threads on a CPU. */ +class Threads { +public: + //! \name Type declarations that all Kokkos devices must provide. + //@{ + + typedef Threads device_type ; + typedef Kokkos::HostSpace memory_space ; + typedef memory_space::size_type size_type ; + typedef Kokkos::LayoutRight array_layout ; + typedef Kokkos::Threads host_mirror_device_type ; + + //@} + /*------------------------------------------------------------------------*/ + //! \name Static functions that all Kokkos devices must implement. + //@{ + + /// \brief True if and only if this method is being called in a + /// thread-parallel function. + static int in_parallel(); + + /** \brief Set the device in a "sleep" state. + * + * This function sets the device in a "sleep" state in which it is + * not ready for work. 
This may consume less resources than if the + * device were in an "awake" state, but it may also take time to + * bring the device from a sleep state to be ready for work. + * + * \return True if the device is in the "sleep" state, else false if + * the device is actively working and could not enter the "sleep" + * state. + */ + static bool sleep(); + + /// \brief Wake the device from the 'sleep' state so it is ready for work. + /// + /// \return True if the device is in the "ready" state, else "false" + /// if the device is actively working (which also means that it's + /// awake). + static bool wake(); + + /// \brief Wait until all dispatched functors complete. + /// + /// The parallel_for or parallel_reduce dispatch of a functor may + /// return asynchronously, before the functor completes. This + /// method does not return until all dispatched functors on this + /// device have completed. + static void fence(); + + /// \brief Free any resources being consumed by the device. + /// + /// For the Threads device, this terminates spawned worker threads. + static void finalize(); + + /// \brief Print configuration information to the given output stream. + static void print_configuration( std::ostream & , const bool detail = false ); + + //@} + //! \name Function for the functor device interface */ + //@{ + + KOKKOS_INLINE_FUNCTION int league_rank() const ; + KOKKOS_INLINE_FUNCTION int league_size() const ; + KOKKOS_INLINE_FUNCTION int team_rank() const ; + KOKKOS_INLINE_FUNCTION int team_size() const ; + + KOKKOS_INLINE_FUNCTION void team_barrier(); + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering. + * + * The highest rank thread can compute the reduction total as + * reduction_total = dev.team_scan( value ) + value ; + */ + template< typename Type > + KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ); + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering + * with intra-team non-deterministic ordering accumulation. + * + * The global inter-team accumulation value will, at the end of the + * league's parallel execution, be the scan's total. + * Parallel execution ordering of the league's teams is non-deterministic. + * As such the base value for each team's scan operation is similarly + * non-deterministic. + */ + template< typename TypeLocal , typename TypeGlobal > + KOKKOS_INLINE_FUNCTION TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); + + KOKKOS_INLINE_FUNCTION void * get_shmem( const int size ); + + explicit inline Threads( Impl::ThreadsExec & ); + + /**@} */ + /*------------------------------------------------------------------------*/ + //! \name Device-specific functions + //@{ + + /** \brief Initialize the device in the "ready to work" state. + * + * The device is initialized in a "ready to work" or "awake" state. + * This state reduces latency and thus improves performance when + * dispatching work. However, the "awake" state consumes resources + * even when no work is being done. You may call sleep() to put + * the device in a "sleeping" state that does not consume as many + * resources, but it will take time (latency) to awaken the device + * again (via the wake()) method so that it is ready for work. + * + * Teams of threads are distributed as evenly as possible across + * the requested number of numa regions and cores per numa region. + * A team will not be split across a numa region. + * + * If the 'use_' arguments are not supplied the hwloc is queried + * to use all available cores. 
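+ *
+ * A minimal initialization sketch (the thread count of 16 is an
+ * illustrative assumption, not a recommendation):
+ *
+ * \code
+ * Kokkos::Threads::initialize( 16 );   // spawn a pool of 16 worker threads
+ * // ... dispatch parallel_for / parallel_reduce / parallel_scan work ...
+ * Kokkos::Threads::finalize();         // terminate the spawned worker threads
+ * \endcode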
+ */ + static void initialize( unsigned threads_count = 1 , + unsigned use_numa_count = 0 , + unsigned use_cores_per_numa = 0 , + bool allow_asynchronous_threadpool = false ); + + static int is_initialized(); + + /** \brief Maximum size of a single thread team. + * + * If a parallel_{for,reduce,scan} operation requests a team_size that + * does not satisfy the condition: 0 == team_max() % team_size + * then some threads will idle. + */ + KOKKOS_INLINE_FUNCTION static unsigned team_max(); + + KOKKOS_INLINE_FUNCTION static unsigned league_max(); + + //@} + /*------------------------------------------------------------------------*/ + +private: + + friend class Impl::ThreadsExec ; + + Impl::ThreadsExec & m_exec ; +}; + +/*--------------------------------------------------------------------------*/ + +} // namespace Kokkos + +#include <Kokkos_Parallel.hpp> +#include <Threads/Kokkos_ThreadsExec.hpp> +#include <Threads/Kokkos_Threads_Parallel.hpp> + +#endif /* #define KOKKOS_THREADS_HPP */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/Kokkos_Vectorization.hpp b/lib/kokkos/core/src/Kokkos_Vectorization.hpp new file mode 100644 index 000000000..6c8dba590 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp @@ -0,0 +1,86 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Kokkos_Vectorization.hpp +/// \brief Declaration and definition of Kokkos::Vectorization interface. 
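+///
+/// Illustrative sketch only (the device, vector width, and loop variables
+/// below are assumptions): code written against this interface strides a
+/// loop by Vectorization<Device,N>::increment, which for the generic
+/// fallback defined here is simply 1.
+///
+/// \code
+/// typedef Kokkos::Vectorization< Kokkos::OpenMP , 8 > vec ;
+/// for ( int i = vec::begin() ; i < n ; i += vec::increment ) {
+///   // per-lane work; with the generic fallback, begin() == 0 and increment == 1
+/// }
+/// \endcode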
+#ifndef KOKKOS_VECTORIZATION_HPP +#define KOKKOS_VECTORIZATION_HPP + +#include <Kokkos_Macros.hpp> + +namespace Kokkos { +template<class Device, int N> +struct Vectorization { + enum {increment = 1}; + + KOKKOS_FORCEINLINE_FUNCTION + static int begin() { return 0;} + + KOKKOS_FORCEINLINE_FUNCTION + static int thread_rank(const Device &dev) { + return dev.team_rank(); + } + + KOKKOS_FORCEINLINE_FUNCTION + static int global_thread_rank(const Device &dev) { + return (dev.league_rank()*dev.team_size()+dev.team_rank()); + } + + KOKKOS_FORCEINLINE_FUNCTION + static bool is_lane_0(const Device &dev) { + return true; + } + + template<class Scalar> + KOKKOS_FORCEINLINE_FUNCTION + static Scalar reduce(const Scalar& val) { + return val; + } +}; +} + +#if defined( KOKKOS_HAVE_CUDA ) +#include <Cuda/Kokkos_Cuda_Vectorization.hpp> +#endif + +#endif diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp new file mode 100644 index 000000000..5e2753853 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -0,0 +1,1496 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEW_HPP +#define KOKKOS_VIEW_HPP + +#include <string> +#include <Kokkos_Macros.hpp> +#include <Kokkos_HostSpace.hpp> +#include <Kokkos_MemoryTraits.hpp> + +#include <impl/Kokkos_StaticAssert.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Shape.hpp> +#include <impl/Kokkos_AnalyzeShape.hpp> +#include <impl/Kokkos_ViewSupport.hpp> +#include <impl/Kokkos_ViewOffset.hpp> + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief View specialization mapping of view traits to a specialization tag */ +template< class ValueType , + class ArraySpecialize , + class ArrayLayout , + class MemorySpace , + class MemoryTraits > +struct ViewSpecialize ; + +template< class DstViewSpecialize , + class SrcViewSpecialize = void , + class Enable = void > +struct ViewAssignment ; + +template< class DstMemorySpace , class SrcMemorySpace > +struct DeepCopy ; + +} /* namespace Impl */ +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. + * + * Template argument permutations: + * - View< DataType , Device , void , void > + * - View< DataType , Device , MemoryTraits , void > + * - View< DataType , Device , void , MemoryTraits > + * - View< DataType , ArrayLayout , Device , void > + * - View< DataType , ArrayLayout , Device , MemoryTraits > + */ +template< class DataType , + class Arg1 , + class Arg2 , + class Arg3 > +class ViewTraits { +private: + + // Arg1 is either Device or Layout, both of which must have 'typedef ... array_layout'. + // If Arg1 is not Layout then Arg1 must be Device + enum { Arg1IsDevice = ! Impl::is_same< Arg1 , typename Arg1::array_layout >::value }; + enum { Arg2IsDevice = ! Arg1IsDevice }; + + // If Arg1 is device and Arg2 is not void then Arg2 is MemoryTraits. + // If Arg1 is device and Arg2 is void and Arg3 is not void then Arg3 is MemoryTraits. + // If Arg2 is device and Arg3 is not void then Arg3 is MemoryTraits. + enum { Arg2IsVoid = Impl::is_same< Arg2 , void >::value }; + enum { Arg3IsVoid = Impl::is_same< Arg3 , void >::value }; + enum { Arg2IsMemory = ! Arg2IsVoid && Arg1IsDevice && Arg3IsVoid }; + enum { Arg3IsMemory = ! 
Arg3IsVoid && ( ( Arg1IsDevice && Arg2IsVoid ) || Arg2IsDevice ) }; + + + typedef typename Arg1::array_layout ArrayLayout ; + typedef typename Impl::if_c< Arg1IsDevice , Arg1 , Arg2 >::type::device_type DeviceType ; + + typedef typename Impl::if_c< Arg2IsMemory , Arg2 , + typename Impl::if_c< Arg3IsMemory , Arg3 , MemoryManaged + >::type >::type::memory_traits MemoryTraits ; + + typedef Impl::AnalyzeShape<DataType> analysis ; + +public: + + //------------------------------------ + // Data type traits: + + typedef DataType data_type ; + typedef typename analysis::const_type const_data_type ; + typedef typename analysis::non_const_type non_const_data_type ; + + //------------------------------------ + // Array of intrinsic scalar type traits: + + typedef typename analysis::array_type array_type ; + typedef typename analysis::const_array_type const_array_type ; + typedef typename analysis::non_const_array_type non_const_array_type ; + + //------------------------------------ + // Value type traits: + + typedef typename analysis::value_type value_type ; + typedef typename analysis::const_value_type const_value_type ; + typedef typename analysis::non_const_value_type non_const_value_type ; + + //------------------------------------ + // Layout and shape traits: + + typedef typename Impl::StaticAssertSame< ArrayLayout , typename ArrayLayout ::array_layout >::type array_layout ; + + typedef typename analysis::shape shape_type ; + + enum { rank = shape_type::rank }; + enum { rank_dynamic = shape_type::rank_dynamic }; + + //------------------------------------ + // Device and memory space traits: + + typedef typename Impl::StaticAssertSame< DeviceType , typename DeviceType ::device_type >::type device_type ; + typedef typename Impl::StaticAssertSame< MemoryTraits , typename MemoryTraits::memory_traits >::type memory_traits ; + + typedef typename device_type::memory_space memory_space ; + typedef typename device_type::size_type size_type ; + + enum { is_hostspace = Impl::is_same< memory_space , HostSpace >::value }; + enum { is_managed = memory_traits::Unmanaged == 0 }; + enum { is_random_access = memory_traits::RandomAccess == 1 }; + + //------------------------------------ + // Specialization tag: + + typedef typename + Impl::ViewSpecialize< value_type + , typename analysis::specialize + , array_layout + , memory_space + , memory_traits + >::type specialize ; +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +class ViewDefault {}; + +/** \brief Default view specialization has LayoutLeft, LayoutRight, or LayoutStride. 
+ */ +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , LayoutLeft , MemorySpace , MemoryTraits > +{ typedef ViewDefault type ; }; + +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , LayoutRight , MemorySpace , MemoryTraits > +{ typedef ViewDefault type ; }; + +template< class ValueType , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , LayoutStride , MemorySpace , MemoryTraits > +{ typedef ViewDefault type ; }; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Types for compile-time detection of View usage errors */ +namespace ViewError { + +struct allocation_constructor_requires_managed {}; +struct allocation_constructor_requires_nonconst {}; +struct user_pointer_constructor_requires_unmanaged {}; +struct device_shmem_constructor_requires_unmanaged {}; + +struct scalar_operator_called_from_non_scalar_view {}; + +} /* namespace ViewError */ + +//---------------------------------------------------------------------------- + +template< class Type > +struct IsViewLabel : public Kokkos::Impl::false_type {}; + +template<> +struct IsViewLabel<std::string> : public Kokkos::Impl::true_type {}; + +template< unsigned N > +struct IsViewLabel<char[N]> : public Kokkos::Impl::true_type {}; + +//---------------------------------------------------------------------------- +/** \brief Enable view parentheses operator for + * match of layout and integral arguments. + * If correct rank define type from traits, + * otherwise define type as an error message. + */ +template< class ReturnType , class Traits , class Layout , unsigned Rank , + typename iType0 = int , typename iType1 = int , + typename iType2 = int , typename iType3 = int , + typename iType4 = int , typename iType5 = int , + typename iType6 = int , typename iType7 = int , + class Enable = void > +struct ViewEnableArrayOper ; + +template< class ReturnType , class Traits , class Layout , unsigned Rank , + typename iType0 , typename iType1 , + typename iType2 , typename iType3 , + typename iType4 , typename iType5 , + typename iType6 , typename iType7 > +struct ViewEnableArrayOper< + ReturnType , Traits , Layout , Rank , + iType0 , iType1 , iType2 , iType3 , + iType4 , iType5 , iType6 , iType7 , + typename enable_if< + iType0(0) == 0 && iType1(0) == 0 && iType2(0) == 0 && iType3(0) == 0 && + iType4(0) == 0 && iType5(0) == 0 && iType6(0) == 0 && iType7(0) == 0 && + is_same< typename Traits::array_layout , Layout >::value && + ( unsigned(Traits::rank) == Rank ) + >::type > +{ + typedef ReturnType type ; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct AllocateWithoutInitializing {}; +struct ViewWithoutManaging {}; + +namespace { +const AllocateWithoutInitializing allocate_without_initializing = AllocateWithoutInitializing(); +const ViewWithoutManaging view_without_managing = ViewWithoutManaging(); +} + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. 
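+ *
+ * A quick illustrative sketch (the names, sizes, and the OpenMP device
+ * below are assumptions, not part of the tutorial materials):
+ *
+ * \code
+ * // 1-D array of double with a run-time dimension, allocated and
+ * // default-initialized by the labeled constructor:
+ * Kokkos::View<double*, Kokkos::OpenMP> x ("x", 100);
+ * // 2-D array of int: run-time first dimension, compile-time second (3):
+ * Kokkos::View<int*[3], Kokkos::OpenMP> idx ("idx", 100);
+ * x(5)     = 2.0 ;   // element access through operator()
+ * idx(5,2) = 42 ;
+ * \endcode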
+ * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType , Device > + * - View< DataType , Device , MemoryTraits > + * - View< DataType , Device , void , MemoryTraits > + * - View< DataType , Layout , Device > + * - View< DataType , Layout , Device , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, <tt>double*</tt> + * indicates a one-dimensional array of \c double with run-time + * dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Device (required) The execution model for parallel + * operations. Examples include Threads, OpenMP, Cuda, and Serial. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * <tt>Device</tt>. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Device + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Device types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View<double*, Cuda> out, + * View<const double*, Cuda> in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View<const double*, Cuda, RandomAccess> in_rr = in; + * // ... 
do something with in_rr and out ... + * } + * \endcode + */ +template< class DataType , + class Arg1Type , /* ArrayLayout or DeviceType */ + class Arg2Type = void , /* DeviceType or MemoryTraits */ + class Arg3Type = void , /* MemoryTraits */ + class Specialize = + typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize > +class View ; + +//---------------------------------------------------------------------------- + +template< class V > +struct is_view : public Impl::false_type {}; + +template< class DataType , + class Arg1 , + class Arg2 , + class Arg3 , + class Spec > +struct is_view< View< DataType , Arg1 , Arg2 , Arg3 , Spec > > + : public Impl::true_type {}; + +//---------------------------------------------------------------------------- + +template< class DataType , + class Arg1Type , + class Arg2Type , + class Arg3Type > +class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewDefault > + : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > +{ +public: + + typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; + +private: + + // Assignment of compatible views requirement: + template< class , class , class , class , class > friend class View ; + + // Assignment of compatible subview requirement: + template< class , class , class > friend struct Impl::ViewAssignment ; + + // Dimensions, cardinality, capacity, and offset computation for + // multidimensional array view of contiguous memory. + // Inherits from Impl::Shape + typedef Impl::ViewOffset< typename traits::shape_type + , typename traits::array_layout + > offset_map_type ; + + typename traits::value_type * m_ptr_on_device ; + offset_map_type m_offset_map ; + Impl::ViewTracking< traits > m_tracking ; + +public: + + typedef View< typename traits::array_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > array_type ; + + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > non_const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type::host_mirror_device_type , + void > HostMirror ; + + //------------------------------------ + // Shape + + enum { Rank = traits::rank }; + + KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type size() const { return m_offset_map.cardinality(); } + + template< typename 
iType > + KOKKOS_INLINE_FUNCTION + typename traits::size_type dimension( const iType & i ) const + { return Impl::dimension( m_offset_map , i ); } + + //------------------------------------ + // Destructor, constructors, assignment operators: + + KOKKOS_INLINE_FUNCTION + ~View() { m_tracking.decrement( m_ptr_on_device ); } + + KOKKOS_INLINE_FUNCTION + View() : m_ptr_on_device(0) + { m_offset_map.assign(0, 0,0,0,0,0,0,0,0); } + + KOKKOS_INLINE_FUNCTION + View( const View & rhs ) : m_ptr_on_device(0) + { + (void) Impl::ViewAssignment< + typename traits::specialize , + typename traits::specialize >( *this , rhs ); + } + + KOKKOS_INLINE_FUNCTION + View & operator = ( const View & rhs ) + { + (void) Impl::ViewAssignment< + typename traits::specialize , + typename traits::specialize >( *this , rhs ); + return *this ; + } + + //------------------------------------ + // Construct or assign compatible view: + + template< class RT , class RL , class RD , class RM , class RS > + KOKKOS_INLINE_FUNCTION + View( const View<RT,RL,RD,RM,RS> & rhs ) + : m_ptr_on_device(0) + { + (void) Impl::ViewAssignment< + typename traits::specialize , RS >( *this , rhs ); + } + + template< class RT , class RL , class RD , class RM , class RS > + KOKKOS_INLINE_FUNCTION + View & operator = ( const View<RT,RL,RD,RM,RS> & rhs ) + { + (void) Impl::ViewAssignment< + typename traits::specialize , RS >( *this , rhs ); + return *this ; + } + + //------------------------------------ + // Allocation of a managed view with possible alignment padding. + // Allocation constructor enabled for managed and non-const values + + template< class LabelType > + explicit inline + View( const LabelType & label , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + typename Impl::enable_if<( + Impl::IsViewLabel< LabelType >::value && + ( ! Impl::is_const< typename traits::value_type >::value ) && + traits::is_managed ), + const size_t >::type n8 = 0 ) + : m_ptr_on_device(0) + { + typedef typename traits::memory_space memory_space_ ; + typedef typename traits::value_type value_type_ ; + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + m_offset_map.set_padding(); + + m_ptr_on_device = (value_type_ *) + memory_space_::allocate( label , + typeid(value_type_) , + sizeof(value_type_) , + m_offset_map.capacity() ); + + (void) Impl::ViewFill< View >( *this , typename traits::value_type() ); + } + + template< class LabelType > + explicit inline + View( const AllocateWithoutInitializing & , + const LabelType & label , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + typename Impl::enable_if<( + Impl::IsViewLabel< LabelType >::value && + ( ! 
Impl::is_const< typename traits::value_type >::value ) && + traits::is_managed ), + const size_t >::type n8 = 0 ) + : m_ptr_on_device(0) + { + typedef typename traits::memory_space memory_space_ ; + typedef typename traits::value_type value_type_ ; + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + m_offset_map.set_padding(); + + m_ptr_on_device = (value_type_ *) + memory_space_::allocate( label , + typeid(value_type_) , + sizeof(value_type_) , + m_offset_map.capacity() ); + } + + template< class LabelType > + explicit inline + View( const AllocateWithoutInitializing & , + const LabelType & label , + typename Impl::enable_if<( + Impl::IsViewLabel< LabelType >::value && + ( ! Impl::is_const< typename traits::value_type >::value ) && + traits::is_managed ), + const typename traits::array_layout >::type layout ) + : m_ptr_on_device(0) + { + typedef typename traits::memory_space memory_space_ ; + typedef typename traits::value_type value_type_ ; + + m_offset_map.assign( layout ); + m_offset_map.set_padding(); + + m_ptr_on_device = (value_type_ *) + memory_space_::allocate( label , + typeid(value_type_) , + sizeof(value_type_) , + m_offset_map.capacity() ); + } + + template< class LabelType > + explicit inline + View( const LabelType & label , + typename Impl::enable_if<( + Impl::IsViewLabel< LabelType >::value && + ( ! Impl::is_const< typename traits::value_type >::value ) && + traits::is_managed + ), typename traits::array_layout const & >::type layout ) + : m_ptr_on_device(0) + { + typedef typename traits::memory_space memory_space_ ; + typedef typename traits::value_type value_type_ ; + + m_offset_map.assign( layout ); + m_offset_map.set_padding(); + + m_ptr_on_device = (value_type_ *) + memory_space_::allocate( label , + typeid(value_type_) , + sizeof(value_type_) , + m_offset_map.capacity() ); + + (void) Impl::ViewFill< View >( *this , typename traits::value_type() ); + } + + //------------------------------------ + // Assign an unmanaged View from pointer, can be called in functors. + // No alignment padding is performed. + + template< typename T > + explicit inline + View( T * ptr , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + typename Impl::enable_if<( + ( Impl::is_same<T,typename traits::value_type>::value || + Impl::is_same<T,typename traits::const_value_type>::value ) + && + ( ! traits::is_managed ) + ), const size_t >::type n8 = 0 ) + : m_ptr_on_device(ptr) + { + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + m_tracking = false ; + } + + template< typename T > + explicit inline + View( T * ptr , + typename Impl::enable_if<( + ( Impl::is_same<T,typename traits::value_type>::value || + Impl::is_same<T,typename traits::const_value_type>::value ) + && + ( ! 
traits::is_managed ) + ), typename traits::array_layout const & >::type layout ) + : m_ptr_on_device(ptr) + { + m_offset_map.assign( layout ); + m_tracking = false ; + } + + explicit inline + View( const ViewWithoutManaging & , + typename traits::value_type * ptr , + const size_t n0 = 0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 , + const size_t n8 = 0 ) + : m_ptr_on_device(ptr) + { + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 ); + m_tracking = false ; + } + + explicit inline + View( const ViewWithoutManaging & + , typename traits::value_type * ptr + , typename traits::array_layout const & layout ) + : m_ptr_on_device(ptr) + { + m_offset_map.assign( layout ); + m_tracking = false ; + } + + //------------------------------------ + // Assign unmanaged View to portion of Device shared memory + + typedef Impl::if_c< ! traits::is_managed , + typename traits::device_type , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_device_shmem_constructor ; + + explicit KOKKOS_INLINE_FUNCTION + View( typename if_device_shmem_constructor::type & dev , + const unsigned n0 = 0 , + const unsigned n1 = 0 , + const unsigned n2 = 0 , + const unsigned n3 = 0 , + const unsigned n4 = 0 , + const unsigned n5 = 0 , + const unsigned n6 = 0 , + const unsigned n7 = 0 ) + : m_ptr_on_device(0) + { + typedef typename traits::value_type value_type_ ; + + enum { align = 8 }; + enum { mask = align - 1 }; + + m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + + typedef Impl::if_c< ! traits::is_managed , + value_type_ * , + Impl::ViewError::device_shmem_constructor_requires_unmanaged > + if_device_shmem_pointer ; + + // Select the first argument: + m_ptr_on_device = if_device_shmem_pointer::select( + (value_type_*) dev.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) ); + } + + static inline + unsigned shmem_size( const unsigned n0 = 0 , + const unsigned n1 = 0 , + const unsigned n2 = 0 , + const unsigned n3 = 0 , + const unsigned n4 = 0 , + const unsigned n5 = 0 , + const unsigned n6 = 0 , + const unsigned n7 = 0 ) + { + enum { align = 8 }; + enum { mask = align - 1 }; + + typedef typename traits::value_type value_type_ ; + + offset_map_type offset_map ; + + offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 ); + + return unsigned( sizeof(value_type_) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ; + } + + //------------------------------------ + // Is not allocated + + KOKKOS_FORCEINLINE_FUNCTION + bool is_null() const { return 0 == m_ptr_on_device ; } + + //------------------------------------ + // Operators for scalar (rank zero) views. 
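+  //
+  // A rank-zero View refers to a single value.  Illustrative
+  // (hypothetical) usage sketch, assuming a host-accessible
+  // memory space:
+  //
+  //   View<int, OpenMP> count ("count");
+  //   count() = 0 ;  // operator() with no arguments
+  //   *count  = 1 ;  // operator* dereference
+  //   count   = 2 ;  // scalar assignment operator defined below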
+ + typedef Impl::if_c< traits::rank == 0 , + typename traits::value_type , + Impl::ViewError::scalar_operator_called_from_non_scalar_view > + if_scalar_operator ; + + KOKKOS_INLINE_FUNCTION + const View & operator = ( const typename if_scalar_operator::type & rhs ) const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + *m_ptr_on_device = if_scalar_operator::select( rhs ); + return *this ; + } + + KOKKOS_FORCEINLINE_FUNCTION + operator typename if_scalar_operator::type & () const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + return if_scalar_operator::select( *m_ptr_on_device ); + } + + KOKKOS_FORCEINLINE_FUNCTION + typename if_scalar_operator::type & operator()() const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + return if_scalar_operator::select( *m_ptr_on_device ); + } + + KOKKOS_FORCEINLINE_FUNCTION + typename if_scalar_operator::type & operator*() const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + return if_scalar_operator::select( *m_ptr_on_device ); + } + + //------------------------------------ + // Array member access operators enabled if + // (1) a zero value of all argument types are compile-time comparable to zero + // (2) the rank matches the number of arguments + // (3) the memory space is valid for the access + //------------------------------------ + // rank 1: + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , traits, typename traits::array_layout, 1, iType0 >::type + operator[] ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , traits, typename traits::array_layout, 1, iType0 >::type + operator() ( const iType0 & i0 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ i0 ]; + } + + template< typename iType0 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , traits, typename traits::array_layout, 1, iType0 >::type + at( const iType0 & i0 , const int , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ i0 ]; + } + + // rank 2: + + template< typename iType0 , typename iType1 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 2, iType0, iType1 >::type + operator() ( const iType0 & i0 , const iType1 & i1 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1) ]; + } + + template< typename iType0 , typename iType1 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 2, iType0, iType1 >::type + at( const iType0 & i0 , const iType1 & 
i1 , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1) ]; + } + + // rank 3: + + template< typename iType0 , typename iType1 , typename iType2 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2) ]; + } + + // rank 4: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const int , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ]; + } + + // rank 5: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 5, 
iType0, iType1, iType2, iType3 , iType4 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const int , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ]; + } + + // rank 6: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 6, + iType0, iType1, iType2, iType3 , iType4, iType5 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 6, + iType0, iType1, iType2, iType3 , iType4, iType5 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const int , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ]; + } + + // rank 7: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 7, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 7, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ]; + } + + // rank 8: + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 , typename iType7 > + KOKKOS_FORCEINLINE_FUNCTION 
+ typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 8, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type + operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + } + + template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , + typename iType4 , typename iType5 , typename iType6 , typename iType7 > + KOKKOS_FORCEINLINE_FUNCTION + typename Impl::ViewEnableArrayOper< typename traits::value_type & , + traits, typename traits::array_layout, 8, + iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type + at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , + const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const + { + KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 ); + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + + return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ]; + } + + //------------------------------------ + // Access to the underlying contiguous storage of this view specialization. + // These methods are specific to specialization of a view. + + KOKKOS_FORCEINLINE_FUNCTION + typename traits::value_type * ptr_on_device() const { return m_ptr_on_device ; } + + // Stride of physical storage, dimensioned to at least Rank + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { m_offset_map.stride(s); } + + // Count of contiguously allocated data members including padding. + KOKKOS_INLINE_FUNCTION + typename traits::size_type capacity() const + { return m_offset_map.capacity(); } +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class LT , class LL , class LD , class LM , class LS , + class RT , class RL , class RD , class RM , class RS > +KOKKOS_INLINE_FUNCTION +typename Impl::enable_if<( Impl::is_same< LS , RS >::value ), bool >::type +operator == ( const View<LT,LL,LD,LM,LS> & lhs , + const View<RT,RL,RD,RM,RS> & rhs ) +{ + // Same data, layout, dimensions + typedef ViewTraits<LT,LL,LD,LM> lhs_traits ; + typedef ViewTraits<RT,RL,RD,RM> rhs_traits ; + + return + Impl::is_same< typename lhs_traits::const_data_type , + typename rhs_traits::const_data_type >::value && + Impl::is_same< typename lhs_traits::array_layout , + typename rhs_traits::array_layout >::value && + Impl::is_same< typename lhs_traits::memory_space , + typename rhs_traits::memory_space >::value && + Impl::is_same< typename lhs_traits::specialize , + typename rhs_traits::specialize >::value && + lhs.ptr_on_device() == rhs.ptr_on_device() && + lhs.shape() == rhs.shape() ; +} + +template< class LT , class LL , class LD , class LM , class LS , + class RT , class RL , class RD , class RM , class RS > +KOKKOS_INLINE_FUNCTION +bool operator != ( const View<LT,LL,LD,LM,LS> & lhs , + const View<RT,RL,RD,RM,RS> & rhs ) +{ + return ! 
operator==( lhs , rhs ); +} + +//---------------------------------------------------------------------------- + + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +//---------------------------------------------------------------------------- +/** \brief Deep copy a value into a view. + */ +template< class DT , class DL , class DD , class DM , class DS > +inline +void deep_copy( const View<DT,DL,DD,DM,DS> & dst , + typename Impl::enable_if<( + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type , + typename ViewTraits<DT,DL,DD,DM>::value_type >::value + ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value ) +{ + Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value ); +} + +template< class ST , class SL , class SD , class SM , class SS > +inline +typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type +deep_copy( ST & dst , const View<ST,SL,SD,SM,SS> & src ) +{ + typedef ViewTraits<ST,SL,SD,SM> src_traits ; + typedef typename src_traits::memory_space src_memory_space ; + Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.ptr_on_device() , sizeof(ST) ); +} + +//---------------------------------------------------------------------------- +/** \brief A deep copy between views of the same specialization, compatible type, + * same rank, same layout are handled by that specialization. + */ +template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > +inline +void deep_copy( const View<DT,DL,DD,DM,Impl::ViewDefault> & dst , + const View<ST,SL,SD,SM,Impl::ViewDefault> & src , + typename Impl::enable_if<( + Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type , + typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value + && + Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , + typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value + && + ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) == unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) ) + )>::type * = 0 ) +{ + typedef View<DT,DL,DD,DM,Impl::ViewDefault> dst_type ; + typedef View<ST,SL,SD,SM,Impl::ViewDefault> src_type ; + + typedef typename dst_type::memory_space dst_memory_space ; + typedef typename src_type::memory_space src_memory_space ; + + if ( dst.ptr_on_device() != src.ptr_on_device() ) { + + Impl::assert_shapes_are_equal( dst.shape() , src.shape() ); + + const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity(); + + Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); + } +} + + +/** \brief Deep copy equal dimension arrays in the host space which + * have different layouts or specializations. + */ +template< class DT , class DL , class DD , class DM , class DS , + class ST , class SL , class SM , class SS > +inline +void deep_copy( const View< DT, DL, DD, DM, DS > & dst , + const View< ST, SL, DD, SM, SS > & src , + const typename Impl::enable_if<( + // Destination is not constant: + Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type , + typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value + && + // Same rank + ( unsigned( View<DT,DL,DD,DM,DS>::rank ) == + unsigned( View<ST,SL,DD,SM,SS>::rank ) ) + && + // Different layout or different specialization: + ( ( ! 
Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout , + typename View<ST,SL,DD,SM,SS>::array_layout >::value ) + || + ( ! Impl::is_same< DS , SS >::value ) + ) + )>::type * = 0 ) +{ + typedef View< DT, DL, DD, DM, DS > dst_type ; + typedef View< ST, SL, DD, SM, SS > src_type ; + + assert_shapes_equal_dimension( dst.shape() , src.shape() ); + + Impl::ViewRemap< dst_type , src_type >( dst , src ); +} + +//---------------------------------------------------------------------------- + +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror( const View<T,L,D,M,S> & src ) +{ + typedef View<T,L,D,M,S> view_type ; + typedef typename view_type::HostMirror host_view_type ; + typedef typename view_type::memory_space memory_space ; + + // 'view' is managed therefore we can allocate a + // compatible host_view through the ordinary constructor. + + std::string label = memory_space::query_label( src.ptr_on_device() ); + label.append("_mirror"); + + return host_view_type( label , + src.dimension_0() , + src.dimension_1() , + src.dimension_2() , + src.dimension_3() , + src.dimension_4() , + src.dimension_5() , + src.dimension_6() , + src.dimension_7() ); +} + +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed && + Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror_view( const View<T,L,D,M,S> & src ) +{ + return src ; +} + +template< class T , class L , class D , class M , class S > +typename Impl::enable_if<( + View<T,L,D,M,S>::is_managed && + ! Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value + ), typename View<T,L,D,M,S>::HostMirror >::type +inline +create_mirror_view( const View<T,L,D,M,S> & src ) +{ + return create_mirror( src ); +} + +//---------------------------------------------------------------------------- + +/** \brief Resize a view with copying old data to new data at the corresponding indices. */ +template< class T , class L , class D , class M , class S > +inline +void resize( View<T,L,D,M,S> & v , + const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) +{ + typedef View<T,L,D,M,S> view_type ; + typedef typename view_type::memory_space memory_space ; + + const std::string label = memory_space::query_label( v.ptr_on_device() ); + + view_type v_resized( label, n0, n1, n2, n3, n4, n5, n6, n7 ); + + Impl::ViewRemap< view_type , view_type >( v_resized , v ); + + v = v_resized ; +} + +/** \brief Reallocate a view without copying old data to new data */ +template< class T , class L , class D , class M , class S > +inline +void realloc( View<T,L,D,M,S> & v , + const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 , + const size_t n1 = 0 , + const size_t n2 = 0 , + const size_t n3 = 0 , + const size_t n4 = 0 , + const size_t n5 = 0 , + const size_t n6 = 0 , + const size_t n7 = 0 ) +{ + typedef View<T,L,D,M,S> view_type ; + typedef typename view_type::memory_space memory_space ; + + // Query the current label and reuse it. 
+ const std::string label = memory_space::query_label( v.ptr_on_device() ); + + v = view_type(); // deallocate first, if the only view to memory. + v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 ); +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} }; + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst , src , arg0 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 , class ArgType2 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1, arg2 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1, arg2, arg3 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1, arg2, arg3, arg4 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1, arg2, arg3, arg4, arg5 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 , class ArgType6 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + 
const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 , + const ArgType6 & arg6 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1, arg2, arg3, arg4, arg5, arg6 ); + + return dst ; +} + +template< class DstViewType , + class T , class L , class D , class M , class S , + class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , + class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 > +KOKKOS_INLINE_FUNCTION +DstViewType +subview( const View<T,L,D,M,S> & src , + const ArgType0 & arg0 , + const ArgType1 & arg1 , + const ArgType2 & arg2 , + const ArgType3 & arg3 , + const ArgType4 & arg4 , + const ArgType5 & arg5 , + const ArgType6 & arg6 , + const ArgType7 & arg7 ) +{ + DstViewType dst ; + + Impl::ViewAssignment<typename DstViewType::specialize,S>( dst, src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 ); + + return dst ; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include <impl/Kokkos_ViewDefault.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif + diff --git a/lib/kokkos/core/src/Kokkos_hwloc.hpp b/lib/kokkos/core/src/Kokkos_hwloc.hpp new file mode 100644 index 000000000..6b8aea148 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_hwloc.hpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HWLOC_HPP +#define KOKKOS_HWLOC_HPP + +#include <utility> + +namespace Kokkos { + +/** \brief Minimal subset of logical 'hwloc' functionality available + * from http://www.open-mpi.org/projects/hwloc/. + * + * The calls are NOT thread safe in order to avoid mutexes, + * memory allocations, or other actions which could give the + * runtime system an opportunity to migrate the threads or + * touch allocated memory during the function calls. + * + * All calls to these functions should be performed by a thread + * when it has guaranteed exclusive access; e.g., for OpenMP + * within a 'critical' region. + */ +namespace hwloc { + +/** \brief Query if hwloc is available */ +bool available(); + +/** \brief Query number of available NUMA regions. + * This will be less than the hardware capacity + * if the MPI process is pinned to a NUMA region. + */ +unsigned get_available_numa_count(); + +/** \brief Query number of available cores per NUMA regions. + * This will be less than the hardware capacity + * if the MPI process is pinned to a set of cores. + */ +unsigned get_available_cores_per_numa(); + +/** \brief Query number of available "hard" threads per core; i.e., hyperthreads */ +unsigned get_available_threads_per_core(); + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Internal functions for binding persistent spawned threads. + +namespace Kokkos { +namespace hwloc { + +/** \brief Recommend mapping of threads onto cores. + * + * If thread_count == 0 then choose and set a value. + * If use_numa_count == 0 then choose and set a value. + * If use_cores_per_numa == 0 then choose and set a value. + * + * Return 0 if asynchronous, + * Return 1 if synchronous and threads_coord[0] is process core + */ +unsigned thread_mapping( const char * const label , + const bool allow_async , + unsigned & thread_count , + unsigned & use_numa_count , + unsigned & use_cores_per_numa , + std::pair<unsigned,unsigned> threads_coord[] ); + +/** \brief Query core-coordinate of the current thread + * with respect to the core_topology. + * + * As long as the thread is running within the + * process binding the following condition holds. + * + * core_coordinate.first < core_topology.first + * core_coordinate.second < core_topology.second + */ +std::pair<unsigned,unsigned> get_this_thread_coordinate(); + +/** \brief Bind the current thread to a core. */ +bool bind_this_thread( const std::pair<unsigned,unsigned> ); + +/** \brief Bind the current thread to one of the cores in the list. + * Set that entry to (~0,~0) and return the index. + * If binding fails return ~0. 
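+ *
+ *  A minimal illustrative call sequence (hypothetical coordinate
+ *  values; in practice the coordinates come from thread_mapping()
+ *  declared above):
+ *
+ *    std::pair<unsigned,unsigned> coord[2] ;
+ *    coord[0] = std::pair<unsigned,unsigned>(0,0);
+ *    coord[1] = std::pair<unsigned,unsigned>(0,1);
+ *    const unsigned entry = Kokkos::hwloc::bind_this_thread( 2 , coord );
+ *    // 'entry' is the claimed index, or ~0u if binding failed;
+ *    // on success coord[entry] has been set to (~0,~0).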
+ */ +unsigned bind_this_thread( const unsigned coordinate_count , + std::pair<unsigned,unsigned> coordinate[] ); + +/** \brief Unbind the current thread back to the original process binding */ +bool unbind_this_thread(); + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_HWLOC_HPP */ + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp new file mode 100644 index 000000000..d3a0092d7 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -0,0 +1,403 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMP_PARALLEL_HPP +#define KOKKOS_OPENMP_PARALLEL_HPP + +#include <omp.h> + +#include <Kokkos_Parallel.hpp> +#include <Kokkos_ParallelReduce.hpp> +#include <OpenMP/Kokkos_OpenMPexec.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class WorkSpec > +class ParallelFor< FunctorType , WorkSpec , ::Kokkos::OpenMP > +{ +public: + + inline + ParallelFor( const FunctorType & functor , const size_t work_count ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for"); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + const std::pair< size_t , size_t > range = exec.work_range( work_count ); + + for ( size_t iwork = range.first , work_end = range.second ; iwork < work_end ; ++iwork ) { + functor( iwork ); + } + } +/* END #pragma omp parallel */ + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class WorkSpec > +class ParallelReduce< FunctorType , WorkSpec , Kokkos::OpenMP > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + inline + ParallelReduce( const FunctorType & functor , + const size_t work_count , + pointer_type result = 0 ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + + OpenMPexec::resize_reduce_scratch( Reduce::value_size( functor ) ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + const std::pair<size_t,size_t> range = exec.work_range( work_count ); + + typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); + + functor.init( update ); + + for ( size_t iw = range.first, work_end = range.second ; iw < work_end ; ++iw ) { + functor( iw , update ); + } + } +/* END #pragma omp parallel */ + + { + const pointer_type ptr = pointer_type( OpenMPexec::get_thread_rank_rev(0)->reduce_base() ); + + for ( int i = 1 ; i < omp_get_max_threads() ; ++i ) { + functor.join( Reduce::reference( ptr ) , Reduce::reference( OpenMPexec::get_thread_rank_rev(i)->reduce_base() ) ); + } + + Reduce::final( functor , ptr ); + + if ( result ) { + const int n = Reduce::value_count( functor ); + + for ( int j = 0 ; j < n ; ++j ) { result[j] = ptr[j] ; } + } + } + } + + void wait() {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType , class WorkSpec > +class ParallelScan< FunctorType , WorkSpec , Kokkos::OpenMP > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + inline + ParallelScan( const FunctorType & functor , const size_t work_count ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan"); + 
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan"); + + OpenMPexec::resize_reduce_scratch( 2 * Reduce::value_size( functor ) ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + const std::pair<size_t,size_t> range = exec.work_range( work_count ); + + typename Reduce::reference_type update = + Reduce::reference( pointer_type( exec.reduce_base() ) + Reduce::value_count( functor ) ); + + functor.init( update ); + + for ( size_t iw = range.first , work_end = range.second ; iw < work_end ; ++iw ) { + functor( iw , update , false ); + } + } +/* END #pragma omp parallel */ + + { + const unsigned thread_count = omp_get_max_threads(); + const unsigned value_count = Reduce::value_count( functor ); + + pointer_type ptr_prev = 0 ; + + for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) { + + pointer_type ptr = pointer_type( OpenMPexec::get_thread_rank_rev(rank_rev)->reduce_base() ); + + if ( ptr_prev ) { + for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; } + functor.join( Reduce::reference( ptr + value_count ) , Reduce::reference( ptr ) ); + } + else { + functor.init( Reduce::reference( ptr ) ); + } + + ptr_prev = ptr ; + } + } + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + const std::pair<size_t,size_t> range = exec.work_range( work_count ); + + typename Reduce::reference_type update = + Reduce::reference( pointer_type( exec.reduce_base() ) ); + + for ( size_t iw = range.first , work_end = range.second ; iw < work_end ; ++iw ) { + functor( iw , update , true ); + } + } +/* END #pragma omp parallel */ + + } + + void wait() {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_USE_PRAGMA_SIMD ) + +namespace Kokkos { +namespace Impl { + +template< class FunctorType > +class ParallelReduce< FunctorType , VectorParallel , ::Kokkos::OpenMP > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + inline + ParallelReduce( const FunctorType & functor , + const size_t work_count , + pointer_type result = 0 ) + { + typedef integral_constant< size_t , OpenMPexec::VECTOR_LENGTH > vector_length ; + typedef integral_constant< size_t , OpenMPexec::VECTOR_LENGTH - 1 > vector_mask ; + + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); + + OpenMPexec::resize_reduce_scratch( Reduce::value_size( functor ) * vector_length::value ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + const pointer_type ptr = pointer_type( exec.reduce_base() ); + + const std::pair<size_t,size_t> range = exec.work_range( work_count ); + +#pragma simd +#pragma ivdep + for ( size_t iv = 0 ; iv < vector_length::value ; ++iv ) { + functor.init( Reduce::reference( ptr + iv * Reduce::value_count( functor ) ) ); + } + +#pragma simd vectorlength( vector_length::value ) +#pragma ivdep + for ( size_t iw = range.first , work_end = range.second ; iw < work_end ; ++iw ) { + functor( iw , Reduce::reference( ptr + ( iw & vector_mask::value ) * Reduce::value_count( functor ) ) ); + } + + for ( size_t iv = 1 ; iv < vector_length::value ; ++iv ) { + functor.join( Reduce::reference( ptr ) , + Reduce::reference( ptr + iv * Reduce::value_count( functor ) ) ); + } + } +/* END 
#pragma omp parallel */ + + { + const pointer_type ptr = pointer_type( OpenMPexec::get_thread_rank_rev(0)->reduce_base() ); + + for ( int i = 1 ; i < omp_get_max_threads() ; ++i ) { + functor.join( Reduce::reference( ptr ) , Reduce::reference( OpenMPexec::get_thread_rank_rev(i)->reduce_base() ) ); + } + + Reduce::final( functor , ptr ); + + if ( result ) { + const int n = Reduce::value_count( functor ); + + for ( int j = 0 ; j < n ; ++j ) { result[j] = ptr[j] ; } + } + } + } + + void wait() {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #if defined( KOKKOS_USE_PRAGMA_SIMD ) */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class FunctorType > +class ParallelFor< FunctorType , ParallelWorkRequest , ::Kokkos::OpenMP > +{ +public: + + inline + ParallelFor( const FunctorType & functor , + const ParallelWorkRequest & work ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for"); + OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for"); + + OpenMPexec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + for ( exec.team_work_init( work.league_size , work.team_size ) ; exec.team_work_avail() ; exec.team_work_next() ) { + functor( OpenMP( exec ) ); + } + } +/* END #pragma omp parallel */ + } + + void wait() {} +}; + +template< class FunctorType > +class ParallelReduce< FunctorType , ParallelWorkRequest , ::Kokkos::OpenMP > +{ +public: + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + inline + ParallelReduce( const FunctorType & functor , + const ParallelWorkRequest & work , + pointer_type result = 0 ) + { + OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); + + OpenMPexec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); + OpenMPexec::resize_reduce_scratch( Reduce::value_size( functor ) ); + +#pragma omp parallel + { + OpenMPexec & exec = * OpenMPexec::get_thread_omp(); + + typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); + + functor.init( update ); + + for ( exec.team_work_init( work.league_size , work.team_size ) ; exec.team_work_avail() ; exec.team_work_next() ) { + functor( OpenMP( exec ) , update ); + } + } +/* END #pragma omp parallel */ + + { + const pointer_type ptr = pointer_type( OpenMPexec::get_thread_rank_rev(0)->reduce_base() ); + + for ( int i = 1 ; i < omp_get_max_threads() ; ++i ) { + functor.join( Reduce::reference( ptr ) , Reduce::reference( OpenMPexec::get_thread_rank_rev(i)->reduce_base() ) ); + } + + Reduce::final( functor , ptr ); + + if ( result ) { + const int n = Reduce::value_count( functor ); + + for ( int j = 0 ; j < n ; ++j ) { result[j] = ptr[j] ; } + } + } + } + + void wait() {} +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* KOKKOS_OPENMP_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp new file mode 100644 index 000000000..2bda3855d --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp @@ -0,0 +1,425 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdio.h> +#include <limits> +#include <iostream> +#include <Kokkos_OpenMP.hpp> +#include <Kokkos_hwloc.hpp> +#include <iostream> + +namespace Kokkos { +namespace Impl { +namespace { + +KOKKOS_INLINE_FUNCTION +int kokkos_omp_in_parallel(); + +int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 ); + +KOKKOS_INLINE_FUNCTION +int kokkos_omp_in_parallel() +{ +#ifndef __CUDA_ARCH__ + return omp_in_parallel() && ! kokkos_omp_in_critical_region ; +#else + return 0; +#endif +} + +unsigned s_threads_per_core = 0 ; +unsigned s_threads_per_numa = 0 ; +bool s_using_hwloc = false; + +KOKKOS_INLINE_FUNCTION +unsigned fan_size( const unsigned rank , const unsigned size ) +{ + const unsigned rank_rev = size - ( rank + 1 ); + unsigned count = 0 ; + for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! 
( rank_rev & n ) ; n <<= 1 ) { ++count ; } + return count ; +} + +} // namespace +} // namespace Impl +} // namespace Kokkos + + +namespace Kokkos { +namespace Impl { + +OpenMPexec * OpenMPexec::m_thread[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; // Indexed by omp_get_thread_num() +OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; // Indexed by OpenMPexec::m_pool_rank + +OpenMPexec::OpenMPexec( const unsigned pool_rank ) + : m_team_base(0) + , m_alloc_reduce(0) + , m_alloc_shared(0) + , m_team_shared(0) + , m_alloc_shared_size(0) + , m_pool_rank( pool_rank ) + , m_team_shared_end(0) + , m_team_shared_iter(0) + , m_team_rank(0) + , m_team_size(0) + , m_team_fan_size(0) + , m_league_rank(0) + , m_league_end(0) + , m_league_size(0) + , m_barrier_state( OpenMPexec::Active ) + , m_scan_state( OpenMPexec::Active ) +{} + +OpenMPexec::~OpenMPexec() {} + +void OpenMPexec::team_work_init( size_t league_size , size_t team_size ) +{ + m_team_base = 0 ; + m_team_shared = 0 ; + m_team_shared_end = 0 ; + m_team_size = 0 ; + m_team_rank = 0 ; + m_team_fan_size = 0 ; + m_league_size = 0 ; + m_league_rank = 0 ; + m_league_end = 0 ; + + if ( league_size ) { + + if ( s_threads_per_numa < team_size ) { team_size = s_threads_per_numa ; } + + // Execution is using device-team interface: + + const unsigned pool_size = omp_get_num_threads(); + const unsigned team_alloc = s_threads_per_core * ( ( team_size + s_threads_per_core - 1 ) / s_threads_per_core ); + const unsigned pool_rank_rev = pool_size - ( m_pool_rank + 1 ); + const unsigned team_rank_rev = pool_rank_rev % team_alloc ; + + // May be using fewer threads per team than a multiple of threads per core, + // some threads will idle. + + if ( team_rank_rev < team_size ) { + const size_t pool_league_size = pool_size / team_alloc ; + const size_t pool_league_rank_rev = pool_rank_rev / team_alloc ; + const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 ); + + m_team_base = m_pool + team_alloc * pool_league_rank_rev ; + m_team_shared = (*m_team_base)->m_alloc_shared ; + m_team_shared_end = (*m_team_base)->m_alloc_shared_size ; + m_team_size = team_size ; + m_team_rank = team_size - ( team_rank_rev + 1 ); + m_team_fan_size = fan_size( m_team_rank , team_size ); + m_league_size = league_size ; + m_league_rank = ( league_size * pool_league_rank ) / pool_league_size ; + m_league_end = ( league_size * (pool_league_rank+1) ) / pool_league_size ; + } + } +} + + +void OpenMPexec::verify_is_process( const char * const label ) +{ + if ( omp_in_parallel() ) { + std::string msg( label ); + msg.append( " ERROR: in parallel" ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +void OpenMPexec::verify_initialized( const char * const label ) +{ + if ( 0 == m_thread[0] ) { + std::string msg( label ); + msg.append( " ERROR: not initialized" ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +void OpenMPexec::resize_reduce_scratch( size_t size ) +{ + static size_t s_size = 0 ; + + verify_initialized( "OpenMP::resize_reduce_scratch" ); + verify_is_process( "OpenMP::resize_reduce_scratch" ); + + if ( size ) { size += REDUCE_TEAM_BASE ; } + + const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; + + if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; + + if ( ( 0 == size && 0 != s_size ) || s_size < size ) { + +#pragma omp parallel + { + OpenMPexec & th = * m_thread[ omp_get_thread_num() ]; + +#pragma omp critical + { + kokkos_omp_in_critical_region = 1 ; + + if ( th.m_alloc_reduce ) { + HostSpace::decrement( 
th.m_alloc_reduce ); + th.m_alloc_reduce = 0 ; + } + + if ( size ) { + th.m_alloc_reduce = HostSpace::allocate( "openmp_reduce_scratch" , typeid(unsigned char) , 1 , size ); + } + kokkos_omp_in_critical_region = 0 ; + } +/* END #pragma omp critical */ + } +/* END #pragma omp parallel */ + } + + s_size = size ; +} + +void OpenMPexec::resize_shared_scratch( size_t size ) +{ + static size_t s_size = 0 ; + + verify_initialized( "OpenMP::resize_shared_scratch" ); + verify_is_process( "OpenMP::resize_shared_scratch" ); + + const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; + + if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; + + if ( ( 0 == size && 0 != s_size ) || s_size < size ) { + +#pragma omp parallel + { + OpenMPexec & th = * m_thread[ omp_get_thread_num() ]; + + const unsigned rank_rev = omp_get_num_threads() - ( th.m_pool_rank + 1 ); + + if ( ! ( rank_rev % s_threads_per_core ) ) { +#pragma omp critical + { + kokkos_omp_in_critical_region = 1 ; + + if ( th.m_alloc_shared ) { + HostSpace::decrement( th.m_alloc_shared ); + th.m_alloc_shared = 0 ; + } + + if ( size ) { + th.m_alloc_shared = HostSpace::allocate( "openmp_shared_scratch" , typeid(unsigned char) , 1 , size ); + th.m_alloc_shared_size = size ; + } + + kokkos_omp_in_critical_region = 0 ; + } +/* END #pragma omp critical */ + } + } +/* END #pragma omp parallel */ + } + + s_size = size ; +} + + +KOKKOS_FUNCTION +void * OpenMPexec::get_shmem( const int size ) +{ +#ifndef __CUDA_ARCH__ + // m_shared_iter is in bytes, convert to integer offsets + const int offset = m_team_shared_iter >> power_of_two<sizeof(int)>::value ; + + m_team_shared_iter += size ; + + if ( m_team_shared_end < m_team_shared_iter ) { + Kokkos::Impl::throw_runtime_exception( std::string("OpenMPexec::get_shmem FAILED : exceeded shared memory size" ) ); + } + + return ((int*)m_team_shared) + offset ; +#else + return NULL; +#endif +} + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +KOKKOS_FUNCTION +unsigned OpenMP::league_max() +{ +#ifndef __CUDA_ARCH__ + Impl::OpenMPexec::verify_initialized("Kokkos::OpenMP::league_max" ); + Impl::OpenMPexec::verify_is_process("Kokkos::OpenMP::league_max" ); + + return unsigned( std::numeric_limits<int>::max() ); +#else + return 0; +#endif +} + +KOKKOS_FUNCTION +unsigned OpenMP::team_max() +{ +#ifndef __CUDA_ARCH__ + Impl::OpenMPexec::verify_initialized("Kokkos::OpenMP::team_max" ); + Impl::OpenMPexec::verify_is_process("Kokkos::OpenMP::team_max" ); + + return Impl::s_threads_per_numa ; +#else + return 0; +#endif +} + +//---------------------------------------------------------------------------- + +int OpenMP::is_initialized() +{ return 0 != Impl::OpenMPexec::m_thread[0]; } + +void OpenMP::initialize( unsigned thread_count , + unsigned use_numa_count , + unsigned use_cores_per_numa ) +{ + if(thread_count==0) thread_count = omp_get_max_threads(); + const bool is_initialized = 0 != Impl::OpenMPexec::m_thread[0] ; + + bool thread_spawn_failed = false ; + + if ( ! 
is_initialized ) { + + Impl::s_using_hwloc = hwloc::available() && (use_cores_per_numa > 0); + + std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ]; + + if(Impl::s_using_hwloc) + hwloc::thread_mapping( "Kokkos::OpenMP::initialize" , + false /* do not allow asynchronous */ , + thread_count , + use_numa_count , + use_cores_per_numa , + threads_coord ); + + // Spawn threads: + + omp_set_num_threads( thread_count ); + + // Verify OMP interaction: + if ( int(thread_count) != omp_get_max_threads() ) { + thread_spawn_failed = true ; + } + + // Verify spawning and bind threads: +#pragma omp parallel + { +#pragma omp critical + { + if ( int(thread_count) != omp_get_num_threads() ) { + thread_spawn_failed = true ; + } + + // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region. + // Call to 'new' may not be thread safe as well. + + // Reverse the rank for threads so that the scan operation reduces to the highest rank thread. + + const unsigned omp_rank = omp_get_thread_num(); + const unsigned thread_r = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ; + const unsigned thread_rank = thread_count - ( thread_r + 1 ); + + Impl::OpenMPexec::m_thread[ omp_rank ] = new Impl::OpenMPexec( thread_rank ); + + Impl::OpenMPexec::m_pool[ thread_r ] = Impl::OpenMPexec::m_thread[ omp_rank ] ; + } +/* END #pragma omp critical */ + } +/* END #pragma omp parallel */ + + if ( ! thread_spawn_failed ) { + Impl::s_threads_per_numa = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count; + Impl::s_threads_per_core = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1; + + Impl::OpenMPexec::resize_reduce_scratch( 4096 - Impl::OpenMPexec::REDUCE_TEAM_BASE ); + Impl::OpenMPexec::resize_shared_scratch( 4096 ); + } + } + + if ( is_initialized || thread_spawn_failed ) { + std::string msg("Kokkos::OpenMP::initialize ERROR"); + + if ( is_initialized ) { msg.append(" : already initialized"); } + if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); } + + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +//---------------------------------------------------------------------------- + +void OpenMP::finalize() +{ + Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" ); + Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" ); + + Impl::OpenMPexec::resize_reduce_scratch(0); + Impl::OpenMPexec::resize_shared_scratch(0); + + for ( int i = 0 ; i < Impl::OpenMPexec::MAX_THREAD_COUNT ; ++i ) { + if ( Impl::OpenMPexec::m_thread[i] ) { delete Impl::OpenMPexec::m_thread[i] ; } + Impl::OpenMPexec::m_thread[i] = 0 ; + } + for ( int i = 0 ; i < Impl::OpenMPexec::MAX_THREAD_COUNT ; ++i ) { + Impl::OpenMPexec::m_pool[i] = 0 ; + } + + omp_set_num_threads(0); + + if(Impl::s_using_hwloc) + hwloc::unbind_this_thread(); +} + +} // namespace Kokkos + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp new file mode 100644 index 000000000..aa569cb02 --- /dev/null +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp @@ -0,0 +1,310 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_OPENMPEXEC_HPP +#define KOKKOS_OPENMPEXEC_HPP + +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_spinwait.hpp> + +#include <Kokkos_Atomic.hpp> + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief Data for OpenMP thread execution */ + +class OpenMPexec { +public: + + // Fan array has log_2(NT) reduction threads plus 2 scan threads + // Currently limited to 16k threads. 
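+  // (With MAX_FAN_COUNT = 16 and two of those entries reserved for the scan
+  //  rendezvous, the pool is capped at 1 << ( 16 - 2 ) = 16384 threads.)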
+ enum { MAX_FAN_COUNT = 16 }; + enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) }; + enum { VECTOR_LENGTH = 8 }; + enum { REDUCE_TEAM_BASE = 512 }; + + /** \brief Thread states for team synchronization */ + enum { Active , Rendezvous , ReductionAvailable , ScanAvailable }; + +private: + + friend class Kokkos::OpenMP ; + + OpenMPexec * const * m_team_base ; + + void * m_alloc_reduce ; ///< Reduction memory + void * m_alloc_shared ; ///< Shared memory + void * m_team_shared ; ///< Shared memory + int m_alloc_shared_size ; + int const m_pool_rank ; + int m_team_shared_end ; + int m_team_shared_iter ; + int m_team_rank ; + int m_team_size ; + int m_team_fan_size ; + int m_league_rank ; + int m_league_end ; + int m_league_size ; + + int volatile m_barrier_state ; + int volatile m_scan_state ; + + static OpenMPexec * m_thread[ MAX_THREAD_COUNT ]; // Indexed by 'omp_get_thread_num()' + static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by 'm_pool_rank' + + OpenMPexec(); + OpenMPexec( const OpenMPexec & ); + OpenMPexec & operator = ( const OpenMPexec & ); + +public: + + void * reduce_team() const { return m_alloc_reduce ; } + void * reduce_base() const { return ((unsigned char *)m_alloc_reduce) + REDUCE_TEAM_BASE ; } + + ~OpenMPexec(); + + explicit + OpenMPexec( const unsigned pool_rank ); + + static void finalize(); + + static void initialize( const unsigned team_count , + const unsigned threads_per_team , + const unsigned numa_count , + const unsigned cores_per_numa ); + + static void verify_is_process( const char * const ); + static void verify_initialized( const char * const ); + + static void resize_reduce_scratch( size_t ); + static void resize_shared_scratch( size_t ); + + inline static + OpenMPexec * get_thread_omp() { return m_thread[ omp_get_thread_num() ]; } + + inline static + OpenMPexec * get_thread_rank_rev( const int rank_rev ) { return m_pool[ rank_rev ]; } + + //---------------------------------------------------------------------- + /** \brief Compute a range of work for this thread's rank */ + + inline + std::pair< size_t , size_t > + work_range( const size_t work_count ) const + { + typedef integral_constant< size_t , VECTOR_LENGTH - 1 > work_mask ; + + const size_t thread_size = omp_get_num_threads(); + + // work per thread rounded up and aligned to vector length: + + const size_t work_per_thread = + ( ( ( work_count + thread_size - 1 ) / thread_size ) + work_mask::value ) & ~(work_mask::value); + + const size_t work_begin = std::min( work_count , work_per_thread * m_pool_rank ); + const size_t work_end = std::min( work_count , work_per_thread + work_begin ); + + return std::pair< size_t , size_t >( work_begin , work_end ); + } + + //---------------------------------------------------------------------- + + KOKKOS_FUNCTION + void * get_shmem( const int ); + + KOKKOS_INLINE_FUNCTION + void team_barrier() + { + if(m_team_size==1) return; + const int rank_rev = m_team_size - ( m_team_rank + 1 ); + + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + Impl::spinwait( m_team_base[ rank_rev + (1<<i) ]->m_barrier_state , OpenMPexec::Active ); + } + if ( rank_rev ) { + m_barrier_state = Rendezvous ; + Impl::spinwait( m_barrier_state , OpenMPexec::Rendezvous ); + } + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + m_team_base[ rank_rev + (1<<i) ]->m_barrier_state = OpenMPexec::Active ; + } + } + + template< class ArgType > + KOKKOS_INLINE_FUNCTION + ArgType team_scan( const ArgType & value , ArgType * const global_accum = 0 ) + { + // Sequence of m_scan_state states: + // 
0) Active : entry and exit state + // 1) ReductionAvailable : reduction value available, waiting for scan value + // 2) ScanAvailable : reduction value available, scan value available + // 3) Rendezvous : broadcasting global inter-team accumulation value + + // Make sure there is enough scratch space: + typedef typename if_c< 2 * sizeof(ArgType) < REDUCE_TEAM_BASE , ArgType , void >::type type ; + + const int rank_rev = m_team_size - ( m_team_rank + 1 ); + + type * const work_value = (type*) reduce_team(); + + // OpenMPexec::Active == m_scan_state + + work_value[0] = value ; + memory_fence(); + + // Fan-in reduction, wait for source thread to complete it's fan-in reduction. + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + OpenMPexec & th = *m_team_base[ rank_rev + (1<<i) ]; + + // Wait for source thread to exit Active state. + Impl::spinwait( th.m_scan_state , OpenMPexec::Active ); + // Source thread is 'ReductionAvailable' or 'ScanAvailable' + work_value[0] += ((volatile type*)th.reduce_team())[0]; + memory_fence(); + } + + work_value[1] = work_value[0] ; + memory_fence(); + + if ( rank_rev ) { + + m_scan_state = OpenMPexec::ReductionAvailable ; // Reduction value is available. + + // Wait for contributing threads' scan value to be available. + if ( ( 1 << m_team_fan_size ) < ( m_team_rank + 1 ) ) { + OpenMPexec & th = *m_team_base[ rank_rev + ( 1 << m_team_fan_size ) ]; + + // Wait: Active -> ReductionAvailable + Impl::spinwait( th.m_scan_state , OpenMPexec::Active ); + // Wait: ReductionAvailable -> ScanAvailable: + Impl::spinwait( th.m_scan_state , OpenMPexec::ReductionAvailable ); + + work_value[1] += ((volatile type*)th.reduce_team())[1] ; + memory_fence(); + } + + m_scan_state = OpenMPexec::ScanAvailable ; // Scan value is available. + } + else { + // Root thread add team's total to global inter-team accumulation + work_value[0] = global_accum ? 
atomic_fetch_add( global_accum , work_value[0] ) : 0 ; + } + + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + OpenMPexec & th = *m_team_base[ rank_rev + (1<<i) ]; + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait( th.m_scan_state , OpenMPexec::ReductionAvailable ); + // Wait: ScanAvailable -> Rendezvous + Impl::spinwait( th.m_scan_state , OpenMPexec::ScanAvailable ); + } + + // All fan-in threads are in the ScanAvailable state + if ( rank_rev ) { + m_scan_state = OpenMPexec::Rendezvous ; + Impl::spinwait( m_scan_state , OpenMPexec::Rendezvous ); + } + + // Broadcast global inter-team accumulation value + volatile type & global_val = work_value[0] ; + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + OpenMPexec & th = *m_team_base[ rank_rev + (1<<i) ]; + ((volatile type*)th.reduce_team())[0] = global_val ; + memory_fence(); + th.m_scan_state = OpenMPexec::Active ; + } + // Exclusive scan, subtract contributed value + return global_val + work_value[1] - value ; + } + + void team_work_init( size_t league_size , size_t team_size ); + + inline + bool team_work_avail() + { m_team_shared_iter = 0 ; return m_league_rank < m_league_end ; } + + inline + void team_work_next() + { if ( ++m_league_rank < m_league_end ) team_barrier(); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +inline OpenMP::OpenMP( Impl::OpenMPexec & e ) : m_exec(e) {} + +KOKKOS_INLINE_FUNCTION +int OpenMP::league_rank() const { return m_exec.m_league_rank ; } +KOKKOS_INLINE_FUNCTION +int OpenMP::league_size() const { return m_exec.m_league_size ; } +KOKKOS_INLINE_FUNCTION +int OpenMP::team_rank() const { return m_exec.m_team_rank ; } +KOKKOS_INLINE_FUNCTION +int OpenMP::team_size() const { return m_exec.m_team_size ; } + +KOKKOS_INLINE_FUNCTION +void OpenMP::team_barrier() { m_exec.team_barrier() ; } + +KOKKOS_INLINE_FUNCTION +void * OpenMP::get_shmem( const int size ) { return m_exec.get_shmem(size) ; } + +template< typename Type > +KOKKOS_INLINE_FUNCTION +Type OpenMP::team_scan( const Type & value ) +{ return m_exec.team_scan( value ); } + +template< typename TypeLocal , typename TypeGlobal > +KOKKOS_INLINE_FUNCTION +TypeGlobal OpenMP::team_scan( const TypeLocal & value , TypeGlobal * const global_accum ) +{ return m_exec.template team_scan< TypeGlobal >( value , global_accum ); } + +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */ + diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp new file mode 100644 index 000000000..786f5deb0 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -0,0 +1,854 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <limits> +#include <utility> +#include <iostream> +#include <Kokkos_Threads.hpp> +#include <Kokkos_hwloc.hpp> +#include <Kokkos_Atomic.hpp> + +#include <stdint.h> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { +namespace { + +ThreadsExec s_threads_process ; +ThreadsExec * s_threads_exec[ ThreadsExec::MAX_THREAD_COUNT ]; +std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ]; + +unsigned s_threads_count = 0 ; +unsigned s_threads_per_numa = 0 ; +unsigned s_threads_per_core = 0 ; + +unsigned s_current_reduce_size = 0 ; +unsigned s_current_shared_size = 0 ; +unsigned s_current_team_alloc = 0 ; +unsigned s_current_team_size = 0 ; +unsigned s_current_league_size = 0 ; + +void (* volatile s_current_function)( ThreadsExec & , const void * ); +const void * volatile s_current_function_arg = 0 ; + +struct Sentinel { + Sentinel() + { + HostSpace::register_in_parallel( ThreadsExec::in_parallel ); + } + + ~Sentinel() + { + if ( s_threads_count || + s_threads_per_numa || + s_threads_per_core || + s_current_reduce_size || + s_current_shared_size || + s_current_function || + s_current_function_arg || + s_threads_exec[0] ) { + std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ; + } + } +}; + +inline +unsigned fan_size( const unsigned rank , const unsigned size ) +{ + const unsigned rank_rev = size - ( rank + 1 ); + unsigned count = 0 ; + for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! 
( rank_rev & n ) ; n <<= 1 ) { ++count ; } + return count ; +} + +} // namespace +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void execute_function_noop( ThreadsExec & , const void * ) {} + +void ThreadsExec::driver(void) +{ + ThreadsExec this_thread ; + + while ( ThreadsExec::Active == this_thread.m_pool_state ) { + + this_thread.set_team_relations(); + + (*s_current_function)( this_thread , s_current_function_arg ); + + // Deactivate thread and wait for reactivation + this_thread.m_pool_state = ThreadsExec::Inactive ; + + wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive ); + } +} + +void ThreadsExec::set_team_relations() +{ + m_team_base = 0 ; + m_team_shared = 0 ; + m_team_shared_end = 0 ; + m_team_size = 0 ; + m_team_rank = 0 ; + m_team_fan_size = 0 ; + m_league_size = 0 ; + m_league_rank = 0 ; + m_league_end = 0 ; + + const size_t league_size = s_current_league_size ; + + if ( league_size ) { + // Execution is using device-team interface: + + const unsigned team_alloc = s_current_team_alloc ; + const unsigned team_size = s_current_team_size ; + const unsigned pool_rank_rev = m_pool_size - ( m_pool_rank + 1 ); + const unsigned team_rank_rev = pool_rank_rev % team_alloc ; + + // May be using fewer threads per team than a multiple of threads per core, + // some threads will idle. + + if ( team_rank_rev < team_size ) { + const size_t pool_league_size = m_pool_size / team_alloc ; + const size_t pool_league_rank_rev = pool_rank_rev / team_alloc ; + const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 ); + + m_team_base = m_pool_base + team_alloc * pool_league_rank_rev ; + m_team_shared = (*m_team_base)->m_alloc_shared ; + m_team_shared_end = s_current_shared_size ; + m_team_size = team_size ; + m_team_rank = team_size - ( team_rank_rev + 1 ); + m_team_fan_size = fan_size( m_team_rank , team_size ); + m_league_size = league_size ; + m_league_rank = ( league_size * pool_league_rank ) / pool_league_size ; + m_league_end = ( league_size * (pool_league_rank+1) ) / pool_league_size ; + } + } +} + +ThreadsExec::ThreadsExec() + : m_pool_base(0) + , m_team_base(0) + , m_alloc_reduce(0) + , m_alloc_shared(0) + , m_team_shared(0) + , m_team_shared_end(0) + , m_team_shared_iter(0) + + , m_pool_rank(0) + , m_pool_size(0) + , m_pool_fan_size(0) + + , m_team_rank(0) + , m_team_size(0) + , m_team_fan_size(0) + + , m_league_rank(0) + , m_league_end(0) + , m_league_size(0) + + , m_pool_state( ThreadsExec::Terminating ) + , m_team_state( ThreadsExec::Inactive ) +{ + if ( & s_threads_process != this ) { + + // A spawned thread + + ThreadsExec * const nil = 0 ; + + // Which entry in 's_threads_exec', possibly determined from hwloc binding + const unsigned entry = ((size_t)s_current_function_arg) < s_threads_count + ? 
((size_t)s_current_function_arg) + : size_t(Kokkos::hwloc::bind_this_thread( s_threads_count , s_threads_coord )); + + // Given a good entry set this thread in the 's_threads_exec' array + if ( entry < s_threads_count && + nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) { + + m_pool_base = s_threads_exec ; + m_pool_rank = s_threads_count - ( entry + 1 ); + m_pool_size = s_threads_count ; + m_pool_fan_size = fan_size( m_pool_rank , m_pool_size ); + m_pool_state = ThreadsExec::Active ; + + // Inform spawning process that the threads_exec entry has been set. + s_threads_process.m_pool_state = ThreadsExec::Active ; + } + else { + // Inform spawning process that the threads_exec entry could not be set. + s_threads_process.m_pool_state = ThreadsExec::Terminating ; + } + } + else { + // Enables 'parallel_for' to execute on unitialized Threads device + m_pool_rank = 0 ; + m_pool_size = 1 ; + m_pool_state = ThreadsExec::Inactive ; + } +} + +ThreadsExec::~ThreadsExec() +{ + const unsigned entry = m_pool_size - ( m_pool_rank + 1 ); + + m_pool_base = 0 ; + m_team_base = 0 ; + + m_alloc_reduce = 0 ; + m_alloc_shared = 0 ; + m_team_shared = 0 ; + m_team_shared_end = 0 ; + m_team_shared_iter = 0 ; + + m_pool_rank = 0 ; + m_pool_size = 0 ; + m_pool_fan_size = 0 ; + m_team_rank = 0 ; + m_team_size = 0 ; + m_team_fan_size = 0 ; + m_league_rank = 0 ; + m_league_end = 0 ; + m_league_size = 0 ; + + m_pool_state = ThreadsExec::Terminating ; + m_team_state = ThreadsExec::Inactive ; + + if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) { + ThreadsExec * const nil = 0 ; + + atomic_compare_exchange( s_threads_exec + entry , this , nil ); + + s_threads_process.m_pool_state = ThreadsExec::Terminating ; + } +} + + +int ThreadsExec::get_thread_count() +{ + return s_threads_count ; +} + +ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank ) +{ + ThreadsExec * const th = + unsigned(init_thread_rank) < s_threads_count + ? 
s_threads_exec[ s_threads_count - ( init_thread_rank + 1 ) ] : 0 ; + + if ( 0 == th || th->m_pool_rank != init_thread_rank ) { + std::ostringstream msg ; + msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + << "thread " << init_thread_rank << " of " << s_threads_count ; + if ( 0 == th ) { + msg << " does not exist" ; + } + else { + msg << " has wrong thread_rank " << th->m_pool_rank ; + } + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + return th ; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::execute_get_binding( ThreadsExec & exec , const void * ) +{ + s_threads_coord[ exec.m_pool_rank ] = Kokkos::hwloc::get_this_thread_coordinate(); +} + +void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * ) +{ + ThreadsExec::global_lock(); + ThreadsExec::global_unlock(); + + const int n = exec.m_pool_fan_size ; + const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 ); + + for ( int i = 0 ; i < n ; ++i ) { + Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active ); + } + + exec.m_pool_state = ThreadsExec::Inactive ; +} + +void ThreadsExec::execute_reduce_resize( ThreadsExec & exec , const void * ) +{ + if ( exec.m_alloc_reduce ) { + HostSpace::decrement( exec.m_alloc_reduce ); + exec.m_alloc_reduce = 0 ; + } + + if ( s_current_reduce_size ) { + + exec.m_alloc_reduce = + HostSpace::allocate( "reduce_scratch_space" , typeid(unsigned char) , 1 , s_current_reduce_size ); + + // Guaranteed multiple of 'unsigned' + + unsigned * ptr = (unsigned *)( exec.m_alloc_reduce ); + unsigned * const end = ptr + s_current_reduce_size / sizeof(unsigned); + + // touch on this thread + while ( ptr < end ) *ptr++ = 0 ; + } +} + +void ThreadsExec::execute_shared_resize( ThreadsExec & exec , const void * ) +{ + // First thread pinned to a core allocates shared memory + const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 ); + + if ( ! ( rank_rev % s_threads_per_core ) ) { + + if ( exec.m_alloc_shared ) { + HostSpace::decrement( exec.m_alloc_shared ); + exec.m_alloc_shared = 0 ; + } + + if ( s_current_shared_size ) { + exec.m_alloc_shared = + HostSpace::allocate( "shared_scratch_space" , typeid(unsigned char) , 1 , s_current_shared_size ); + + // Guaranteed multiple of 'unsigned' + + unsigned * ptr = (unsigned *)( exec.m_alloc_shared ); + unsigned * const end = ptr + s_current_shared_size / sizeof(unsigned); + + // touch on this thread + while ( ptr < end ) *ptr++ = 0 ; + } + } + else { + exec.m_alloc_shared = 0 ; + } +} + +void * ThreadsExec::get_shmem( const int size ) +{ + // m_team_shared_iter is in bytes, convert to integer offsets + const int offset = m_team_shared_iter >> power_of_two<sizeof(int)>::value ; + + m_team_shared_iter += size ; + + if ( m_team_shared_end < m_team_shared_iter ) { + Kokkos::Impl::throw_runtime_exception( std::string("ThreadsExec::get_shmem FAILED : exceeded shared memory size" ) ); + } + + return ((int*)m_team_shared) + offset ; +} + +} +} + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void ThreadsExec::verify_is_process( const std::string & name , const bool initialized ) +{ + if ( ! is_process() ) { + std::string msg( name ); + msg.append( " FAILED : Called by a worker thread, can only be called by the master process." 
); + Kokkos::Impl::throw_runtime_exception( msg ); + } + + if ( initialized && 0 == s_threads_count ) { + std::string msg( name ); + msg.append( " FAILED : Threads not initialized." ); + Kokkos::Impl::throw_runtime_exception( msg ); + } +} + +int ThreadsExec::in_parallel() +{ + // A thread function is in execution and + // the function argument is not the special threads process argument and + // the master process is a worker or is not the master process. + return s_current_function && + ( & s_threads_process != s_current_function_arg ) && + ( s_threads_process.m_pool_base || ! is_process() ); +} + +// Wait for root thread to become inactive +void ThreadsExec::fence() +{ + if ( s_threads_count ) { + // Wait for the root thread to complete: + Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active ); + } + + s_current_function = 0 ; + s_current_function_arg = 0 ; + s_current_team_size = 0 ; + s_current_team_alloc = 0 ; + s_current_league_size = 0 ; +} + +/** \brief Begin execution of the asynchronous functor */ +void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg , + int work_league_size , + int work_team_size ) +{ + const bool work_spec = work_league_size || work_team_size ; + + verify_is_process("ThreadsExec::start" , work_spec ); + + if ( s_current_function || s_current_function_arg ) { + Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) ); + } + + if ( work_spec ) { + s_current_team_size = work_team_size ? std::min( s_threads_per_numa , unsigned(work_team_size) ) : s_threads_per_numa ; + s_current_team_alloc = s_threads_per_core * ( ( s_current_team_size + s_threads_per_core - 1 ) / s_threads_per_core ); + s_current_league_size = work_league_size ; + } + + s_current_function = func ; + s_current_function_arg = arg ; + + // Activate threads: + for ( int i = s_threads_count ; 0 < i-- ; ) { + s_threads_exec[i]->m_pool_state = ThreadsExec::Active ; + } + + if ( s_threads_process.m_pool_size ) { + // Master process is the root thread, run it: + s_threads_process.set_team_relations(); + (*func)( s_threads_process , arg ); + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } +} + +//---------------------------------------------------------------------------- + +bool ThreadsExec::sleep() +{ + verify_is_process("ThreadsExec::sleep", true ); + + if ( & execute_sleep == s_current_function ) return false ; + + fence(); + + ThreadsExec::global_lock(); + + s_current_function = & execute_sleep ; + + // Activate threads: + for ( unsigned i = s_threads_count ; 0 < i ; ) { + s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ; + } + + return true ; +} + +bool ThreadsExec::wake() +{ + verify_is_process("ThreadsExec::wake", true ); + + if ( & execute_sleep != s_current_function ) return false ; + + ThreadsExec::global_unlock(); + + if ( s_threads_process.m_pool_base ) { + execute_sleep( s_threads_process , 0 ); + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } + + fence(); + + return true ; +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) ) +{ + s_current_function = func ; + s_current_function_arg = & s_threads_process ; + + const unsigned begin = s_threads_process.m_pool_base ? 
1 : 0 ; + + for ( unsigned i = s_threads_count ; begin < i ; ) { + ThreadsExec & th = * s_threads_exec[ --i ]; + + th.m_pool_state = ThreadsExec::Active ; + + wait_yield( th.m_pool_state , ThreadsExec::Active ); + } + + if ( s_threads_process.m_pool_base ) { + s_threads_process.m_pool_state = ThreadsExec::Active ; + (*func)( s_threads_process , 0 ); + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } + + s_current_function_arg = 0 ; + s_current_function = 0 ; +} + +//---------------------------------------------------------------------------- + +void * ThreadsExec::root_reduce_scratch() +{ + return s_threads_process.reduce_base(); +} + +void ThreadsExec::resize_reduce_scratch( size_t size ) +{ + fence(); + + if ( size ) { size += REDUCE_TEAM_BASE ; } + + const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; + + if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; + + if ( ( s_current_reduce_size < size ) || + ( 0 == size && s_current_reduce_size ) ) { + + verify_is_process( "ThreadsExec::resize_reduce_scratch" , true ); + + s_current_reduce_size = size ; + + execute_serial( & execute_reduce_resize ); + + s_threads_process.m_alloc_reduce = s_threads_exec[0]->m_alloc_reduce ; + } +} + +void ThreadsExec::resize_shared_scratch( size_t size ) +{ + fence(); + + const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; + + if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; + + if ( s_current_shared_size < size || ( 0 == size && s_current_shared_size ) ) { + + verify_is_process( "ThreadsExec::resize_shared_scratch" , true ); + + s_current_shared_size = size ; + + execute_serial( & execute_shared_resize ); + } +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::print_configuration( std::ostream & s , const bool detail ) +{ + verify_is_process("ThreadsExec::print_configuration",false); + + fence(); + + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); + + // Forestall compiler warnings for unused variables. + (void) numa_count; + (void) cores_per_numa; + (void) threads_per_core; + + s << "Kokkos::Threads" ; + +#if defined( KOKKOS_HAVE_PTHREAD ) + s << " KOKKOS_HAVE_PTHREAD" ; +#endif +#if defined( KOKKOS_HAVE_HWLOC ) + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ; +#endif + + if ( s_threads_count ) { + s << " threads[" << s_threads_count << "]" + << " threads_per_numa[" << s_threads_per_numa << "]" + << " threads_per_core[" << s_threads_per_core << "]" + ; + if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; } + s << " ReduceScratch[" << s_current_reduce_size << "]" + << " SharedScratch[" << s_current_shared_size << "]" ; + s << std::endl ; + + if ( detail ) { + + execute_serial( & execute_get_binding ); + + for ( unsigned i = 0 ; i < s_threads_count ; ++i ) { + ThreadsExec * const th = s_threads_exec[i] ; + s << " Thread hwloc(" + << s_threads_coord[i].first << "." 
+ << s_threads_coord[i].second << ")" ; + + s_threads_coord[i].first = ~0u ; + s_threads_coord[i].second = ~0u ; + + if ( th ) { + const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 ); + + s << " rank(" << th->m_pool_rank << ")" ; + + if ( th->m_pool_fan_size ) { + s << " Fan{" ; + for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) { + s << " " << th->m_pool_base[rank_rev+(1<<j)]->m_pool_rank ; + } + s << " }" ; + } + + if ( th->m_team_base && th->m_team_size ) { + s << " Team[ " << th->m_team_base[0]->m_pool_rank + << " .. " << th->m_team_base[ th->m_team_size - 1 ]->m_pool_rank + << " ]" ; + } + + if ( th == & s_threads_process ) { + s << " is_process" ; + } + } + s << std::endl ; + } + } + } + else { + s << " not initialized" << std::endl ; + } +} + +//---------------------------------------------------------------------------- + +int ThreadsExec::league_max() +{ return std::numeric_limits<int>::max(); } + +int ThreadsExec::team_max() +{ return s_threads_per_numa ; } + +//---------------------------------------------------------------------------- + +int ThreadsExec::is_initialized() +{ return 0 != s_threads_exec[0] ; } + +void ThreadsExec::initialize( unsigned thread_count , + unsigned use_numa_count , + unsigned use_cores_per_numa , + bool allow_asynchronous_threadpool ) +{ + static const Sentinel sentinel ; + + const bool is_initialized = 0 != s_threads_count ; + + unsigned thread_spawn_failed = 0 ; + + if ( ! is_initialized ) { + + // If thread_count, use_numa_count, or use_cores_per_numa are zero + // then they will be given default values based upon hwloc detection + // and allowed asynchronous execution. + + const bool hwloc_avail = hwloc::available(); + + const unsigned thread_spawn_begin = + hwloc::thread_mapping( "Kokkos::Threads::initialize" , + allow_asynchronous_threadpool , + thread_count , + use_numa_count , + use_cores_per_numa , + s_threads_coord ); + + const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ; + + if ( thread_spawn_begin ) { + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. + s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u); + } + + s_threads_count = thread_count ; + s_threads_per_numa = s_threads_count / use_numa_count ; + s_threads_per_core = s_threads_per_numa / use_cores_per_numa ; + s_current_function = & execute_function_noop ; // Initialization work function + + for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { + + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + + // If hwloc available then spawned thread will + // choose its own entry in 's_threads_coord' + // otherwise specify the entry. + s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith ); + + // Spawn thread executing the 'driver()' function. + // Wait until spawned thread has attempted to initialize. + // If spawning and initialization is successfull then + // an entry in 's_threads_exec' will be assigned. + if ( ThreadsExec::spawn() ) { + wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); + } + if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ; + } + + // Wait for all spawned threads to deactivate before zeroing the function. + + for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) { + // Try to protect against cache coherency failure by casting to volatile. 
+ ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ; + if ( th ) { + wait_yield( th->m_pool_state , ThreadsExec::Active ); + } + else { + ++thread_spawn_failed ; + } + } + + s_current_function = 0 ; + s_current_function_arg = 0 ; + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + + if ( ! thread_spawn_failed ) { + // Bind process to the core on which it was located before spawning occured + Kokkos::hwloc::bind_this_thread( proc_coord ); + + if ( thread_spawn_begin ) { // Include process in pool. + s_threads_exec[0] = & s_threads_process ; + s_threads_process.m_pool_base = s_threads_exec ; + s_threads_process.m_pool_rank = thread_count - 1 ; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count ; + s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size ); + } + else { + s_threads_process.m_pool_base = 0 ; + s_threads_process.m_pool_rank = 0 ; + s_threads_process.m_pool_size = 0 ; + s_threads_process.m_pool_fan_size = 0 ; + } + + // Initial allocations: + ThreadsExec::resize_reduce_scratch( 4096 - REDUCE_TEAM_BASE ); + ThreadsExec::resize_shared_scratch( 4096 ); + } + else { + s_threads_count = 0 ; + s_threads_per_numa = 0 ; + s_threads_per_core = 0 ; + } + } + + if ( is_initialized || thread_spawn_failed ) { + + std::ostringstream msg ; + + msg << "Kokkos::Threads::initialize ERROR" ; + + if ( is_initialized ) { + msg << " : already initialized" ; + } + if ( thread_spawn_failed ) { + msg << " : failed to spawn " << thread_spawn_failed << " threads" ; + } + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } +} + +//---------------------------------------------------------------------------- + +void ThreadsExec::finalize() +{ + verify_is_process("ThreadsExec::finalize",false); + + fence(); + + resize_reduce_scratch(0); + resize_shared_scratch(0); + + const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ; + + for ( unsigned i = s_threads_count ; begin < i-- ; ) { + + if ( s_threads_exec[i] ) { + + s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ; + + wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive ); + + s_threads_process.m_pool_state = ThreadsExec::Inactive ; + } + } + + if ( s_threads_process.m_pool_base ) { + ( & s_threads_process )->~ThreadsExec(); + s_threads_exec[0] = 0 ; + } + + Kokkos::hwloc::unbind_this_thread(); + + s_threads_count = 0 ; + s_threads_per_numa = 0 ; + s_threads_per_core = 0 ; + + // Reset master thread to run solo. + s_threads_process.m_pool_base = 0 ; + s_threads_process.m_pool_rank = 0 ; + s_threads_process.m_pool_size = 1 ; + s_threads_process.m_pool_fan_size = 0 ; + s_threads_process.m_pool_state = ThreadsExec::Inactive ; +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + + diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp new file mode 100644 index 000000000..e6259646a --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -0,0 +1,598 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADSEXEC_HPP +#define KOKKOS_THREADSEXEC_HPP + +#include <stdio.h> + +#include <utility> +#include <impl/Kokkos_spinwait.hpp> + +#include <Kokkos_Atomic.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< class > struct ThreadsExecAdapter ; + +//---------------------------------------------------------------------------- + +class ThreadsExec { +public: + + // Fan array has log_2(NT) reduction threads plus 2 scan threads + // Currently limited to 16k threads. + enum { MAX_FAN_COUNT = 16 }; + enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) }; + enum { VECTOR_LENGTH = 8 }; + + /** \brief States of a worker thread */ + enum { Terminating ///< Termination in progress + , Inactive ///< Exists, waiting for work + , Active ///< Exists, performing work + , Rendezvous ///< Exists, waiting in a barrier or reduce + + , ScanCompleted + , ScanAvailable + , ReductionAvailable + }; + +private: + + friend class Kokkos::Threads ; + + // Fan-in operations' root is the highest ranking thread + // to place the 'scan' reduction intermediate values on + // the threads that need them. + // For a simple reduction the thread location is arbitrary. 
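+  //
+  // For example, in an 8-thread pool ( rank_rev = pool_size - ( pool_rank + 1 ) )
+  // the odd rank_rev entries 1, 3, 5 and 7 have no fan-in sources, rank_rev 2
+  // waits on 3, rank_rev 6 waits on 7, rank_rev 4 waits on 5 and 6, and
+  // rank_rev 0 (the highest pool rank) waits on 1, 2 and 4, so the highest
+  // ranking thread ends up holding the fully joined value.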
+ + /** \brief Reduction memory reserved for team reductions */ + enum { REDUCE_TEAM_BASE = 512 }; + + ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in + ThreadsExec * const * m_team_base ; ///< Base for team fan-in + + void * m_alloc_reduce ; ///< Reduction allocated memory + void * m_alloc_shared ; ///< Team-shared allocated memory + void * m_team_shared ; ///< Team-shared memory + + int m_team_shared_end ; ///< End of team-shared memory + int m_team_shared_iter ; ///< Current offset for team-shared memory + + int m_pool_rank ; + int m_pool_size ; + int m_pool_fan_size ; + + int m_team_rank ; + int m_team_size ; + int m_team_fan_size ; + + int m_league_rank ; + int m_league_end ; + int m_league_size ; + + int volatile m_pool_state ; ///< State for global synchronizations + int volatile m_team_state ; ///< State for team synchronizations + + static void global_lock(); + static void global_unlock(); + static bool spawn(); + + static void execute_sleep( ThreadsExec & , const void * ); + static void execute_reduce_resize( ThreadsExec & , const void * ); + static void execute_shared_resize( ThreadsExec & , const void * ); + static void execute_get_binding( ThreadsExec & , const void * ); + + ThreadsExec( const ThreadsExec & ); + ThreadsExec & operator = ( const ThreadsExec & ); + + static void execute_serial( void (*)( ThreadsExec & , const void * ) ); + + inline void * reduce_team() const { return m_alloc_reduce ; } + +public: + + static int get_thread_count(); + static ThreadsExec * get_thread( const int init_thread_rank ); + + inline void * reduce_base() const { return ((unsigned char *) m_alloc_reduce) + REDUCE_TEAM_BASE ; } + + static void driver(void); + + void set_team_relations(); + + ~ThreadsExec(); + ThreadsExec(); + + static void resize_reduce_scratch( size_t ); + static void resize_shared_scratch( size_t ); + + static void * root_reduce_scratch(); + + static bool is_process(); + + static void verify_is_process( const std::string & , const bool initialized ); + + static int is_initialized(); + + static void initialize( unsigned thread_count , + unsigned use_numa_count , + unsigned use_cores_per_numa , + bool allow_asynchronous_threadpool ); + + static void finalize(); + + /* Given a requested team size, return valid team size */ + static unsigned team_size_valid( unsigned ); + + static void print_configuration( std::ostream & , const bool detail = false ); + + //------------------------------------ + + static void wait_yield( volatile int & , const int ); + + //------------------------------------ + // All-thread functions: + + inline + std::pair< size_t , size_t > + work_range( const size_t work_count ) const + { + typedef integral_constant< size_t , VECTOR_LENGTH - 1 > work_mask ; + + // work per thread rounded up and aligned to vector length: + + const size_t work_per_thread = + ( ( ( work_count + m_pool_size - 1 ) / m_pool_size ) + work_mask::value ) & ~(work_mask::value); + + const size_t work_begin = std::min( work_count , work_per_thread * m_pool_rank ); + const size_t work_end = std::min( work_count , work_per_thread + work_begin ); + + return std::pair< size_t , size_t >( work_begin , work_end ); + } + + template< class Functor > + inline + void fan_in_reduce( const Functor & f ) const + { + typedef ReduceAdapter< Functor > Reduce ; + + const int rank_rev = m_pool_size - ( m_pool_rank + 1 ); + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + + ThreadsExec & fan = *m_pool_base[ rank_rev + ( 1 << i ) ] ; + + Impl::spinwait( fan.m_pool_state , 
ThreadsExec::Active ); + + f.join( Reduce::reference( reduce_base() ) , + Reduce::reference( fan.reduce_base() ) ); + } + } + + inline + void fan_in() const + { + const int rank_rev = m_pool_size - ( m_pool_rank + 1 ); + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + Impl::spinwait( m_pool_base[rank_rev+(1<<i)]->m_pool_state , ThreadsExec::Active ); + } + } + + template< class FunctorType > + inline + void scan_large( const FunctorType & f ) + + { + // Sequence of states: + // 0) Active : entry and exit state + // 1) ReductionAvailable : reduction value available + // 2) ScanAvailable : inclusive scan value available + // 3) Rendezvous : All threads inclusive scan value are available + // 4) ScanCompleted : exclusive scan value copied + + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::scalar_type scalar_type ; + + const int rank_rev = m_pool_size - ( m_pool_rank + 1 ); + const unsigned count = Reduce::value_count( f ); + + scalar_type * const work_value = (scalar_type *) reduce_base(); + + //-------------------------------- + // Fan-in reduction with highest ranking thread as the root + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + ThreadsExec & fan = *m_pool_base[ rank_rev + (1<<i) ]; + + // Wait: Active -> ReductionAvailable (or ScanAvailable) + Impl::spinwait( fan.m_pool_state , ThreadsExec::Active ); + f.join( Reduce::reference( work_value ) , Reduce::reference( fan.reduce_base() ) ); + } + + // Copy reduction value to scan value before releasing from this phase. + for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; } + + if ( rank_rev ) { + + // Set: Active -> ReductionAvailable + m_pool_state = ThreadsExec::ReductionAvailable ; + + // Wait for contributing threads' scan value to be available. + if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) { + ThreadsExec & th = *m_pool_base[ rank_rev + ( 1 << m_pool_fan_size ) ] ; + + // Wait: Active -> ReductionAvailable + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait( th.m_pool_state , ThreadsExec::Active ); + Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable ); + + f.join( Reduce::reference( work_value + count ) , + Reduce::reference( ((scalar_type *)th.reduce_base()) + count ) ); + } + + // This thread has completed inclusive scan + // Set: ReductionAvailable -> ScanAvailable + m_pool_state = ThreadsExec::ScanAvailable ; + + // Wait for all threads to complete inclusive scan + // Wait: ScanAvailable -> Rendezvous + Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable ); + } + + //-------------------------------- + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + ThreadsExec & fan = *m_pool_base[ rank_rev + (1<<i) ]; + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable ); + // Set: ScanAvailable -> Rendezvous + fan.m_pool_state = ThreadsExec::Rendezvous ; + } + + // All threads have completed the inclusive scan. + // All non-root threads are in the Rendezvous state. + // Threads are free to overwrite their reduction value. 
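+    //
+    // The remaining steps convert the inclusive scan into an exclusive scan.
+    // For example, per-thread contributions of {3,1,4,1} in pool-rank order
+    // have inclusive scan values {3,4,8,9}; each thread is handed back the
+    // exclusive prefix {0,3,4,8}.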
+ //-------------------------------- + + if ( ( rank_rev + 1 ) < m_pool_size ) { + // Exclusive scan: copy the previous thread's inclusive scan value + + ThreadsExec & th = *m_pool_base[ rank_rev + 1 ] ; // Not the root thread + + const scalar_type * const src_value = ((scalar_type *)th.reduce_base()) + count ; + + for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; } + } + else { + f.init( Reduce::reference( work_value ) ); + } + + //-------------------------------- + // Wait for all threads to copy previous thread's inclusive scan value + // Wait for all threads: Rendezvous -> ScanCompleted + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + Impl::spinwait( m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous ); + } + if ( rank_rev ) { + // Set: ScanAvailable -> ScanCompleted + m_pool_state = ThreadsExec::ScanCompleted ; + // Wait: ScanCompleted -> Active + Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted ); + } + // Set: ScanCompleted -> Active + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + m_pool_base[ rank_rev + (1<<i) ]->m_pool_state = ThreadsExec::Active ; + } + } + + template< class FunctorType > + inline + void scan_small( const FunctorType & f ) + { + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::scalar_type scalar_type ; + + const int rank_rev = m_pool_size - ( m_pool_rank + 1 ); + const unsigned count = Reduce::value_count( f ); + + scalar_type * const work_value = (scalar_type *) reduce_base(); + + //-------------------------------- + // Fan-in reduction with highest ranking thread as the root + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + // Wait: Active -> Rendezvous + Impl::spinwait( m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active ); + } + + for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; } + + if ( rank_rev ) { + m_pool_state = ThreadsExec::Rendezvous ; + // Wait: Rendezvous -> Active + Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous ); + } + else { + // Root thread does the thread-scan before releasing threads + + scalar_type * ptr_prev = 0 ; + + for ( int rank = 0 ; rank < m_pool_size ; ++rank ) { + scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_base(); + if ( rank ) { + for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; } + f.join( Reduce::reference( ptr + count ), Reduce::reference( ptr ) ); + } + else { + f.init( Reduce::reference( ptr ) ); + } + ptr_prev = ptr ; + } + } + + for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { + m_pool_base[ rank_rev + (1<<i) ]->m_pool_state = ThreadsExec::Active ; + } + } + + //------------------------------------ + // Team-only functions: + + void * get_shmem( const int size ); + + KOKKOS_INLINE_FUNCTION void team_barrier() + { + const int rank_rev = m_team_size - ( m_team_rank + 1 ); + + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + Impl::spinwait( m_team_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active ); + } + if ( rank_rev ) { + m_pool_state = Rendezvous ; + Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous ); + } + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + m_team_base[ rank_rev + (1<<i) ]->m_pool_state = ThreadsExec::Active ; + } + } + + template< class ArgType > + KOKKOS_INLINE_FUNCTION + ArgType team_scan( const ArgType & value , ArgType * const global_accum = 0 ) + { + // Sequence of m_team_state states: + // 0) Inactive : entry and exit state + // 1) ReductionAvailable : reduction value available, waiting for 
scan value + // 2) ScanAvailable : reduction value available, scan value available + // 3) Rendezvous : broadcasting global inter-team accumulation value + + // Make sure there is enough scratch space: + typedef typename if_c< 2 * sizeof(ArgType) < REDUCE_TEAM_BASE , ArgType , void >::type type ; + + const int rank_rev = m_team_size - ( m_team_rank + 1 ); + + type * const work_value = (type*) reduce_team(); + + // ThreadsExec::Inactive == m_team_state + + work_value[0] = value ; + memory_fence(); + + // Fan-in reduction, wait for source thread to complete it's fan-in reduction. + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + ThreadsExec & th = *m_team_base[ rank_rev + (1<<i) ]; + + // Wait for source thread to exit Inactive state. + Impl::spinwait( th.m_team_state , ThreadsExec::Inactive ); + // Source thread is 'ReductionAvailable' or 'ScanAvailable' + work_value[0] += ((volatile type*)th.reduce_team())[0]; + memory_fence(); + } + + work_value[1] = work_value[0] ; + memory_fence(); + + if ( rank_rev ) { + + m_team_state = ThreadsExec::ReductionAvailable ; // Reduction value is available. + + // Wait for contributing threads' scan value to be available. + if ( ( 1 << m_team_fan_size ) < ( m_team_rank + 1 ) ) { + ThreadsExec & th = *m_team_base[ rank_rev + ( 1 << m_team_fan_size ) ]; + + // Wait: Inactive -> ReductionAvailable + Impl::spinwait( th.m_team_state , ThreadsExec::Inactive ); + // Wait: ReductionAvailable -> ScanAvailable: + Impl::spinwait( th.m_team_state , ThreadsExec::ReductionAvailable ); + + work_value[1] += ((volatile type*)th.reduce_team())[1] ; + memory_fence(); + } + + m_team_state = ThreadsExec::ScanAvailable ; // Scan value is available. + } + else { + // Root thread add team's total to global inter-team accumulation + work_value[0] = global_accum ? atomic_fetch_add( global_accum , work_value[0] ) : 0 ; + } + + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + ThreadsExec & th = *m_team_base[ rank_rev + (1<<i) ]; + // Wait: ReductionAvailable -> ScanAvailable + Impl::spinwait( th.m_team_state , ThreadsExec::ReductionAvailable ); + // Wait: ScanAvailable -> Rendezvous + Impl::spinwait( th.m_team_state , ThreadsExec::ScanAvailable ); + } + + // All fan-in threads are in the ScanAvailable state + if ( rank_rev ) { + m_team_state = ThreadsExec::Rendezvous ; + Impl::spinwait( m_team_state , ThreadsExec::Rendezvous ); + } + + // Broadcast global inter-team accumulation value + volatile type & global_val = work_value[0] ; + for ( int i = 0 ; i < m_team_fan_size ; ++i ) { + ThreadsExec & th = *m_team_base[ rank_rev + (1<<i) ]; + ((volatile type*)th.reduce_team())[0] = global_val ; + memory_fence(); + th.m_team_state = ThreadsExec::Inactive ; + } + // Exclusive scan, subtract contributed value + return global_val + work_value[1] - value ; + } + + /* When a functor using the 'device' interface requests + * more teams than are initialized the parallel operation + * must loop over a range of league ranks with a team_barrier + * between each iteration. + */ + bool team_work_avail() + { m_team_shared_iter = 0 ; return m_league_rank < m_league_end ; } + + void team_work_next() + { if ( ++m_league_rank < m_league_end ) team_barrier(); } + + //------------------------------------ + /** \brief Wait for previous asynchronous functor to + * complete and release the Threads device. + * Acquire the Threads device and start this functor. 
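+   *
+   *  For example, the ParallelFor< FunctorType , WorkSpec , Kokkos::Threads >
+   *  specialization below calls ThreadsExec::start( & ParallelFor::execute , this )
+   *  and then ThreadsExec::fence() to wait for the asynchronous work to complete.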
+ */ + static void start( void (*)( ThreadsExec & , const void * ) , const void * , + int work_league_size = 0 , + int work_team_size = 0 ); + + static int league_max(); + static int team_max(); + + static int in_parallel(); + static void fence(); + static bool sleep(); + static bool wake(); +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +inline int Threads::in_parallel() +{ return Impl::ThreadsExec::in_parallel(); } + +inline int Threads::is_initialized() +{ return Impl::ThreadsExec::is_initialized(); } + +inline void Threads::initialize( + unsigned threads_count , + unsigned use_numa_count , + unsigned use_cores_per_numa , + bool allow_asynchronous_threadpool ) +{ + Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool ); +} + +inline void Threads::finalize() +{ + Impl::ThreadsExec::finalize(); +} + +inline void Threads::print_configuration( std::ostream & s , const bool detail ) +{ + Impl::ThreadsExec::print_configuration( s , detail ); +} + +KOKKOS_INLINE_FUNCTION unsigned Threads::league_max() +{ return Impl::ThreadsExec::league_max() ; } + +KOKKOS_INLINE_FUNCTION unsigned Threads::team_max() +{ return Impl::ThreadsExec::team_max() ; } + +inline bool Threads::sleep() +{ return Impl::ThreadsExec::sleep() ; } + +inline bool Threads::wake() +{ return Impl::ThreadsExec::wake() ; } + +inline void Threads::fence() +{ Impl::ThreadsExec::fence() ; } + +KOKKOS_INLINE_FUNCTION int Threads::league_rank() const +{ return m_exec.m_league_rank ; } + +KOKKOS_INLINE_FUNCTION int Threads::league_size() const +{ return m_exec.m_league_size ; } + +KOKKOS_INLINE_FUNCTION int Threads::team_rank() const +{ return m_exec.m_team_rank ; } + +KOKKOS_INLINE_FUNCTION int Threads::team_size() const +{ return m_exec.m_team_size ; } + +KOKKOS_INLINE_FUNCTION void Threads::team_barrier() +{ return m_exec.team_barrier(); } + +inline Threads::Threads( Impl::ThreadsExec & t ) : m_exec( t ) {} + +template< typename Type > +KOKKOS_INLINE_FUNCTION Type Threads::team_scan( const Type & value ) +{ return m_exec.team_scan( value ); } + +template< typename TypeLocal , typename TypeGlobal > +KOKKOS_INLINE_FUNCTION TypeGlobal Threads::team_scan( const TypeLocal & value , TypeGlobal * const global_accum ) +{ return m_exec.template team_scan< TypeGlobal >( value , global_accum ); } + +KOKKOS_INLINE_FUNCTION +void * Threads::get_shmem( const int size ) { return m_exec.get_shmem( size ); } + +} /* namespace Kokkos */ + +#endif /* #define KOKKOS_THREADSEXEC_HPP */ + diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp new file mode 100644 index 000000000..db3299f46 --- /dev/null +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp @@ -0,0 +1,263 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <cstdlib> +#include <string> +#include <iostream> +#include <stdexcept> + +#include <KokkosCore_config.h> +#include <Kokkos_Threads.hpp> + +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_HAVE_PTHREAD ) + +/* Standard 'C' Linux libraries */ + +#include <pthread.h> +#include <sched.h> +#include <errno.h> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +namespace { + +pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ; + +// Pthreads compatible driver. +// Recovery from an exception would require constant intra-thread health +// verification; which would negatively impact runtime. As such simply +// abort the process. + +void * internal_pthread_driver( void * ) +{ + try { + ThreadsExec::driver(); + } + catch( const std::exception & x ) { + std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ; + std::cerr.flush(); + std::abort(); + } + catch( ... 
) {
+    std::cerr << "Exception thrown from worker thread" << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+  return NULL ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+// Spawn a thread
+
+bool ThreadsExec::spawn()
+{
+  bool result = false ;
+
+  pthread_attr_t attr ;
+
+  if ( 0 == pthread_attr_init( & attr ) &&
+       0 == pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ) &&
+       0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
+
+    pthread_t pt ;
+
+    result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
+  }
+
+  pthread_attr_destroy( & attr );
+
+  return result ;
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::is_process()
+{
+  static const pthread_t master_pid = pthread_self();
+
+  return pthread_equal( master_pid , pthread_self() );
+}
+
+void ThreadsExec::global_lock()
+{
+  pthread_mutex_lock( & host_internal_pthread_mutex );
+}
+
+void ThreadsExec::global_unlock()
+{
+  pthread_mutex_unlock( & host_internal_pthread_mutex );
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { sched_yield(); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_HAVE_WINTHREAD )
+
+/* Windows libraries */
+#include <windows.h>
+#include <process.h>
+
+//----------------------------------------------------------------------------
+// Driver for each created thread
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+unsigned WINAPI internal_winthread_driver( void * arg )
+{
+  ThreadsExec::driver();
+
+  return 0 ;
+}
+
+class ThreadLockWindows {
+private:
+  CRITICAL_SECTION m_handle ;
+
+  ~ThreadLockWindows()
+  { DeleteCriticalSection( & m_handle ); }
+
+  ThreadLockWindows()
+  { InitializeCriticalSection( & m_handle ); }
+
+  ThreadLockWindows( const ThreadLockWindows & );
+  ThreadLockWindows & operator = ( const ThreadLockWindows & );
+
+public:
+
+  static ThreadLockWindows & singleton();
+
+  void lock()
+  { EnterCriticalSection( & m_handle ); }
+
+  void unlock()
+  { LeaveCriticalSection( & m_handle ); }
+};
+
+ThreadLockWindows & ThreadLockWindows::singleton()
+{ static ThreadLockWindows self ; return self ; }
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Spawn this thread
+
+bool ThreadsExec::spawn()
+{
+  unsigned Win32ThreadID = 0 ;
+
+  HANDLE handle = (HANDLE)
+    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID );
+
+  return 0 != handle ;
+}
+
+bool ThreadsExec::is_process() { return true ; }
+
+void ThreadsExec::global_lock()
+{ ThreadLockWindows::singleton().lock(); }
+
+void ThreadsExec::global_unlock()
+{ ThreadLockWindows::singleton().unlock(); }
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { Sleep(0); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* NO Threads */
+
+namespace Kokkos {
+namespace Impl {
+
+bool ThreadsExec::spawn()
+{
+  std::string msg("Kokkos::Threads ERROR : Attempting to spawn threads without configuring with a threading library. Try configuring with KOKKOS_HAVE_PTHREAD");
+  throw std::runtime_error( msg );
+
+  return false ;
+}
+
+bool ThreadsExec::is_process() { return true ; }
+void ThreadsExec::global_lock() {}
+void ThreadsExec::global_unlock() {}
+void ThreadsExec::wait_yield( volatile int & , const int ) {}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* End thread model */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
new file mode 100644
index 000000000..4de785b1d
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@@ -0,0 +1,422 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos: Manycore Performance-Portable Multidimensional Arrays
+// Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H.
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_THREADS_PARALLEL_HPP +#define KOKKOS_THREADS_PARALLEL_HPP + +#include <vector> + +#include <Kokkos_Parallel.hpp> +#include <Kokkos_ParallelReduce.hpp> + +#include <impl/Kokkos_StaticAssert.hpp> + +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< class FunctorType , class WorkSpec > +class ParallelFor< FunctorType , WorkSpec , Kokkos::Threads > +{ +public: + + const FunctorType m_func ; + const size_t m_work ; + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + const std::pair<size_t,size_t> work = exec.work_range( self.m_work ); + + for ( size_t iwork = work.first, work_end = work.second ; iwork < work_end ; ++iwork ) { + self.m_func( iwork ); + } + + exec.fan_in(); + } + + ParallelFor( const FunctorType & functor , const size_t work ) + : m_func( functor ), m_work( work ) + { + ThreadsExec::start( & ParallelFor::execute , this ); + ThreadsExec::fence(); + } + + inline void wait() {} + + inline ~ParallelFor() { wait(); } +}; + +template< class FunctorType > +class ParallelFor< FunctorType , ParallelWorkRequest , Kokkos::Threads > +{ +public: + + const FunctorType m_func ; + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelFor & self = * ((const ParallelFor *) arg ); + + for ( ; exec.team_work_avail() ; exec.team_work_next() ) { + self.m_func( Threads( exec ) ); + } + + exec.fan_in(); + } + + ParallelFor( const FunctorType & functor , const ParallelWorkRequest & work ) + : m_func( functor ) + { + ThreadsExec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); + ThreadsExec::start( & ParallelFor::execute , this , work.league_size , work.team_size ); + ThreadsExec::fence(); + } + + inline void wait() {} + + inline ~ParallelFor() { wait(); } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +template< class FunctorType , class WorkSpec > +class ParallelReduce< FunctorType , WorkSpec , Kokkos::Threads > +{ +public: + + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + const FunctorType m_func ; + const size_t m_work ; + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + + typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); + + self.m_func.init( update ); // Initialize thread-local value + + const std::pair<size_t,size_t> work = exec.work_range( self.m_work ); + + for ( size_t iwork = work.first, work_end = work.second ; iwork < work_end ; ++iwork ) { + self.m_func( iwork , update ); + } + + exec.fan_in_reduce( self.m_func ); + } + + ParallelReduce( const FunctorType & functor , + const size_t work , + const pointer_type result_ptr = 0 ) + : m_func( functor ), m_work( work ) + { + ThreadsExec::resize_reduce_scratch( Reduce::value_size( m_func ) ); + + ThreadsExec::start( & ParallelReduce::execute , this ); + + const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); + + ThreadsExec::fence(); + + Reduce::final( m_func , data ); + + if ( result_ptr 
) { + const unsigned n = Reduce::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result_ptr[i] = data[i]; } + } + } + + inline void wait() {} + + inline ~ParallelReduce() { wait(); } +}; + +template< class FunctorType > +class ParallelReduce< FunctorType , ParallelWorkRequest , Kokkos::Threads > +{ +public: + + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + const FunctorType m_func ; + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelReduce & self = * ((const ParallelReduce *) arg ); + + typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); + + self.m_func.init( update ); // Initialize thread-local value + + for ( ; exec.team_work_avail() ; exec.team_work_next() ) { + self.m_func( Threads( exec ) , update ); + } + + exec.fan_in_reduce( self.m_func ); + } + + ParallelReduce( const FunctorType & functor , + const ParallelWorkRequest & work , + const pointer_type result_ptr = 0 ) + : m_func( functor ) + { + ThreadsExec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); + ThreadsExec::resize_reduce_scratch( Reduce::value_size( m_func ) ); + + ThreadsExec::start( & ParallelReduce::execute , this , work.league_size , work.team_size ); + + const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); + + ThreadsExec::fence(); + + Reduce::final( m_func , data ); + + if ( result_ptr ) { + const unsigned n = Reduce::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result_ptr[i] = data[i]; } + } + } + + inline void wait() {} + + inline ~ParallelReduce() { wait(); } +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +struct ThreadsExecUseScanSmall { + size_t nwork ; + operator size_t () const { return nwork ; } + ThreadsExecUseScanSmall( size_t n ) : nwork( n ) {} +}; + +template< class FunctorType , class WorkSpec > +class ParallelScan< FunctorType , WorkSpec , Kokkos::Threads > +{ +public: + + typedef ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + const FunctorType m_func ; + const size_t m_work ; + + static void execute( ThreadsExec & exec , const void * arg ) + { + const ParallelScan & self = * ((const ParallelScan *) arg ); + + const std::pair<size_t,size_t> work = exec.work_range( self.m_work ); + + typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); + + self.m_func.init( update ); + + for ( size_t iwork = work.first, work_end = work.second ; iwork < work_end ; ++iwork ) { + self.m_func( iwork , update , false ); + } + + // Compile time selection of scan algorithm to support unit testing + // of both large and small thread count algorithms. + if ( ! 
is_same< WorkSpec , ThreadsExecUseScanSmall >::value ) { + exec.scan_large( self.m_func ); + } + else { + exec.scan_small( self.m_func ); + } + + for ( size_t iwork = work.first, work_end = work.second ; iwork < work_end ; ++iwork ) { + self.m_func( iwork , update , true ); + } + + exec.fan_in(); + } + + ParallelScan( const FunctorType & functor , const size_t nwork ) + : m_func( functor ) + , m_work( nwork ) + { + ThreadsExec::resize_reduce_scratch( 2 * Reduce::value_size( m_func ) ); + ThreadsExec::start( & ParallelScan::execute , this ); + ThreadsExec::fence(); + } + + inline void wait() {} + + inline ~ParallelScan() { wait(); } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template<> +class MultiFunctorParallelReduce< Threads > { +private: + + struct MemberBase { + virtual void init( Impl::ThreadsExec & ) const = 0 ; + virtual void exec( Impl::ThreadsExec & ) const = 0 ; + virtual void fan_in_reduce( Impl::ThreadsExec & ) const = 0 ; + virtual void output( void * ) const = 0 ; + virtual ~MemberBase() {} + }; + + template< class FunctorType > + struct Member : public MemberBase { + typedef Impl::ReduceAdapter< FunctorType > Reduce ; + typedef typename Reduce::pointer_type pointer_type ; + + const FunctorType m_func ; + const size_t m_work ; + + ~Member() {} + + Member( const FunctorType & func , const size_t work ) + : m_func( func ), m_work( work ) + { + Impl::ThreadsExec::resize_reduce_scratch( Reduce::value_size( m_func ) ); + } + + void init( Impl::ThreadsExec & exec_ ) const + { m_func.init( Reduce::reference( exec_.reduce_base() ) ); } + + void exec( Impl::ThreadsExec & exec_ ) const + { + typename Reduce::reference_type update = Reduce::reference( exec_.reduce_base() ); + + const std::pair<size_t,size_t> work = exec_.work_range( m_work ); + + for ( size_t iwork = work.first, work_end = work.second ; iwork < work_end ; ++iwork ) { + m_func( iwork , update ); + } + } + + void fan_in_reduce( Impl::ThreadsExec & exec_ ) const + { exec_.fan_in_reduce( m_func ); } + + void output( void * ptr ) const + { + const pointer_type result = (pointer_type) ptr ; + const pointer_type data = (pointer_type) Impl::ThreadsExec::root_reduce_scratch(); + + Impl::ThreadsExec::fence(); + + Reduce::final( m_func , data ); + + if ( result ) { + const unsigned n = Reduce::value_count( m_func ); + for ( unsigned i = 0 ; i < n ; ++i ) { result[i] = data[i]; } + } + } + }; + + std::vector< MemberBase * > m_members ; + + static void execute_members( Impl::ThreadsExec & exec_ , const void * arg ) + { + const MultiFunctorParallelReduce & self = * ((const MultiFunctorParallelReduce *) arg ); + + // First functor initializes: + + self.m_members.front()->init( exec_ ); // Initialize thread-local value + + for ( unsigned i = 0 ; i < self.m_members.size() ; ++i ) { + self.m_members[i]->exec( exec_ ); + } + + // Last functor fan-in reduce: + + self.m_members.back()->fan_in_reduce( exec_ ); + } + +public: + + inline + void execute( void * host_ptr ) const + { + if ( ! 
m_members.empty() ) { + Impl::ThreadsExec::start( & MultiFunctorParallelReduce::execute_members , this ); + m_members.back()->output( host_ptr ); + } + } + + inline + void wait() const {} + + template< class FunctorType > + void push_back( const size_t work_count , const FunctorType & f ) + { + MemberBase * const m = new Member< FunctorType >( f , work_count ); + m_members.push_back( m ); + } + + ~MultiFunctorParallelReduce() + { + while ( ! m_members.empty() ) { + delete m_members.back(); + m_members.pop_back(); + } + } +}; + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp new file mode 100644 index 000000000..1f9d2a9a3 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp @@ -0,0 +1,258 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ANALYZESHAPE_HPP +#define KOKKOS_ANALYZESHAPE_HPP + +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +/** \brief Analyze the array shape defined by a Kokkos::View data type. 
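+ *
+ *  For example, the View data type 'double*[3]' is analyzed as a rank-two
+ *  array of the intrinsic scalar type 'double', with one runtime dimension
+ *  (from the '*') and one compile-time dimension of length 3 (from '[3]');
+ *  the resulting 'const_array_type' is 'const double*[3]'.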
+ * + * It is presumed that the data type can be mapped down to a multidimensional + * array of an intrinsic scalar numerical type (double, float, int, ... ). + * The 'value_type' of an array may be an embedded aggregate type such + * as a fixed length array 'Array<T,N>'. In this case the 'array_type' + * represents the underlying array of intrinsic scalar type. + * + * The embedded aggregate type must have an AnalyzeShape specialization + * to map it down to a shape and intrinsic scalar numerical type. + */ + +template< class T > +struct AnalyzeShape : public Shape< sizeof(T) , 0 > +{ + typedef void specialize ; + + typedef Shape< sizeof(T), 0 > shape ; + + typedef T array_type ; + typedef T value_type ; + typedef T type ; + typedef const T const_array_type ; + typedef const T const_value_type ; + typedef const T const_type ; + typedef T non_const_array_type ; + typedef T non_const_value_type ; + typedef T non_const_type ; +}; + +template<> +struct AnalyzeShape<void> : public Shape< 0 , 0 > +{ + typedef void specialize ; + + typedef Shape< 0 , 0 > shape ; + + typedef void array_type ; + typedef void value_type ; + typedef void type ; + typedef const void const_array_type ; + typedef const void const_value_type ; + typedef const void const_type ; + typedef void non_const_array_type ; + typedef void non_const_value_type ; + typedef void non_const_type ; +}; + +template< class T > +struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename nested::shape shape ; + + typedef typename nested::const_array_type array_type ; + typedef typename nested::const_value_type value_type ; + typedef typename nested::const_type type ; + + typedef typename nested::const_array_type const_array_type ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type ; + + typedef typename nested::non_const_array_type non_const_array_type ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type ; +}; + +template< class T > +struct AnalyzeShape< T * > + : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_type * array_type ; + typedef typename nested::value_type value_type ; + typedef typename nested::type * type ; + + typedef typename nested::const_array_type * const_array_type ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type * const_type ; + + typedef typename nested::non_const_array_type * non_const_array_type ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type * non_const_type ; +}; + +template< class T > +struct AnalyzeShape< T[] > + : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_type array_type [] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [] ; + + typedef typename nested::const_array_type const_array_type [] ; + typedef typename 
nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [] ; + + typedef typename nested::non_const_array_type non_const_array_type [] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [] ; +}; + +template< class T > +struct AnalyzeShape< const T[] > + : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type +{ +private: + typedef AnalyzeShape< const T > nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; + + typedef typename nested::array_type array_type [] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [] ; + + typedef typename nested::const_array_type const_array_type [] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [] ; + + typedef typename nested::non_const_array_type non_const_array_type [] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [] ; +}; + +template< class T , unsigned N > +struct AnalyzeShape< T[N] > + : public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type +{ +private: + typedef AnalyzeShape<T> nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , N >::type shape ; + + typedef typename nested::array_type array_type [N] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [N] ; + + typedef typename nested::const_array_type const_array_type [N] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [N] ; + + typedef typename nested::non_const_array_type non_const_array_type [N] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [N] ; +}; + +template< class T , unsigned N > +struct AnalyzeShape< const T[N] > + : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type +{ +private: + typedef AnalyzeShape< const T > nested ; +public: + + typedef typename nested::specialize specialize ; + + typedef typename ShapeInsert< typename nested::shape , N >::type shape ; + + typedef typename nested::array_type array_type [N] ; + typedef typename nested::value_type value_type ; + typedef typename nested::type type [N] ; + + typedef typename nested::const_array_type const_array_type [N] ; + typedef typename nested::const_value_type const_value_type ; + typedef typename nested::const_type const_type [N] ; + + typedef typename nested::non_const_array_type non_const_array_type [N] ; + typedef typename nested::non_const_value_type non_const_value_type ; + typedef typename nested::non_const_type non_const_type [N] ; +}; + +} // namespace Impl +} // namespace Kokkos + +#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp new file mode 100644 index 000000000..e8f5cffa4 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -0,0 +1,173 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation 
+// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP ) +#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- +// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type). +// Must cast-away 'volatile' for the CAS call. 
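+//
+// Usage sketch (illustrative only; 'lock' is a hypothetical shared flag):
+// atomic_compare_exchange returns the previous contents of *dest, so the
+// exchange succeeded exactly when the returned value equals 'compare'.
+//
+//   volatile int lock = 0 ;
+//   const int prev = Kokkos::atomic_compare_exchange( & lock , 0 , 1 );
+//   const bool acquired = ( 0 == prev );  // this thread changed 0 -> 1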
+ +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +__inline__ __device__ +int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) +{ return atomicCAS((int*)dest,compare,val); } + +__inline__ __device__ +unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val) +{ return atomicCAS((unsigned int*)dest,compare,val); } + +__inline__ __device__ +unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest , + const unsigned long long int compare , + const unsigned long long int val ) +{ return atomicCAS((unsigned long long int*)dest,compare,val); } + +template < typename T > +__inline__ __device__ +T atomic_compare_exchange( volatile T * const dest , const T & compare , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + const int tmp = atomicCAS( (int*) dest , *((int*)&compare) , *((int*)&val) ); + return *((T*)&tmp); +} + +template < typename T > +__inline__ __device__ +T atomic_compare_exchange( volatile T * const dest , const T & compare , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) +{ + typedef unsigned long long int type ; + const type tmp = atomicCAS( (type*) dest , *((type*)&compare) , *((type*)&val) ); + return *((T*)&tmp); +} + +//---------------------------------------------------------------------------- +// GCC native CAS supports int, long, unsigned int, unsigned long. +// Intel native CAS support int and long with the same interface as GCC. + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +KOKKOS_INLINE_FUNCTION +long atomic_compare_exchange( volatile long * const dest, const long compare, const long val ) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +// GCC supports unsigned + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val ) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long atomic_compare_exchange( volatile unsigned long * const dest , + const unsigned long compare , + const unsigned long val ) +{ return __sync_val_compare_and_swap(dest,compare,val); } + +#endif + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + union { int i ; T t ; } tmp ; + tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) ); + return tmp.t ; +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const dest, const T & compare, + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(long) , const T & >::type val ) +{ + union { long i ; T t ; } tmp ; + tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) ); + return tmp.t ; +} + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +KOKKOS_INLINE_FUNCTION +T atomic_compare_exchange( volatile T * const 
dest, const T compare, const T val ) +{ + T retval; +#pragma omp critical + { + retval = dest[0]; + if ( retval == compare ) + dest[0] = val; + } + return retval; +} + +#endif + + +template <typename T> +KOKKOS_INLINE_FUNCTION +bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val) +{ + return compare == atomic_compare_exchange(dest, compare, val); +} + +//---------------------------------------------------------------------------- + +} // namespace Kokkos + +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp new file mode 100644 index 000000000..fe660be7d --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -0,0 +1,208 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) +#define KOKKOS_ATOMIC_EXCHANGE_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +__inline__ __device__ +int atomic_exchange( volatile int * const dest , const int val ) +{ + // return __iAtomicExch( (int*) dest , val ); + return atomicExch( (int*) dest , val ); +} + +__inline__ __device__ +unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val ) +{ + // return __uAtomicExch( (unsigned int*) dest , val ); + return atomicExch( (unsigned int*) dest , val ); +} + +__inline__ __device__ +unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val ) +{ + // return __ullAtomicExch( (unsigned long long*) dest , val ); + return atomicExch( (unsigned long long*) dest , val ); +} + +/** \brief Atomic exchange for any type with compatible size */ +template< typename T > +__inline__ __device__ +T atomic_exchange( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) ); + int tmp = atomicExch( ((int*)dest) , *((int*)&val) ); + return *((T*)&tmp); +} + +template< typename T > +__inline__ __device__ +T atomic_exchange( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) +{ + typedef unsigned long long int type ; + // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) ); + type tmp = atomicExch( ((type*)dest) , *((type*)&val) ); + return *((T*)&tmp); +} + +/** \brief Atomic exchange for any type with compatible size */ +template< typename T > +__inline__ __device__ +void atomic_assign( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val ) +{ + // (void) __ullAtomicExch( (int*) dest , *((int*)&val) ); + (void) atomicExch( ((int*)dest) , *((int*)&val) ); +} + +template< typename T > +__inline__ __device__ +void atomic_assign( + volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) && + sizeof(T) == sizeof(unsigned long long int) , const T & >::type val ) +{ + typedef unsigned long long int type ; + // (void) __ullAtomicExch( (type*) dest , *((type*)&val) ); + (void) atomicExch( ((type*)dest) , *((type*)&val) ); +} + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +template< typename T > +KOKKOS_INLINE_FUNCTION +T atomic_exchange( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long) + , const T & >::type val ) +{ + typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; + + const type v = *((type*)&val); // Extract to be sure the value doesn't change + + type assumed ; + + union { T val_T ; type val_type ; } old ; + + old.val_T = *dest ; + + do { + assumed = old.val_type ; + old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v ); + } while ( assumed != old.val_type ); + + return old.val_T ; +} + +template< typename T > +KOKKOS_INLINE_FUNCTION +void atomic_assign( volatile T * const dest , + typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long) + , const T & >::type val ) +{ + 
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ; + + const type v = *((type*)&val); // Extract to be sure the value doesn't change + + type assumed ; + + union { T val_T ; type val_type ; } old ; + + old.val_T = *dest ; + + do { + assumed = old.val_type ; + old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v ); + } while ( assumed != old.val_type ); +} + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template < typename T > +KOKKOS_INLINE_FUNCTION +T atomic_exchange( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp critical + { + retval = dest[0]; + dest[0] = val; + } + return retval; +} + +template < typename T > +KOKKOS_INLINE_FUNCTION +void atomic_assign( volatile T * const dest , const T val ) +{ +#pragma omp critical + { + dest[0] = val; + } +} + +#endif + +//---------------------------------------------------------------------------- + +} // namespace Kokkos + +#endif + +//---------------------------------------------------------------------------- + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp new file mode 100644 index 000000000..0eac013c6 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -0,0 +1,200 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
+#define KOKKOS_ATOMIC_FETCH_ADD_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+__inline__ __device__
+int atomic_fetch_add( volatile int * const dest , const int val )
+{ return atomicAdd((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAdd((unsigned int*)dest,val); }
+
+__inline__ __device__
+unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicAdd((unsigned long long int*)dest,val); }
+
+__inline__ __device__
+float atomic_fetch_add( volatile float * const dest , const float val )
+{ return atomicAdd((float*)dest,val); }
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_add( volatile int * const dest , const int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_add( volatile long int * const dest , const long int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+#endif
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+  union { long i ; T t ; } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+
newval.t = assume.t + val ; + oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i ); + } while ( assume.i != oldval.i ); + + return oldval.t ; +} + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_add( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp critical + { + retval = dest[0]; + dest[0] += val; + } + return retval; +} + +#endif + +//---------------------------------------------------------------------------- + +// Simpler version of atomic_fetch_add without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_add(volatile T * const dest, const T src) { + atomic_fetch_add(dest,src); +} + +} + +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp new file mode 100644 index 000000000..a3b786713 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_FETCH_AND_HPP ) +#define KOKKOS_ATOMIC_FETCH_AND_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ +int atomic_fetch_and( volatile int * const dest , const int val ) +{ return atomicAnd((int*)dest,val); } + +__inline__ __device__ +unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) +{ return atomicAnd((unsigned int*)dest,val); } + +#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ ) +__inline__ __device__ +unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest , + const unsigned long long int val ) +{ return atomicAnd((unsigned long long int*)dest,val); } +#endif + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_fetch_and( volatile int * const dest , const int val ) +{ return __sync_fetch_and_and(dest,val); } + +KOKKOS_INLINE_FUNCTION +long int atomic_fetch_and( volatile long int * const dest , const long int val ) +{ return __sync_fetch_and_and(dest,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val ) +{ return __sync_fetch_and_and(dest,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val ) +{ return __sync_fetch_and_and(dest,val); } + +#endif + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_and( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp critical + { + retval = dest[0]; + dest[0] = dest[0] & val; + } + return retval; +} + +#endif + +//---------------------------------------------------------------------------- + +// Simpler version of atomic_fetch_and without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_and(volatile T * const dest, const T src) { + (void)atomic_fetch_and(dest,src); +} + +} + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp new file mode 100644 index 000000000..e04a0af35 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP ) +#define KOKKOS_ATOMIC_FETCH_OR_HPP + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + +// Support for int, unsigned int, unsigned long long int, and float + +__inline__ __device__ +int atomic_fetch_or( volatile int * const dest , const int val ) +{ return atomicOr((int*)dest,val); } + +__inline__ __device__ +unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) +{ return atomicOr((unsigned int*)dest,val); } + +#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ ) +__inline__ __device__ +unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest , + const unsigned long long int val ) +{ return atomicOr((unsigned long long int*)dest,val); } +#endif + +//---------------------------------------------------------------------------- + +#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) + +KOKKOS_INLINE_FUNCTION +int atomic_fetch_or( volatile int * const dest , const int val ) +{ return __sync_fetch_and_or(dest,val); } + +KOKKOS_INLINE_FUNCTION +long int atomic_fetch_or( volatile long int * const dest , const long int val ) +{ return __sync_fetch_and_or(dest,val); } + +#if defined( KOKKOS_ATOMICS_USE_GCC ) + +KOKKOS_INLINE_FUNCTION +unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val ) +{ return __sync_fetch_and_or(dest,val); } + +KOKKOS_INLINE_FUNCTION +unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val ) +{ return __sync_fetch_and_or(dest,val); } + +#endif + +//---------------------------------------------------------------------------- + +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + +template< typename T > +T atomic_fetch_or( volatile T * const dest , const T val ) +{ + T retval; +#pragma omp critical + { + retval = dest[0]; + dest[0] = dest[0] | val; + } + return retval; +} + +#endif + +//---------------------------------------------------------------------------- + +// Simpler version of atomic_fetch_or without the fetch +template <typename T> +KOKKOS_INLINE_FUNCTION +void atomic_or(volatile T * const dest, const T src) { + (void)atomic_fetch_or(dest,src); +} + +} + +#endif + + diff --git 
a/lib/kokkos/core/src/impl/Kokkos_Compiler_Macros.hpp b/lib/kokkos/core/src/impl/Kokkos_Compiler_Macros.hpp new file mode 100644 index 000000000..940bfcf25 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Compiler_Macros.hpp @@ -0,0 +1,152 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_COMPILER_MACROS_HPP +#define KOKKOS_COMPILER_MACROS_HPP + +#if defined __ECC || defined __ICC || defined __INTEL_COMPILER + #define KOKKOS_COMPILER_NAME "Intel C++" + #if defined __ICC + #define KOKKOS_COMPILER_VERSION __ICC + #else + #if defined __INTEL_COMPILER + #define KOKKOS_COMPILER_VERSION __INTEL_COMPILER + #else + #define KOKKOS_COMPILER_VERSION __ECC + #endif + #endif + + #define KOKKOS_COMPILER_INTEL 1 + + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + #define KOKKOS_HAVE_PRAGMA_IVDEP 1 + #define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + #define KOKKOS_HAVE_PRAGMA_VECTOR 1 + #define KOKKOS_HAVE_PRAGMA_SIMD 1 +#endif + +#if defined __IBMC__ || defined __IBMCPP__ + #define KOKKOS_COMPILER_NAME "IBM C++" + #if defined __IBMC__ + #define KOKKOS_COMPILER_VERSION __IBMC__ + #else + #define KOKKOS_COMPILER_VERSION __IBMCPP__ + #endif + #define KOKKOS_COMPILER_IBM 1 + + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + //#define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + //#define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 +#endif + +#if defined __APPLE_CC__ + /* Apple uses GNU as compiler */ + #define KOKKOS_COMPILER_APPLECC 1 +#endif + +#if defined __clang__ + #define KOKKOS_COMPILER_NAME "Clang" + #define KOKKOS_COMPILER_VERSION __clang_major__*100+__clang_minor__*10+__clang_patchlevel__ + #define KOKKOS_COMPILER_CLANG 1 + + //#define KOKKOS_HAVE_PRAGMA_UNROLL 1 + //#define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + //#define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 +#endif + +#if defined __GNUC__ && !defined KOKKOS_COMPILER_NAME && !defined __clang__ + #define KOKKOS_COMPILER_NAME "Gnu GCC" + #define KOKKOS_COMPILER_VERSION __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__ + #define KOKKOS_COMPILER_GCC 1 + + //#define KOKKOS_HAVE_PRAGMA_UNROLL 1 + //#define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + //#define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 +#endif + +#if defined __PGIC__ && !defined KOKKOS_COMPILER_NAME + #define KOKKOS_COMPILER_NAME "PGI C++" + #define KOKKOS_COMPILER_VERSION __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__ + #define KOKKOS_COMPILER_PGI 1 + + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + #define KOKKOS_HAVE_PRAGMA_IVDEP 1 + //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1 + #define KOKKOS_HAVE_PRAGMA_VECTOR 1 + //#define KOKKOS_HAVE_PRAGMA_SIMD 1 +#endif + +#if defined __NVCC__ + #define KOKKOS_DEVICE_COMPILER_NAME "NVIDIA NVCC" + #define KOKKOS_DEVICE_COMPILER_VERSION __NVCC__ + #if (!defined(KOKKOS_HAVE_PRAGMA_UNROLL) && defined(__CUDA_ARCH__)) + #define KOKKOS_HAVE_PRAGMA_UNROLL 1 + #endif +#endif + +#if !defined KOKKOS_COMPILER_NAME + #define KOKKOS_COMPILER_NAME "Unknown compiler" +#endif + +#if !defined KOKKOS_COMPILER_VERSION + #define KOKKOS_COMPILER_VERSION 0 +#endif + +#if !defined KOKKOS_DEVICE_COMPILER_NAME + #define KOKKOS_DEVICE_COMPILER_NAME KOKKOS_COMPILER_NAME +#endif + +#if !defined KOKKOS_DEVICE_COMPILER_VERSION + #define KOKKOS_DEVICE_COMPILER_VERSION KOKKOS_COMPILER_VERSION +#endif + + +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp b/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp new file mode 100644 index 000000000..55efc377b --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp @@ -0,0 +1,223 @@ +/* +//@HEADER +// 
************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP +#define KOKKOS_IMPL_CRSARRAY_FACTORY_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view ) +{ + // Force copy: + //typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused + typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type ; + + typename crsarray_type::HostMirror tmp ; + typename crsarray_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map ); + + tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const' + tmp.entries = create_mirror( view.entries ); + + // Deep copy: + deep_copy( tmp_row_map , view.row_map ); + deep_copy( tmp.entries , view.entries ); + + return tmp ; +} + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view , + typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) +{ + return view ; +} + +template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > +inline +typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror +create_mirror_view( const CrsArray<DataType,Arg1Type,Arg2Type,SizeType > & view , + typename Impl::enable_if< ! 
ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 ) +{ + return create_mirror( view ); +} + + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class CrsArrayType , class InputSizeType > +inline +typename CrsArrayType::crsarray_type +create_crsarray( const std::string & label , + const std::vector< InputSizeType > & input ) +{ + typedef CrsArrayType output_type ; + //typedef std::vector< InputSizeType > input_type ; // unused + + typedef typename output_type::entries_type entries_type ; + + typedef View< typename output_type::size_type [] , + typename output_type::array_layout , + typename output_type::device_type > work_type ; + + output_type output ; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work( "tmp" , length + 1 ); + + typename work_type::HostMirror row_work_host = + create_mirror_view( row_work ); + + size_t sum = 0 ; + row_work_host[0] = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + row_work_host[i+1] = sum += input[i]; + } + + deep_copy( row_work , row_work_host ); + + output.entries = entries_type( label , sum ); + output.row_map = row_work ; + } + + return output ; +} + +//---------------------------------------------------------------------------- + +template< class CrsArrayType , class InputSizeType > +inline +typename CrsArrayType::crsarray_type +create_crsarray( const std::string & label , + const std::vector< std::vector< InputSizeType > > & input ) +{ + typedef CrsArrayType output_type ; + //typedef std::vector< std::vector< InputSizeType > > input_type ; // unused + typedef typename output_type::entries_type entries_type ; + //typedef typename output_type::size_type size_type ; // unused + + // mfh 14 Feb 2014: This function doesn't actually create instances + // of ok_rank, but it needs to declare the typedef in order to do + // the static "assert" (a compile-time check that the given shape + // has rank 1). In order to avoid a "declared but unused typedef" + // warning, we declare an empty instance of this type, with the + // usual "(void)" marker to avoid a compiler warning for the unused + // variable. 
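// For illustration only (hypothetical input values): given
// input = { {0,1}, {}, {2,3,4} }, this overload produces row_map = { 0, 2, 2, 5 }
// (the prefix sums of the row sizes, with a leading zero) and
// entries = { 0, 1, 2, 3, 4 }, so row i occupies entries[ row_map[i] .. row_map[i+1] ).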
+ + typedef typename + Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type + ok_rank ; + { + ok_rank thing; + (void) thing; + } + + typedef View< typename output_type::size_type [] , + typename output_type::array_layout , + typename output_type::device_type > work_type ; + + output_type output ; + + // Create the row map: + + const size_t length = input.size(); + + { + work_type row_work( "tmp" , length + 1 ); + + typename work_type::HostMirror row_work_host = + create_mirror_view( row_work ); + + size_t sum = 0 ; + row_work_host[0] = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + row_work_host[i+1] = sum += input[i].size(); + } + + deep_copy( row_work , row_work_host ); + + output.entries = entries_type( label , sum ); + output.row_map = row_work ; + } + + // Fill in the entries: + { + typename entries_type::HostMirror host_entries = + create_mirror_view( output.entries ); + + size_t sum = 0 ; + for ( size_t i = 0 ; i < length ; ++i ) { + for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) { + host_entries( sum ) = input[i][j] ; + } + } + + deep_copy( output.entries , host_entries ); + } + + return output ; +} + +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp new file mode 100644 index 000000000..cf762aec9 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp @@ -0,0 +1,184 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <ostream> +#include <sstream> +#include <iomanip> +#include <stdexcept> +#include <impl/Kokkos_Error.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void throw_runtime_exception( const std::string & msg ) +{ + std::ostringstream o ; + o << msg ; + traceback_callstack( o ); + throw std::runtime_error( o.str() ); +} + + +std::string human_memory_size(size_t arg_bytes) +{ + double bytes = arg_bytes; + const double K = 1024; + const double M = K*1024; + const double G = M*1024; + + std::ostringstream out; + if (bytes < K) { + out << std::setprecision(4) << bytes << " B"; + } else if (bytes < M) { + bytes /= K; + out << std::setprecision(4) << bytes << " K"; + } else if (bytes < G) { + bytes /= M; + out << std::setprecision(4) << bytes << " M"; + } else { + bytes /= G; + out << std::setprecision(4) << bytes << " G"; + } + return out.str(); +} + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK ) + +/* This is only known to work with GNU C++ + * Must be compiled with '-rdynamic' + * Must be linked with '-ldl' + */ + +/* Print call stack into an error stream, + * so one knows in which function the error occured. + * + * Code copied from: + * http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html + * + * License on this site: + * This blog is licensed under a + * Creative Commons Attribution-Share Alike 3.0 Unported License. + * + * http://creativecommons.org/licenses/by-sa/3.0/ + * + * Modified to output to std::ostream. + */ +#include <signal.h> +#include <execinfo.h> +#include <cxxabi.h> +#include <dlfcn.h> +#include <stdlib.h> + +namespace Kokkos { +namespace Impl { + +void traceback_callstack( std::ostream & msg ) +{ + using namespace abi; + + enum { MAX_DEPTH = 32 }; + + void *trace[MAX_DEPTH]; + Dl_info dlinfo; + + int status; + + int trace_size = backtrace(trace, MAX_DEPTH); + + msg << std::endl << "Call stack {" << std::endl ; + + for (int i=1; i<trace_size; ++i) + { + if(!dladdr(trace[i], &dlinfo)) + continue; + + const char * symname = dlinfo.dli_sname; + + char * demangled = __cxa_demangle(symname, NULL, 0, &status); + + if ( status == 0 && demangled ) { + symname = demangled; + } + + if ( symname && *symname != 0 ) { + msg << " object: " << dlinfo.dli_fname + << " function: " << symname + << std::endl ; + } + + if ( demangled ) { + free(demangled); + } + } + msg << "}" ; +} + +} +} + +#else + +namespace Kokkos { +namespace Impl { + +void traceback_callstack( std::ostream & msg ) +{ + msg << std::endl << "Traceback functionality not available" << std::endl ; +} + +} +} + +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp b/lib/kokkos/core/src/impl/Kokkos_Error.hpp new file mode 100644 index 000000000..5b89b18fc --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp @@ -0,0 +1,65 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_IMPL_ERROR_HPP +#define KOKKOS_IMPL_ERROR_HPP + +#include <string> +#include <iosfwd> + +namespace Kokkos { +namespace Impl { + +void throw_runtime_exception( const std::string & ); + +void traceback_callstack( std::ostream & ); + +std::string human_memory_size(size_t arg_bytes); + +} +} + +#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp new file mode 100644 index 000000000..752b3d6fd --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -0,0 +1,290 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <memory.h> +#include <stddef.h> +#include <stdlib.h> +#include <iostream> +#include <sstream> +#include <cstring> + +#include <Kokkos_HostSpace.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace { + +class HostMemoryTrackingEntry : public Impl::MemoryTrackingEntry +{ +public: + + void * const ptr_alloc ; + + HostMemoryTrackingEntry( const std::string & arg_label , + const std::type_info & arg_info , + void * const arg_ptr , + const unsigned arg_size ) + : Impl::MemoryTrackingEntry( arg_label , arg_info , arg_ptr , arg_size ) + , ptr_alloc( arg_ptr ) + {} + + ~HostMemoryTrackingEntry(); +}; + +HostMemoryTrackingEntry::~HostMemoryTrackingEntry() +{ +#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA ) + _mm_free( ptr_alloc ); +#else + free( ptr_alloc ); +#endif +} + +Impl::MemoryTracking & host_space_singleton() +{ + static Impl::MemoryTracking self("Kokkos::HostSpace"); + return self ; +} + +bool host_space_verify_modifiable( const char * const label ) +{ + static const char error_in_parallel[] = "Called with HostSpace::in_parallel()" ; + static const char error_not_exists[] = "Called after return from main()" ; + + const char * const error_msg = + HostSpace::in_parallel() ? error_in_parallel : ( + ! host_space_singleton().exists() ? error_not_exists : (const char *) 0 ); + + if ( error_msg ) { + std::cerr << "Kokkos::HostSpace::" << label << " ERROR : " << error_msg << std::endl ; + } + + return error_msg == 0 ; +} + +} // namespace <blank> +} // namespade Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +void * host_allocate_not_thread_safe( + const std::string & label , + const std::type_info & scalar_type , + const size_t scalar_size , + const size_t scalar_count ) +{ + void * ptr = 0 ; + + if ( 0 < scalar_size && 0 < scalar_count ) { + void * ptr_alloc = 0 ; + size_t count_alloc = scalar_count ; + +#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA ) + + ptr = ptr_alloc = _mm_malloc( scalar_size * count_alloc , MEMORY_ALIGNMENT ); + +#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \ + ( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 ) + + posix_memalign( & ptr_alloc , MEMORY_ALIGNMENT , scalar_size * count_alloc ); + ptr = ptr_alloc ; + +#else + + // Over-allocate to guarantee enough aligned space. 
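// For example (hypothetical sizes): with MEMORY_ALIGNMENT of 64 and scalar_size of 8,
// ( 64 + 8 - 1 ) / 8 == 8 extra elements (64 bytes) are added to count_alloc, which is
// enough to advance ptr past malloc's return address to the next 64-byte boundary
// (worst case the full 64 bytes, when the returned address is already aligned).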
+ + count_alloc += ( MEMORY_ALIGNMENT + scalar_size - 1 ) / scalar_size ; + + ptr_alloc = malloc( scalar_size * count_alloc ); + + ptr = static_cast<unsigned char *>(ptr_alloc) + + ( MEMORY_ALIGNMENT - reinterpret_cast<ptrdiff_t>(ptr_alloc) % MEMORY_ALIGNMENT ); + +#endif + + if ( ptr_alloc && ptr_alloc <= ptr && + 0 == ( reinterpret_cast<ptrdiff_t>(ptr) % MEMORY_ALIGNMENT ) ) { + host_space_singleton().insert( + new HostMemoryTrackingEntry( label , scalar_type , ptr_alloc , scalar_size * count_alloc ) ); + } + else { + std::ostringstream msg ; + msg << "Kokkos::Impl::host_allocate_not_thread_safe( " + << label + << " , " << scalar_type.name() + << " , " << scalar_size + << " , " << scalar_count + << " ) FAILED aligned memory allocation" ; + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + } + + return ptr ; +} + +void host_decrement_not_thread_safe( const void * ptr ) +{ + host_space_singleton().decrement( ptr ); +} + +DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n ) +{ + memcpy( dst , src , n ); +} + +} +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace { + +static const int QUERY_DEVICE_IN_PARALLEL_MAX = 16 ; + +typedef int (* QueryDeviceInParallelPtr )(); + +QueryDeviceInParallelPtr s_in_parallel_query[ QUERY_DEVICE_IN_PARALLEL_MAX ] ; +int s_in_parallel_query_count = 0 ; + +} // namespace <empty> + +void HostSpace::register_in_parallel( int (*device_in_parallel)() ) +{ + if ( 0 == device_in_parallel ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) ); + } + + int i = -1 ; + + if ( ! (device_in_parallel)() ) { + for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i ); + } + + if ( i < s_in_parallel_query_count ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) ); + + } + + if ( QUERY_DEVICE_IN_PARALLEL_MAX <= i ) { + Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); + + } + + for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); + + if ( i == s_in_parallel_query_count ) { + s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; + } +} + +int HostSpace::in_parallel() +{ + const int n = s_in_parallel_query_count ; + + int i = 0 ; + + while ( i < n && ! 
(*(s_in_parallel_query[i]))() ) { ++i ; } + + return i < n ; +} + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { + +void * HostSpace::allocate( + const std::string & label , + const std::type_info & scalar_type , + const size_t scalar_size , + const size_t scalar_count ) +{ + void * ptr = 0 ; + + if ( host_space_verify_modifiable("allocate") ) { + ptr = Impl::host_allocate_not_thread_safe( label , scalar_type , scalar_size , scalar_count ); + } + + return ptr ; +} + +void HostSpace::increment( const void * ptr ) +{ + if ( host_space_verify_modifiable("increment") ) { + host_space_singleton().increment( ptr ); + } +} + +void HostSpace::decrement( const void * ptr ) +{ + if ( host_space_verify_modifiable("decrement") ) { + Impl::host_decrement_not_thread_safe( ptr ); + } +} + +void HostSpace::print_memory_view( std::ostream & o ) +{ + host_space_singleton().print( o , std::string(" ") ); +} + +std::string HostSpace::query_label( const void * p ) +{ + const Impl::MemoryTrackingEntry * const info = + host_space_singleton().query( p ); + + return 0 != info ? info->label : std::string("ERROR NOT DEFINED"); +} + +} // namespace Kokkos + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + diff --git a/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp new file mode 100644 index 000000000..627e70077 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp @@ -0,0 +1,285 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stddef.h> +#include <limits> +#include <iostream> +#include <sstream> +#include <algorithm> + +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_MemoryTracking.hpp> + +namespace Kokkos { +namespace Impl { +namespace { + +//---------------------------------------------------------------------------- +// Fast search for result[-1] <= val < result[0]. +// Requires result[max] == upper_bound. +// Start with a binary search until the search range is +// less than LINEAR_LIMIT, then switch to linear search. + +int upper_bound( const ptrdiff_t * const begin , unsigned length , + const ptrdiff_t val ) +{ + enum { LINEAR_LIMIT = 32 }; + + // precondition: begin[length-1] == std::numeric_limits<ptrdiff_t>::max() + + const ptrdiff_t * first = begin ; + + while ( LINEAR_LIMIT < length ) { + unsigned half = length >> 1 ; + const ptrdiff_t * middle = first + half ; + + if ( val < *middle ) { + length = half ; + } + else { + first = ++middle ; + length -= ++half ; + } + } + + for ( ; ! ( val < *first ) ; ++first ) {} + + return first - begin ; +} + +} // namespace + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +MemoryTracking::MemoryTracking( const std::string & space ) + : m_space( space ), m_tracking(), m_tracking_end() +{ + ptrdiff_t max = std::numeric_limits<ptrdiff_t>::max(); + void * const ptr = reinterpret_cast<void*>( max ); + + m_tracking.reserve(64); + m_tracking_end.reserve(64); + + // Sentinal value of end + + m_tracking.push_back( new MemoryTrackingEntry( "sentinal" , typeid(void) , ptr , 0 ) ); + m_tracking_end.push_back( max ); +} + +MemoryTracking::~MemoryTracking() +{ + const ptrdiff_t max = std::numeric_limits<ptrdiff_t>::max(); + + try { + if ( 1 < m_tracking.size() ) { + std::cerr << m_space << " destroyed with memory leaks:" << std::endl ; + print( std::cerr , std::string(" ") ); + } + else if ( 1 != m_tracking_end.size() || m_tracking_end.back() != max ) { + std::cerr << m_space << " corrupted data structure" << std::endl ; + } + + // Deallocate memory within the try-catch block: + m_space = std::string(); + m_tracking = std::vector<MemoryTrackingEntry*>(); + m_tracking_end = std::vector<ptrdiff_t>(); + + } catch( ... ) {} +} + +void MemoryTracking::insert( MemoryTrackingEntry * entry ) +{ + const ptrdiff_t max = std::numeric_limits<ptrdiff_t>::max(); + + const bool ok_exists = ! m_tracking_end.empty(); + + const bool ok_range = entry && + 0 < entry->begin && + entry->begin < entry->end && + entry->end < max ; + + int i = -1 ; + + if ( ok_exists && ok_range ) { + + i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , entry->begin ); + + // Guaranteed: + // a) entry->begin < m_tracking_end[i] + // b) i == 0 || m_tracking_end[i-1] <= entry->begin + + if ( entry->end <= m_tracking[i]->begin ) { + + // Non-overlapping range: + // m_tracking[i-1].end <= entry->begin < entry->end <= m_tracking[i].begin + + entry->m_count = 1 ; + + m_tracking.insert( m_tracking.begin() + i , entry ); + m_tracking_end.insert( m_tracking_end.begin() + i , entry->end ); + } + } + + if ( ! ok_exists || ! ok_range || -1 == i ) { + std::ostringstream msg ; + msg << "MemoryTracking(" << m_space << ")::insert( " ; + entry->print( msg ); + msg << " ) ERROR: " ; + + if ( ! 
ok_range ) { + msg << "Invalid memory range" ; + } + else { + msg << "Overlapping memory range with " ; + m_tracking[i]->print( msg ); + } + msg << " )" ; + throw_runtime_exception( msg.str() ); + } +} + +void MemoryTracking::increment( const void * ptr ) +{ + if ( ptr ) { + const ptrdiff_t p = reinterpret_cast<ptrdiff_t>( ptr ); + + bool error = m_tracking_end.empty(); + + if ( ! error ) { + + const int i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , p ); + + error = p < m_tracking[i]->begin ; + + if ( ! error ) { + ++( m_tracking[i]->m_count ); + } + } + + if ( error ) { + std::ostringstream msg ; + msg << "MemoryTracking(" << m_space + << ")::increment( " << p << " ) ERROR: Not being tracked" ; + throw_runtime_exception( msg.str() ); + } + } +} + +void MemoryTracking::decrement( const void * ptr ) +{ + if ( ptr ) { + const ptrdiff_t p = reinterpret_cast<ptrdiff_t>( ptr ); + + bool error = m_tracking_end.empty(); + + if ( ! error ) { + + const int i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , p ); + + error = p < m_tracking[i]->begin ; + + if ( ! error && ( 0 == --( m_tracking[i]->m_count ) ) ) { + delete m_tracking[i] ; + + m_tracking.erase( m_tracking.begin() + i ); + m_tracking_end.erase( m_tracking_end.begin() + i ); + } + } + + if ( error ) { + std::ostringstream msg ; + msg << "MemoryTracking(" << m_space + << ")::decrement( " << p << " ) ERROR: Not being tracked" + << std::endl ; + std::cerr << msg.str(); + } + } +} + +MemoryTrackingEntry * +MemoryTracking::query( const void * ptr ) const +{ + MemoryTrackingEntry * result = 0 ; + + if ( ptr && ! m_tracking_end.empty() ) { + const ptrdiff_t p = reinterpret_cast<ptrdiff_t>( ptr ); + + const int i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , p ); + + if ( m_tracking[i]->begin <= p ) result = m_tracking[i] ; + } + + return result ; +} + +void MemoryTracking::print( std::ostream & s , const std::string & lead ) const +{ + // Don't print the sentinal value: + const size_t n = m_tracking.empty() ? 0 : m_tracking.size() - 1 ; + + for ( size_t i = 0 ; i < n ; ++i ) { + s << lead ; + m_tracking[i]->print( s ); + s << std::endl ; + } +} + +MemoryTrackingEntry::~MemoryTrackingEntry() +{} + +void MemoryTrackingEntry::print( std::ostream & s ) const +{ + s << "{ " + << "label(" << label << ") " + << "typeid(" << type.name() << ") " + << "range[ " << ((void*)begin) << " : " << ((void*)end) << " ) " + << "count(" << m_count << ") }" ; +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + + diff --git a/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp new file mode 100644 index 000000000..1571e3b40 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp @@ -0,0 +1,151 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_MEMORY_TRACKING_HPP +#define KOKKOS_MEMORY_TRACKING_HPP + +#include <cstddef> +#include <utility> +#include <vector> +#include <string> +#include <typeinfo> +#include <iosfwd> + +namespace Kokkos { +namespace Impl { + +class MemoryTracking ; + +class MemoryTrackingEntry { +public: + const std::string label ; + const std::type_info & type ; + const ptrdiff_t begin ; + const ptrdiff_t end ; +private: + unsigned m_count ; +protected: + + MemoryTrackingEntry( const std::string & arg_label , + const std::type_info & arg_type , + const void * const arg_begin , + const unsigned arg_bytes ) + : label( arg_label ) + , type( arg_type ) + , begin( reinterpret_cast<ptrdiff_t>( arg_begin ) ) + , end( reinterpret_cast<ptrdiff_t>( + reinterpret_cast<const unsigned char *>( arg_begin ) + arg_bytes ) ) + , m_count( 0 ) + {} + +public: + + unsigned count() const { return m_count ; } + + virtual void print( std::ostream & ) const ; + + virtual ~MemoryTrackingEntry(); + +private: + + MemoryTrackingEntry(); + MemoryTrackingEntry( const MemoryTrackingEntry & rhs ); + MemoryTrackingEntry & operator = ( const MemoryTrackingEntry & rhs ); + + friend class MemoryTracking ; +}; + + +class MemoryTracking { +public: + + /** \brief Track a memory range defined by the entry. + * This entry must be allocated via 'new'. + */ + void insert( MemoryTrackingEntry * entry ); + + /** \brief Decrement the tracked memory range. + * If the count is zero then the entry is deleted + * via the 'delete' operator. + */ + void decrement( const void * ptr ); + + /** \brief Increment the tracking count. */ + void increment( const void * ptr ); + + /** \brief Query a tracked memory range. */ + MemoryTrackingEntry * query( const void * ptr ) const ; + + /** \brief Call the 'print' method on all entries. 
*/ + void print( std::ostream & , const std::string & lead ) const ; + + size_t size() const { return m_tracking.size(); } + + template< typename iType > + MemoryTracking & operator[]( const iType & i ) const + { return *m_tracking[i]; } + + /** \brief Construct with a name for error messages */ + explicit MemoryTracking( const std::string & space ); + + /** \brief Print memory leak warning for all entries. */ + ~MemoryTracking(); + + /** \brief Query if constructed */ + bool exists() const { return ! m_tracking_end.empty(); } + +private: + MemoryTracking(); + MemoryTracking( const MemoryTracking & ); + MemoryTracking & operator = ( const MemoryTracking & ); + + std::string m_space ; + std::vector<MemoryTrackingEntry*> m_tracking ; + std::vector<ptrdiff_t> m_tracking_end ; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif + diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp new file mode 100644 index 000000000..0222b561f --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp @@ -0,0 +1,72 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_MEMORY_FENCE ) +#define KOKKOS_MEMORY_FENCE + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +KOKKOS_FORCEINLINE_FUNCTION +void memory_fence() +{ +#if defined( KOKKOS_ATOMICS_USE_CUDA ) + __threadfence(); +#elif defined( KOKKOS_ATOMICS_USE_GCC ) && ( !defined( __INTEL_COMPILER ) || defined ( KOKKOS_HAVE_CUDA ) ) + __sync_synchronize(); +#elif defined( __INTEL_COMPILER ) || defined( KOKKOS_ATOMICS_USE_INTEL ) + _mm_mfence(); +#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) + #pragma omp flush + +#else + #error "Error: memory_fence() not defined" +#endif +} + +} // namespace kokkos + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp new file mode 100644 index 000000000..0dcb3977a --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp @@ -0,0 +1,84 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP +#define KOKKOS_PHYSICAL_LAYOUT_HPP + + +#include <Kokkos_View.hpp> +namespace Kokkos { +namespace Impl { + + + +struct PhysicalLayout { + enum LayoutType {Left,Right,Scalar,Error}; + LayoutType layout_type; + int rank; + long long int stride[8]; //distance between two neighboring elements in a given dimension + + template< class T , class L , class D , class M > + PhysicalLayout( const View<T,L,D,M,ViewDefault> & view ) + : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? 
Left : ( + is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )) + , rank( view.Rank ) + { + for(int i=0;i<8;i++) stride[i] = 0; + view.stride( stride ); + } + #ifdef KOKKOS_HAVE_CUDA + template< class T , class L , class D , class M > + PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view ) + : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : ( + is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error )) + , rank( view.Rank ) + { + for(int i=0;i<8;i++) stride[i] = 0; + view.stride( stride ); + } + #endif +}; + +} +} +#endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp new file mode 100644 index 000000000..ba302f9c2 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -0,0 +1,85 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <stdlib.h> +#include <Kokkos_Serial.hpp> + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace { + +struct Sentinel { + + void * m_reduce ; + unsigned m_reduce_size ; + + Sentinel() : m_reduce(0), m_reduce_size(0) {} + + ~Sentinel() { if ( m_reduce ) { free( m_reduce ); } } +}; + +} + +void * Serial::resize_reduce_scratch( unsigned size ) +{ + static Sentinel s ; + + const unsigned rem = size % Impl::MEMORY_ALIGNMENT ; + + if ( rem ) size += Impl::MEMORY_ALIGNMENT - rem ; + + if ( ( 0 == size ) || ( s.m_reduce_size < size ) ) { + + if ( s.m_reduce ) { free( s.m_reduce ); } + + s.m_reduce_size = size ; + + s.m_reduce = malloc( size ); + } + + return s.m_reduce ; +} + +} // namespace Kokkos + diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.cpp b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp new file mode 100644 index 000000000..e3bf5d39f --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp @@ -0,0 +1,178 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
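Kokkos_Serial.cpp above adds the scratch buffer used by reductions on the Serial device: the buffer lives in a function-local static, is grown on demand, and its size is padded up to Impl::MEMORY_ALIGNMENT. A hedged calling sketch follows, assuming resize_reduce_scratch() is declared as a static member of Kokkos::Serial in the accompanying header; the returned memory is owned by the library and must not be freed by the caller.

  // Illustrative only: obtain the Serial device's reduction scratch space.
  #include <Kokkos_Serial.hpp>

  double * serial_reduce_buffer( unsigned count )
  {
    // The requested size is rounded up to Impl::MEMORY_ALIGNMENT and the
    // allocation is reused across calls; do not free() the result.
    void * const scratch =
      Kokkos::Serial::resize_reduce_scratch( count * sizeof(double) );
    return static_cast<double*>( scratch );
  }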
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#include <sstream> +#include <impl/Kokkos_Error.hpp> +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +void assert_counts_are_equal_throw( + const unsigned x_count , + const unsigned y_count ) +{ + std::ostringstream msg ; + + msg << "Kokkos::Impl::assert_counts_are_equal_throw( " + << x_count << " != " << y_count << " )" ; + + throw_runtime_exception( msg.str() ); +} + +void assert_shapes_are_equal_throw( + const unsigned x_scalar_size , + const unsigned x_rank , + const unsigned x_N0 , const unsigned x_N1 , + const unsigned x_N2 , const unsigned x_N3 , + const unsigned x_N4 , const unsigned x_N5 , + const unsigned x_N6 , const unsigned x_N7 , + + const unsigned y_scalar_size , + const unsigned y_rank , + const unsigned y_N0 , const unsigned y_N1 , + const unsigned y_N2 , const unsigned y_N3 , + const unsigned y_N4 , const unsigned y_N5 , + const unsigned y_N6 , const unsigned y_N7 ) +{ + std::ostringstream msg ; + + msg << "Kokkos::Impl::assert_shape_are_equal_throw( {" + << " scalar_size(" << x_scalar_size + << ") rank(" << x_rank + << ") dimension(" ; + if ( 0 < x_rank ) { msg << " " << x_N0 ; } + if ( 1 < x_rank ) { msg << " " << x_N1 ; } + if ( 2 < x_rank ) { msg << " " << x_N2 ; } + if ( 3 < x_rank ) { msg << " " << x_N3 ; } + if ( 4 < x_rank ) { msg << " " << x_N4 ; } + if ( 5 < x_rank ) { msg << " " << x_N5 ; } + if ( 6 < x_rank ) { msg << " " << x_N6 ; } + if ( 7 < x_rank ) { msg << " " << x_N7 ; } + msg << " ) } != { " + << " scalar_size(" << y_scalar_size + << ") rank(" << y_rank + << ") dimension(" ; + if ( 0 < y_rank ) { msg << " " << y_N0 ; } + if ( 1 < y_rank ) { msg << " " << y_N1 ; } + if ( 2 < y_rank ) { msg << " " << y_N2 ; } + if ( 3 < y_rank ) { msg << " " << y_N3 ; } + if ( 4 < y_rank ) { msg << " " << y_N4 ; } + if ( 5 < y_rank ) { msg << " " << y_N5 ; } + if ( 6 < y_rank ) { msg << " " << y_N6 ; } + if ( 7 < y_rank ) { msg << " " << y_N7 ; } + msg << " ) } )" ; + + throw_runtime_exception( msg.str() ); +} + +void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply( + const size_t rank , + const size_t n0 , const size_t n1 , + const size_t n2 , const size_t n3 , + const size_t n4 , const size_t n5 , + const size_t n6 , const size_t n7 , + + const size_t arg_rank , + const size_t i0 , const size_t i1 , + const size_t i2 , const size_t i3 , + const size_t i4 , const size_t i5 , + const size_t i6 , const size_t i7 ) +{ + std::ostringstream msg ; + msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ; + if ( 0 < rank ) { msg << " " << n0 ; } + if ( 1 < rank ) { msg << " " << n1 ; } + if ( 2 < rank ) { msg << " " << n2 ; } + if ( 3 < rank ) { msg << " " << n3 ; } + if ( 4 < rank ) { msg << " " << n4 ; } + if ( 5 < rank ) { msg << " " << n5 ; } + if ( 6 < rank ) { msg << " " << n6 ; } + if ( 7 < rank ) { msg << " " << n7 ; } + msg << " } index = {" ; + if ( 0 < arg_rank ) { msg << " " << i0 ; } + if ( 1 < arg_rank ) { msg << " " << i1 ; } + if ( 2 < arg_rank ) { msg << " " << i2 ; } + if ( 3 < arg_rank ) { msg << " " << i3 ; } + if ( 4 < arg_rank ) { msg << " " << i4 ; } + if ( 5 < arg_rank ) { msg << " " << i5 ; } + if ( 6 < arg_rank ) { msg << " " << i6 ; } + if ( 7 < arg_rank ) { msg << " " << i7 ; } + msg << " } )" ; + + 
throw_runtime_exception( msg.str() ); +} + +void assert_shape_effective_rank1_at_leastN_throw( + const size_t x_rank , const size_t x_N0 , + const size_t x_N1 , const size_t x_N2 , + const size_t x_N3 , const size_t x_N4 , + const size_t x_N5 , const size_t x_N6 , + const size_t x_N7 , + const size_t N0 ) +{ + std::ostringstream msg ; + + msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ; + if ( 0 < x_rank ) { msg << " " << x_N0 ; } + if ( 1 < x_rank ) { msg << " " << x_N1 ; } + if ( 2 < x_rank ) { msg << " " << x_N2 ; } + if ( 3 < x_rank ) { msg << " " << x_N3 ; } + if ( 4 < x_rank ) { msg << " " << x_N4 ; } + if ( 5 < x_rank ) { msg << " " << x_N5 ; } + if ( 6 < x_rank ) { msg << " " << x_N6 ; } + if ( 7 < x_rank ) { msg << " " << x_N7 ; } + msg << " } N = " << N0 << " )" ; + + throw_runtime_exception( msg.str() ); +} + + + +} +} + diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.hpp b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp new file mode 100644 index 000000000..2bcd7faf8 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp @@ -0,0 +1,895 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_SHAPE_HPP +#define KOKKOS_SHAPE_HPP + +#include <typeinfo> +#include <utility> +#include <Kokkos_Macros.hpp> +#include <Kokkos_Layout.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_StaticAssert.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- +/** \brief The shape of a Kokkos with dynamic and static dimensions. + * Dynamic dimensions are member values and static dimensions are + * 'static const' values. + * + * The upper bound on the array rank is eight. + */ +template< unsigned ScalarSize , + unsigned Rank , + unsigned s0 = 1 , + unsigned s1 = 1 , + unsigned s2 = 1 , + unsigned s3 = 1 , + unsigned s4 = 1 , + unsigned s5 = 1 , + unsigned s6 = 1 , + unsigned s7 = 1 > +struct Shape ; + +template< class ShapeType , class Layout > +struct ShapeMap ; + +//---------------------------------------------------------------------------- +/** \brief Shape equality if the value type, layout, and dimensions + * are equal. + */ +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize , unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +KOKKOS_INLINE_FUNCTION +bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ + enum { same_size = xSize == ySize }; + enum { same_rank = xRank == yRank }; + + return same_size && same_rank && + unsigned( x.N0 ) == unsigned( y.N0 ) && + unsigned( x.N1 ) == unsigned( y.N1 ) && + unsigned( x.N2 ) == unsigned( y.N2 ) && + unsigned( x.N3 ) == unsigned( y.N3 ) && + unsigned( x.N4 ) == unsigned( y.N4 ) && + unsigned( x.N5 ) == unsigned( y.N5 ) && + unsigned( x.N6 ) == unsigned( y.N6 ) && + unsigned( x.N7 ) == unsigned( y.N7 ) ; +} + +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize ,unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +KOKKOS_INLINE_FUNCTION +bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ return ! 
operator == ( x , y ); } + +//---------------------------------------------------------------------------- + +void assert_counts_are_equal_throw( + const unsigned x_count , + const unsigned y_count ); + +inline +void assert_counts_are_equal( + const unsigned x_count , + const unsigned y_count ) +{ + if ( x_count != y_count ) { + assert_counts_are_equal_throw( x_count , y_count ); + } +} + +void assert_shapes_are_equal_throw( + const unsigned x_scalar_size , + const unsigned x_rank , + const unsigned x_N0 , const unsigned x_N1 , + const unsigned x_N2 , const unsigned x_N3 , + const unsigned x_N4 , const unsigned x_N5 , + const unsigned x_N6 , const unsigned x_N7 , + + const unsigned y_scalar_size , + const unsigned y_rank , + const unsigned y_N0 , const unsigned y_N1 , + const unsigned y_N2 , const unsigned y_N3 , + const unsigned y_N4 , const unsigned y_N5 , + const unsigned y_N6 , const unsigned y_N7 ); + +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize , unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +inline +void assert_shapes_are_equal( + const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ + typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ; + typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ; + + if ( x != y ) { + assert_shapes_are_equal_throw( + x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7, + y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 ); + } +} + +template< unsigned xSize , unsigned xRank , + unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , + unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , + + unsigned ySize , unsigned yRank , + unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , + unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > +void assert_shapes_equal_dimension( + const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x , + const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y ) +{ + typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ; + typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ; + + // Omit comparison of scalar_size. 
+ if ( unsigned( x.rank ) != unsigned( y.rank ) || + unsigned( x.N0 ) != unsigned( y.N0 ) || + unsigned( x.N1 ) != unsigned( y.N1 ) || + unsigned( x.N2 ) != unsigned( y.N2 ) || + unsigned( x.N3 ) != unsigned( y.N3 ) || + unsigned( x.N4 ) != unsigned( y.N4 ) || + unsigned( x.N5 ) != unsigned( y.N5 ) || + unsigned( x.N6 ) != unsigned( y.N6 ) || + unsigned( x.N7 ) != unsigned( y.N7 ) ) { + assert_shapes_are_equal_throw( + x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7, + y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 ); + } +} + +//---------------------------------------------------------------------------- + +template< class ShapeType > struct assert_shape_is_rank_zero ; +template< class ShapeType > struct assert_shape_is_rank_one ; + +template< unsigned Size > +struct assert_shape_is_rank_zero< Shape<Size,0> > + : public true_type {}; + +template< unsigned Size , unsigned s0 > +struct assert_shape_is_rank_one< Shape<Size,1,s0> > + : public true_type {}; + +//---------------------------------------------------------------------------- + +/** \brief Array bounds assertion templated on the execution space + * to allow device-specific abort code. + */ +template< class ExecutionSpace > +struct AssertShapeBoundsAbort ; + +template<> +struct AssertShapeBoundsAbort< Kokkos::HostSpace > +{ + static void apply( const size_t rank , + const size_t n0 , const size_t n1 , + const size_t n2 , const size_t n3 , + const size_t n4 , const size_t n5 , + const size_t n6 , const size_t n7 , + const size_t arg_rank , + const size_t i0 , const size_t i1 , + const size_t i2 , const size_t i3 , + const size_t i4 , const size_t i5 , + const size_t i6 , const size_t i7 ); +}; + +template< class ExecutionDevice > +struct AssertShapeBoundsAbort +{ + KOKKOS_INLINE_FUNCTION + static void apply( const size_t rank , + const size_t n0 , const size_t n1 , + const size_t n2 , const size_t n3 , + const size_t n4 , const size_t n5 , + const size_t n6 , const size_t n7 , + const size_t arg_rank , + const size_t i0 , const size_t i1 , + const size_t i2 , const size_t i3 , + const size_t i4 , const size_t i5 , + const size_t i6 , const size_t i7 ) + { + AssertShapeBoundsAbort< Kokkos::HostSpace > + ::apply( rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 , + arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 ); + } +}; + +template< class ShapeType > +KOKKOS_INLINE_FUNCTION +void assert_shape_bounds( const ShapeType & shape , + const size_t arg_rank , + const size_t i0 , + const size_t i1 = 0 , + const size_t i2 = 0 , + const size_t i3 = 0 , + const size_t i4 = 0 , + const size_t i5 = 0 , + const size_t i6 = 0 , + const size_t i7 = 0 ) +{ + // Must supply at least as many indices as ranks. + // Every index must be within bounds. + const bool ok = ShapeType::rank <= arg_rank && + i0 < shape.N0 && + i1 < shape.N1 && + i2 < shape.N2 && + i3 < shape.N3 && + i4 < shape.N4 && + i5 < shape.N5 && + i6 < shape.N6 && + i7 < shape.N7 ; + + if ( ! 
ok ) { + AssertShapeBoundsAbort< ExecutionSpace > + ::apply( ShapeType::rank , + shape.N0 , shape.N1 , shape.N2 , shape.N3 , + shape.N4 , shape.N5 , shape.N6 , shape.N7 , + arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 ); + } +} + +#if defined( KOKKOS_EXPRESSION_CHECK ) +#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6); +#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7); +#else +#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */ +#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */ +#endif + + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Specialization and optimization for the Rank 0 shape. 
+ +template < unsigned ScalarSize > +struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 0 }; + enum { rank = 0 }; + + enum { N0 = 1 }; + enum { N1 = 1 }; + enum { N2 = 1 }; + enum { N3 = 1 }; + enum { N4 = 1 }; + enum { N5 = 1 }; + enum { N6 = 1 }; + enum { N7 = 1 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + {} +}; + +//---------------------------------------------------------------------------- +// All-static dimension array + +template < unsigned ScalarSize , + unsigned Rank , + unsigned s0 , + unsigned s1 , + unsigned s2 , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape { + + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 0 }; + enum { rank = Rank }; + + enum { N0 = s0 }; + enum { N1 = s1 }; + enum { N2 = s2 }; + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + {} +}; + +// 1 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , + unsigned Rank , + unsigned s1 , + unsigned s2 , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 1 }; + enum { rank = Rank }; + + unsigned N0 ; + + enum { N1 = s1 }; + enum { N2 = s2 }; + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; } +}; + +// 2 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s2 , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 2 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + + enum { N2 = s2 }; + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; } +}; + +// 3 == dynamic_rank <= rank <= 8 +template < unsigned Rank , unsigned ScalarSize , + unsigned s3 , + unsigned s4 , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7> +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 3 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + + enum { N3 = s3 }; + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; } +}; + +// 4 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s4 , + unsigned s5 , + unsigned s6 , + 
unsigned s7 > +struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 4 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + + enum { N4 = s4 }; + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; } +}; + +// 5 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s5 , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 5 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + + enum { N5 = s5 }; + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) + { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; } +}; + +// 6 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s6 , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 6 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + + enum { N6 = s6 }; + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned n5 = 0 , unsigned = 0 , unsigned = 0 ) + { + s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; + s.N4 = n4 ; s.N5 = n5 ; + } +}; + +// 7 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize , unsigned Rank , + unsigned s7 > +struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 7 }; + enum { rank = Rank }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + unsigned N6 ; + + enum { N7 = s7 }; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 ) + { + s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; + s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; + } +}; + +// 8 == dynamic_rank <= rank <= 8 +template < unsigned ScalarSize > +struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 > +{ + enum { scalar_size = ScalarSize }; + enum { rank_dynamic = 8 }; + enum { rank = 8 }; + + unsigned N0 ; + unsigned N1 ; + unsigned N2 ; + unsigned N3 ; + unsigned N4 ; + unsigned N5 ; + unsigned N6 ; + unsigned N7 ; + + KOKKOS_INLINE_FUNCTION + static + void assign( Shape & s , + unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , + unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 ) + { + s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; + s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ; + } +}; + +//---------------------------------------------------------------------------- + +template< class ShapeType , unsigned N , + unsigned R = ShapeType::rank_dynamic > +struct ShapeInsert ; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 0 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 
1 , + N , + ShapeType::N0 , + ShapeType::N1 , + ShapeType::N2 , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 1 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + N , + ShapeType::N1 , + ShapeType::N2 , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 2 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + N , + ShapeType::N2 , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 3 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + N , + ShapeType::N3 , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 4 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + N , + ShapeType::N4 , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 5 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + 0 , + N , + ShapeType::N5 , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 6 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + 0 , + 0 , + N , + ShapeType::N6 > type ; +}; + +template< class ShapeType , unsigned N > +struct ShapeInsert< ShapeType , N , 7 > +{ + typedef Shape< ShapeType::scalar_size , + ShapeType::rank + 1 , + 0 , + 0 , + 0 , + 0 , + 0 , + 0 , + 0 , + N > type ; +}; + +//---------------------------------------------------------------------------- + +template< class DstShape , class SrcShape , + unsigned DstRankDynamic = DstShape::rank_dynamic , + bool DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) > +struct ShapeCompatible { enum { value = false }; }; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 8 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 7 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 6 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 5 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 4 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + 
unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 3 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 2 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N2) == unsigned(SrcShape::N2) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 1 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N1) == unsigned(SrcShape::N1) && + unsigned(DstShape::N2) == unsigned(SrcShape::N2) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +template< class DstShape , class SrcShape > +struct ShapeCompatible< DstShape , SrcShape , 0 , true > +{ + enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && + unsigned(DstShape::N0) == unsigned(SrcShape::N0) && + unsigned(DstShape::N1) == unsigned(SrcShape::N1) && + unsigned(DstShape::N2) == unsigned(SrcShape::N2) && + unsigned(DstShape::N3) == unsigned(SrcShape::N3) && + unsigned(DstShape::N4) == unsigned(SrcShape::N4) && + unsigned(DstShape::N5) == unsigned(SrcShape::N5) && + unsigned(DstShape::N6) == unsigned(SrcShape::N6) && + unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< unsigned ScalarSize , unsigned Rank , + unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 , + unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 , + typename iType > +KOKKOS_INLINE_FUNCTION +size_t dimension( + const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape , + const iType & r ) +{ + return 0 == r ? shape.N0 : ( + 1 == r ? shape.N1 : ( + 2 == r ? shape.N2 : ( + 3 == r ? shape.N3 : ( + 4 == r ? shape.N4 : ( + 5 == r ? shape.N5 : ( + 6 == r ? shape.N6 : ( + 7 == r ? 
shape.N7 : 1 ))))))); +} + +template< unsigned ScalarSize , unsigned Rank , + unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 , + unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 > +KOKKOS_INLINE_FUNCTION +size_t cardinality_count( + const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ) +{ + return shape.N0 * shape.N1 * shape.N2 * shape.N3 * + shape.N4 * shape.N5 * shape.N6 * shape.N7 ; +} + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#endif /* #ifndef KOKKOS_CORESHAPE_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp new file mode 100644 index 000000000..f1017c312 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp @@ -0,0 +1,79 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
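Kokkos_Shape.hpp above is the dimension bookkeeping behind View: static extents are encoded in the type, dynamic extents are stored as members, and the free functions dimension() and cardinality_count() read them back uniformly. A small illustrative sketch using only what is defined in that header (the include path is assumed):

  // Illustrative only: a rank-2 shape of double with one dynamic and one
  // static dimension, exercising the helpers from Kokkos_Shape.hpp.
  #include <impl/Kokkos_Shape.hpp>

  void shape_demo()
  {
    // ScalarSize = sizeof(double), rank = 2, N0 dynamic (0), N1 static (8).
    typedef Kokkos::Impl::Shape< sizeof(double) , 2 , 0 , 8 > shape_type ;

    shape_type s ;
    shape_type::assign( s , 100 );   // set the dynamic extent N0 = 100

    const size_t n0    = Kokkos::Impl::dimension( s , 0 );        // 100
    const size_t n1    = Kokkos::Impl::dimension( s , 1 );        // 8
    const size_t total = Kokkos::Impl::cardinality_count( s );    // 800

    (void) n0; (void) n1; (void) total;
  }

The rank-0 through rank-8 specializations exist so that a Shape object stores exactly one unsigned per dynamic dimension and nothing else.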
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_STATICASSERT_HPP +#define KOKKOS_STATICASSERT_HPP + +namespace Kokkos { +namespace Impl { + +template < bool , class T = void > +struct StaticAssert ; + +template< class T > +struct StaticAssert< true , T > { + typedef T type ; + static const bool value = true ; +}; + +template < class A , class B > +struct StaticAssertSame ; + +template < class A > +struct StaticAssertSame<A,A> { typedef A type ; }; + +template < class A , class B > +struct StaticAssertAssignable ; + +template < class A > +struct StaticAssertAssignable<A,A> { typedef A type ; }; + +template < class A > +struct StaticAssertAssignable< const A , A > { typedef const A type ; }; + +} // namespace Impl +} // namespace Kokkos + +#endif /* KOKKOS_STATICASSERT_HPP */ + + diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp new file mode 100644 index 000000000..700653b0d --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp @@ -0,0 +1,115 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
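Kokkos_StaticAssert.hpp above is the pre-C++11 stand-in for static_assert: the primary templates are only declared, and only the "true", "same type", and "assignable" cases are defined, so instantiating one with a failing condition simply does not compile. An illustrative sketch:

  // Illustrative only: compile-time checks with the helpers added in
  // Kokkos_StaticAssert.hpp.  Each typedef compiles only if the asserted
  // condition holds; otherwise the template has no definition.
  #include <impl/Kokkos_StaticAssert.hpp>

  typedef Kokkos::Impl::StaticAssert< (sizeof(long) >= sizeof(int)) >::type ok1 ;
  typedef Kokkos::Impl::StaticAssertSame< unsigned , unsigned >::type       ok2 ;

  // StaticAssertAssignable<const int, int>::type exists (const from
  // non-const), while StaticAssertAssignable<int, const int> is left
  // undefined on purpose.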
Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/

+#ifndef KOKKOS_IMPLWALLTIME_HPP
+#define KOKKOS_IMPLWALLTIME_HPP
+
+#include <stddef.h>
+
+#ifdef _MSC_VER
+#undef KOKKOS_USE_LIBRT
+#include <gettimeofday.c>
+#else
+#ifdef KOKKOS_USE_LIBRT
+#include <ctime>
+#else
+#include <sys/time.h>
+#endif
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Time since construction */
+
+class Timer {
+private:
+  #ifdef KOKKOS_USE_LIBRT
+    struct timespec m_old;
+  #else
+    struct timeval m_old ;
+  #endif
+  Timer( const Timer & );
+  Timer & operator = ( const Timer & );
+public:
+
+  inline
+  void reset() {
+    #ifdef KOKKOS_USE_LIBRT
+      clock_gettime( CLOCK_REALTIME , & m_old );
+    #else
+      gettimeofday( & m_old , ((struct timezone *) NULL ) );
+    #endif
+  }
+
+  inline
+  ~Timer() {}
+
+  inline
+  Timer() { reset(); }
+
+  inline
+  double seconds() const
+  {
+    #ifdef KOKKOS_USE_LIBRT
+      struct timespec m_new;
+      clock_gettime( CLOCK_REALTIME , & m_new );
+
+      return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
+    #else
+      struct timeval m_new ;
+
+      ::gettimeofday( & m_new , ((struct timezone *) NULL ) );
+
+      return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
+    #endif
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
new file mode 100644
index 000000000..64dbfc0da
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -0,0 +1,293 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
+//              Copyright (2012) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H.
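Impl::Timer above is a minimal wall-clock helper: it samples the time at construction (via clock_gettime() when KOKKOS_USE_LIBRT is defined, otherwise gettimeofday()), and seconds() returns the elapsed time since construction or the last reset(). A typical, illustrative use:

  // Illustrative only: wall-clock timing with the Impl::Timer added above.
  #include <impl/Kokkos_Timer.hpp>
  #include <cstdio>

  void time_something()
  {
    Kokkos::Impl::Timer timer ;               // starts timing at construction

    /* ... work to be measured ... */

    const double elapsed = timer.seconds() ;  // seconds since construction
    std::printf( "elapsed = %g s\n" , elapsed );

    timer.reset() ;                           // restart the clock if needed
  }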
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSTRAITS_HPP +#define KOKKOSTRAITS_HPP + +#include <stddef.h> +#include <Kokkos_Macros.hpp> +#include <stdint.h> + +namespace Kokkos { +namespace Impl { + +/* C++11 conformal compile-time type traits utilities. + * Prefer to use C++11 when portably available. + */ +//---------------------------------------------------------------------------- +// C++11 Helpers: + +template < class T , T v > +struct integral_constant +{ + // Declaration of 'static const' causes an unresolved linker symbol in debug + // static const T value = v ; + enum { value = T(v) }; + typedef T value_type; + typedef integral_constant<T,v> type; + KOKKOS_INLINE_FUNCTION operator T() { return v ; } +}; + +typedef integral_constant<bool,false> false_type ; +typedef integral_constant<bool,true> true_type ; + +//---------------------------------------------------------------------------- +// C++11 Type relationships: + +template< class X , class Y > struct is_same : public false_type {}; +template< class X > struct is_same<X,X> : public true_type {}; + +//---------------------------------------------------------------------------- +// C++11 Type properties: + +template <typename T> struct is_const : public false_type {}; +template <typename T> struct is_const<const T> : public true_type {}; +template <typename T> struct is_const<const T & > : public true_type {}; + +//---------------------------------------------------------------------------- +// C++11 Type transformations: + +template <typename T> struct remove_const { typedef T type; }; +template <typename T> struct remove_const<const T> { typedef T type; }; +template <typename T> struct remove_const<const T & > { typedef T & type; }; + +template <typename T> struct add_const { typedef const T type; }; +template <typename T> struct add_const<T & > { typedef const T & type; }; +template <typename T> struct add_const<const T> { typedef const T type; }; +template <typename T> struct add_const<const T & > { typedef const T & type; }; + +template<typename T> struct remove_reference { typedef T type ; }; +template<typename T> struct remove_reference< T & > { typedef T type ; }; +template<typename T> struct remove_reference< const T & > { typedef const T type ; }; + +//---------------------------------------------------------------------------- +// C++11 Other type generators: + +template< bool , class T , class F > +struct condition { typedef F type ; }; + +template< class T , class F > +struct condition<true,T,F> { typedef T type ; }; + +template< bool , class = void > +struct enable_if ; + +template< class T > +struct enable_if< true , T > { typedef T type ; }; + +//---------------------------------------------------------------------------- + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- +// Other traits + +namespace Kokkos { +namespace Impl { + +//---------------------------------------------------------------------------- + +template< class , class T = void > +struct enable_if_type { typedef T type ; }; + +//---------------------------------------------------------------------------- + +template< bool B > +struct bool_ : public integral_constant<bool,B> {}; + +template< unsigned I > +struct unsigned_ : public integral_constant<unsigned,I> {}; + +template< int I > +struct 
int_ : public integral_constant<int,I> {}; + +typedef bool_<true> true_; +typedef bool_<false> false_; +//---------------------------------------------------------------------------- +// if_ + +template < bool Cond , typename TrueType , typename FalseType> +struct if_c +{ + enum { value = Cond }; + + typedef FalseType type; + + + typedef typename remove_const< + typename remove_reference<type>::type >::type value_type ; + + typedef typename add_const<value_type>::type const_value_type ; + + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const_value_type & v ) { return v ; } + + static KOKKOS_INLINE_FUNCTION + value_type & select( value_type & v ) { return v ; } + + template< class T > + static KOKKOS_INLINE_FUNCTION + value_type & select( const T & ) { value_type * ptr(0); return *ptr ; } + + + template< class T > + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const T & , const_value_type & v ) { return v ; } + + template< class T > + static KOKKOS_INLINE_FUNCTION + value_type & select( const T & , value_type & v ) { return v ; } +}; + +template <typename TrueType, typename FalseType> +struct if_c< true , TrueType , FalseType > +{ + enum { value = true }; + + typedef TrueType type; + + + typedef typename remove_const< + typename remove_reference<type>::type >::type value_type ; + + typedef typename add_const<value_type>::type const_value_type ; + + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const_value_type & v ) { return v ; } + + static KOKKOS_INLINE_FUNCTION + value_type & select( value_type & v ) { return v ; } + + template< class T > + static KOKKOS_INLINE_FUNCTION + value_type & select( const T & ) { value_type * ptr(0); return *ptr ; } + + + template< class F > + static KOKKOS_INLINE_FUNCTION + const_value_type & select( const_value_type & v , const F & ) { return v ; } + + template< class F > + static KOKKOS_INLINE_FUNCTION + value_type & select( value_type & v , const F & ) { return v ; } +}; + + +template <typename Cond, typename TrueType, typename FalseType> +struct if_ : public if_c<Cond::value, TrueType, FalseType> {}; + +//---------------------------------------------------------------------------- + +template <size_t N> +struct is_power_of_two +{ + enum type { value = (N > 0) && !(N & (N-1)) }; +}; + +template < size_t N , bool OK = is_power_of_two<N>::value > +struct power_of_two ; + +template < size_t N > +struct power_of_two<N,true> +{ + enum type { value = 1+ power_of_two<(N>>1),true>::value }; +}; + +template <> +struct power_of_two<2,true> +{ + enum type { value = 1 }; +}; + +template <> +struct power_of_two<1,true> +{ + enum type { value = 0 }; +}; + +//---------------------------------------------------------------------------- + +template< typename T , T v , bool NonZero = ( v != T(0) ) > +struct integral_nonzero_constant +{ + // Declaration of 'static const' causes an unresolved linker symbol in debug + // static const T value = v ; + enum { value = T(v) }; + typedef T value_type ; + typedef integral_nonzero_constant<T,v> type ; + KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {} +}; + +template< typename T , T zero > +struct integral_nonzero_constant<T,zero,false> +{ + const T value ; + typedef T value_type ; + typedef integral_nonzero_constant<T,0> type ; + KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & v ) : value(v) {} +}; + +//---------------------------------------------------------------------------- + +template <typename T> struct is_integral : public false_ {}; + +template <> struct 
is_integral<int8_t> : public true_ {}; +template <> struct is_integral<int16_t> : public true_ {}; +template <> struct is_integral<int32_t> : public true_ {}; +template <> struct is_integral<int64_t> : public true_ {}; + +template <> struct is_integral<uint8_t> : public true_ {}; +template <> struct is_integral<uint16_t> : public true_ {}; +template <> struct is_integral<uint32_t> : public true_ {}; +template <> struct is_integral<uint64_t> : public true_ {}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOSTRAITS_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp new file mode 100644 index 000000000..1f897e9c8 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp @@ -0,0 +1,2012 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
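Kokkos_Traits.hpp above back-fills a small subset of C++11 <type_traits> (integral_constant, is_same, enable_if, remove_const, and so on) for C++03 compilers, plus a few integer utilities used elsewhere in the library. An illustrative sketch of the compile-time machinery; all names come from the header above and evaluate entirely at compile time:

  // Illustrative only: the C++03 trait helpers added in Kokkos_Traits.hpp.
  #include <impl/Kokkos_Traits.hpp>

  // if_c selects a type at compile time, much like std::conditional.
  typedef Kokkos::Impl::if_c< (sizeof(void*) == 8) , long , int >::type machine_int ;

  // is_power_of_two / power_of_two are compile-time integer utilities:
  // 256 is a power of two and 256 == 1 << 8.
  enum { is_pow2  = Kokkos::Impl::is_power_of_two<256>::value };  // 1
  enum { log2_256 = Kokkos::Impl::power_of_two<256>::value };     // 8

  // is_same mirrors std::is_same.
  enum { same = Kokkos::Impl::is_same< machine_int , machine_int >::value };  // 1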
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWDEFAULT_HPP +#define KOKKOS_VIEWDEFAULT_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template<> +struct ViewAssignment< ViewDefault , ViewDefault , void > +{ + typedef ViewDefault Specialize ; + + //------------------------------------ + /** \brief Compatible value and shape */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if<( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::value + || + ( ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value + && + ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type , + typename ViewTraits<ST,SL,SD,SM>::shape_type >::value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutStride>::value ) + )>::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_offset_map.assign( src.m_offset_map ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device ; + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-1 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 1 ) + ), unsigned >::type i0 ) + { + assert_shape_bounds( src.m_offset_map , 1 , i0 ); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + i0 ; + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-2 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 2 ) + ), unsigned >::type i0 , + const unsigned i1 ) + { + assert_shape_bounds( src.m_offset_map , 2 , i0 , i1 ); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-3 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< 
ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 3 ) + ), unsigned >::type i0 , + const unsigned i1 , + const unsigned i2 ) + { + assert_shape_bounds( src.m_offset_map, 3, i0, i1, i2 ); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1,i2); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-4 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 4 ) + ), unsigned >::type i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 ) + { + assert_shape_bounds( src.m_offset_map, 4, i0, i1, i2, i3 ); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1,i2,i3); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-5 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 5 ) + ), unsigned >::type i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 ) + { + assert_shape_bounds( src.m_offset_map, 5, i0, i1, i2, i3, i4); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1,i2,i3,i4); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-6 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 6 ) + ), unsigned >::type i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 , + const unsigned i5 ) + { + assert_shape_bounds( src.m_offset_map, 6, i0, i1, i2, i3, i4, i5); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1,i2,i3,i4,i5); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-7 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + 
ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 7 ) + ), unsigned >::type i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 , + const unsigned i5 , + const unsigned i6 ) + { + assert_shape_bounds( src.m_offset_map, 7, i0, i1, i2, i3, i4, i5, i6 ); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1,i2,i3,i4,i5,i6); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-0 from Rank-8 */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , + ViewTraits<ST,SL,SD,SM> >::assignable_value && + ( ViewTraits<DT,DL,DD,DM>::rank == 0 ) && + ( ViewTraits<ST,SL,SD,SM>::rank == 8 ) + ), unsigned >::type i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 , + const unsigned i5 , + const unsigned i6 , + const unsigned i7 ) + { + assert_shape_bounds( src.m_offset_map, 8, i0, i1, i2, i3, i4, i5, i6, i7 ); + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-1 array from range of Rank-1 array, either layout */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM , + typename iType > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const std::pair<iType,iType> & range , + typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 1 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 1 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 1 ) + ) >::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_offset_map.N0 = 0 ; + dst.m_ptr_on_device = 0 ; + + if ( range.first < range.second ) { + assert_shape_bounds( src.m_offset_map , 1 , range.first ); + assert_shape_bounds( src.m_offset_map , 1 , range.second - 1 ); + + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.N0 = range.second - range.first ; + dst.m_ptr_on_device = src.m_ptr_on_device + range.first ; + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + } + + //------------------------------------ + /** \brief Extract Rank-1 array from LayoutLeft Rank-2 array. 
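+   *
+   *  Usage sketch (editorial illustration, not part of the original patch):
+   *  because LayoutLeft is column-major, fixing the second index selects a
+   *  contiguous column, so the result can be an ordinary rank-1 view over the
+   *  same allocation.  "Device" below is a placeholder execution/memory space;
+   *  user code normally reaches this overload through Kokkos::subview or the
+   *  View subview constructors rather than invoking ViewAssignment directly.
+   *
+   *  \code
+   *  Kokkos::View<double**, Kokkos::LayoutLeft, Device> a( "a" , 100 , 8 );
+   *  Kokkos::View<double*,  Kokkos::LayoutLeft, Device> col ;
+   *
+   *  // col(i) aliases a(i,3); col.dimension_0() == 100 and the data is shared
+   *  Kokkos::Impl::ViewAssignment< Kokkos::Impl::ViewDefault ,
+   *                                Kokkos::Impl::ViewDefault >( col , a , Kokkos::ALL() , 3 );
+   *  \endcode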
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutLeft >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 1 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 1 ) + ), unsigned >::type i1 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.N0 = src.m_offset_map.N0 ; + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(0,i1); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-1 array from LayoutRight Rank-2 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 1 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 1 ) + ), ALL >::type & ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.N0 = src.m_offset_map.N1 ; + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutLeft Rank-2 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutLeft >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), unsigned >::type i1 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.N0 = src.m_offset_map.N0 ; + dst.m_offset_map.N1 = 1 ; + dst.m_offset_map.S0 = src.m_offset_map.S0 ; + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(0,i1); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-2 array. 
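+   *
+   *  Editorial note: unlike the rank-1 extractions above, this overload keeps
+   *  the result at rank 2.  Fixing i0 of a LayoutRight matrix yields a 1 x N1
+   *  row block (dst.N0 == 1, dst.N1 == src.N1) that preserves the source row
+   *  stride SR, so rank-2 code can operate on it unchanged.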
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.N0 = 1 ; + dst.m_offset_map.N1 = src.m_offset_map.N1 ; + dst.m_offset_map.SR = src.m_offset_map.SR ; + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(i0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + //------------------------------------ + /** \brief Extract LayoutRight Rank-N array from range of LayoutRight Rank-N array */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM , + typename iType > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const std::pair<iType,iType> & range , + typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value + && + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank > 1 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic > 0 ) + )>::type * = 0 ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + //typedef typename traits_type::shape_type shape_type ; // unused + //typedef typename View<DT,DL,DD,DM,Specialize>::stride_type stride_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_offset_map.assign( 0, 0, 0, 0, 0, 0, 0, 0 ); + + dst.m_ptr_on_device = 0 ; + + if ( ( range.first == range.second ) || + ( (src.capacity()==0u) && (range.second<src.m_offset_map.N0) )) { + dst.m_offset_map.assign( 0 , src.m_offset_map.N1 , src.m_offset_map.N2 , src.m_offset_map.N3 , + src.m_offset_map.N4 , src.m_offset_map.N5 , src.m_offset_map.N6 , src.m_offset_map.N7 ); + dst.m_offset_map.SR = src.m_offset_map.SR ; + } + else if ( (range.first < range.second) ) { + assert_shape_bounds( src.m_offset_map , 8 , range.first , 0,0,0,0,0,0,0); + assert_shape_bounds( src.m_offset_map , 8 , range.second - 1 , 0,0,0,0,0,0,0); + + dst.m_offset_map.assign( range.second - range.first + , src.m_offset_map.N1 , src.m_offset_map.N2 , src.m_offset_map.N3 + , src.m_offset_map.N4 , src.m_offset_map.N5 , src.m_offset_map.N6 , src.m_offset_map.N7 ); + + dst.m_offset_map.SR = src.m_offset_map.SR ; + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + range.first * src.m_offset_map.SR ; + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + } + + //------------------------------------ + /** \brief Extract rank-2 from rank-2 array */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM , + typename iType0 , typename iType1 > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const std::pair<iType0,iType0> & range0 , + const std::pair<iType1,iType1> & range1 , + typename enable_if< ( + ViewAssignable< 
ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value + && + ViewTraits<DT,DL,DD,DM>::rank == 2 + && + ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 + ) >::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_offset_map.assign(0,0,0,0, 0,0,0,0); + dst.m_ptr_on_device = 0 ; + + if ( (range0.first == range0.second) || + (range1.first == range1.second) || + ( ( src.capacity() == 0u ) && + ( long(range0.second) < long(src.m_offset_map.N0) ) && + ( long(range1.second) < long(src.m_offset_map.N1) ) ) ) { + + dst.m_offset_map.assign( src.m_offset_map ); + dst.m_offset_map.N0 = range0.second - range0.first ; + dst.m_offset_map.N1 = range1.second - range1.first ; + } + else if ( (range0.first < range0.second && range1.first < range1.second) ) { + + assert_shape_bounds( src.m_offset_map , 2 , range0.first , range1.first ); + assert_shape_bounds( src.m_offset_map , 2 , range0.second - 1 , range1.second - 1 ); + + dst.m_offset_map.assign( src.m_offset_map ); + dst.m_offset_map.N0 = range0.second - range0.first ; + dst.m_offset_map.N1 = range1.second - range1.first ; + + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(range0.first,range1.first); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + } + + //------------------------------------ + /** \brief Extract rank-2 from rank-2 array */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM , + typename iType > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + ALL , + const std::pair<iType,iType> & range1 , + typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value + && + ViewTraits<DT,DL,DD,DM>::rank == 2 + && + ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 + ) >::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_offset_map.assign(0,0,0,0, 0,0,0,0); + dst.m_ptr_on_device = 0 ; + + if ( (range1.first == range1.second) || ( (src.capacity()==0) && (range1.second<src.m_offset_map.N1) )) { + dst.m_offset_map.assign(src.m_offset_map); + dst.m_offset_map.N1 = range1.second - range1.first ; + } + else if ( (range1.first < range1.second) ) { + assert_shape_bounds( src.m_offset_map , 2 , 0 , range1.first ); + assert_shape_bounds( src.m_offset_map , 2 , src.m_offset_map.N0 - 1 , range1.second - 1 ); + + dst.m_offset_map.assign(src.m_offset_map); + dst.m_offset_map.N1 = range1.second - range1.first ; + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(0,range1.first); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + } + + //------------------------------------ + /** \brief Extract rank-2 from rank-2 array */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM , + typename iType > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const std::pair<iType,iType> & range0 , + ALL , + typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value + && + ViewTraits<DT,DL,DD,DM>::rank == 2 + && + ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 + ) >::type * = 0 ) + { + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_offset_map.assign(0,0,0,0, 0,0,0,0); + dst.m_ptr_on_device = 0 ; + + if ( (range0.first == range0.second) || ( (src.capacity()==0) && (range0.second<src.m_offset_map.N0) )) 
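+    // Editorial note: this branch handles the degenerate cases (an empty
+    // range, or a source view without an allocation).  The destination's
+    // extent in the sliced dimension is set to the range length (zero for an
+    // empty range), but the data pointer stays null and no reference is taken
+    // on the source allocation.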
{ + dst.m_offset_map.assign(src.m_offset_map); + dst.m_offset_map.N0 = range0.second - range0.first ; + } + else if ( (range0.first < range0.second) ) { + assert_shape_bounds( src.m_offset_map , 2 , range0.first , 0 ); + assert_shape_bounds( src.m_offset_map , 2 , range0.second - 1 , src.m_offset_map.N1 - 1 ); + + dst.m_offset_map.assign(src.m_offset_map); + dst.m_offset_map.N0 = range0.second - range0.first ; + dst.m_tracking = src.m_tracking ; + + dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map(range0.first,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + } + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-3 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 3 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N1 ; + dst.m_shape.N1 = src.m_shape.N2 ; + dst.m_stride.value = dst.m_shape.N1 ; + dst.m_ptr_on_device = &src(i0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-4 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 4 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N2 ; + dst.m_shape.N1 = src.m_shape.N3 ; + dst.m_stride.value = dst.m_shape.N1 ; + dst.m_ptr_on_device = &src(i0,i1,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-5 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 5 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N3 ; + dst.m_shape.N1 = src.m_shape.N4 ; + dst.m_stride.value = dst.m_shape.N1 ; + dst.m_ptr_on_device = &src(i0,i1,i2,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-6 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 6 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N4 ; + dst.m_shape.N1 = src.m_shape.N5 ; + dst.m_stride.value = dst.m_shape.N1 ; + dst.m_ptr_on_device = &src(i0,i1,i2,i3,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-7 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 7 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N5 ; + dst.m_shape.N1 = src.m_shape.N6 ; + dst.m_stride.value = dst.m_shape.N1 ; + dst.m_ptr_on_device = &src(i0,i1,i2,i3,i4,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + + //------------------------------------ + /** \brief Extract Rank-2 array from LayoutRight Rank-8 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 , + const unsigned i5 , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 8 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 2 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 2 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N6 ; + dst.m_shape.N1 = src.m_shape.N7 ; + dst.m_stride.value = dst.m_shape.N1 ; + dst.m_ptr_on_device = &src(i0,i1,i2,i3,i4,i5,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-3 array from LayoutRight Rank-4 array. 
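+   *
+   *  Usage sketch (editorial illustration): with a row-major (LayoutRight)
+   *  rank-4 view, fixing only the leading index selects a contiguous rank-3
+   *  slab, so the remaining extents simply shift down by one position.
+   *  "Device" is a placeholder space; the usual public entry point is
+   *  Kokkos::subview rather than a direct ViewAssignment call.
+   *
+   *  \code
+   *  Kokkos::View<double****, Kokkos::LayoutRight, Device> a( "a" , 10 , 4 , 5 , 6 );
+   *  Kokkos::View<double***,  Kokkos::LayoutRight, Device> slab ;
+   *
+   *  // slab(j0,j1,j2) aliases a(7,j0,j1,j2); slab extents become (4,5,6)
+   *  Kokkos::Impl::ViewAssignment< Kokkos::Impl::ViewDefault ,
+   *                                Kokkos::Impl::ViewDefault >( slab , a , 7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() );
+   *  \endcode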
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 4 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 3 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 3 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N1 ; + dst.m_shape.N1 = src.m_shape.N2 ; + dst.m_shape.N2 = src.m_shape.N3 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 ; + dst.m_ptr_on_device = &src(i0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-3 array from LayoutRight Rank-5 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 5 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 3 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 3 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N2 ; + dst.m_shape.N1 = src.m_shape.N3 ; + dst.m_shape.N2 = src.m_shape.N4 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 ; + dst.m_ptr_on_device = &src(i0,i1,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-3 array from LayoutRight Rank-6 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 6 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 3 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 3 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N3 ; + dst.m_shape.N1 = src.m_shape.N4 ; + dst.m_shape.N2 = src.m_shape.N5 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 ; + dst.m_ptr_on_device = &src(i0,i1,i2,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-3 array from LayoutRight Rank-7 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 7 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 3 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 3 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N4 ; + dst.m_shape.N1 = src.m_shape.N5 ; + dst.m_shape.N2 = src.m_shape.N6 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 ; + dst.m_ptr_on_device = &src(i0,i1,i2,i3,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-3 array from LayoutRight Rank-8 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const unsigned i4 , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 8 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 3 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 3 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N5 ; + dst.m_shape.N1 = src.m_shape.N6 ; + dst.m_shape.N2 = src.m_shape.N7 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 ; + dst.m_ptr_on_device = &src(i0,i1,i2,i3,i4,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-4 array from LayoutRight Rank-5 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 5 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 4 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 4 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N1 ; + dst.m_shape.N1 = src.m_shape.N2 ; + dst.m_shape.N2 = src.m_shape.N3 ; + dst.m_shape.N3 = src.m_shape.N4 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 ; + dst.m_ptr_on_device = &src(i0,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-4 array from LayoutRight Rank-6 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 6 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 4 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 4 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N2 ; + dst.m_shape.N1 = src.m_shape.N3 ; + dst.m_shape.N2 = src.m_shape.N4 ; + dst.m_shape.N3 = src.m_shape.N5 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 ; + dst.m_ptr_on_device = &src(i0,i1,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-4 array from LayoutRight Rank-7 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 7 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 4 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 4 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N3 ; + dst.m_shape.N1 = src.m_shape.N4 ; + dst.m_shape.N2 = src.m_shape.N5 ; + dst.m_shape.N3 = src.m_shape.N6 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 ; + dst.m_ptr_on_device = &src(i0,i1,i2,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-4 array from LayoutRight Rank-8 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const unsigned i3 , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 8 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 4 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 4 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N4 ; + dst.m_shape.N1 = src.m_shape.N5 ; + dst.m_shape.N2 = src.m_shape.N6 ; + dst.m_shape.N3 = src.m_shape.N7 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 ; + dst.m_ptr_on_device = &src(i0,i1,i2,i3,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-5 array from LayoutRight Rank-6 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const ALL & , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 6 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 5 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 5 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N1 ; + dst.m_shape.N1 = src.m_shape.N2 ; + dst.m_shape.N2 = src.m_shape.N3 ; + dst.m_shape.N3 = src.m_shape.N4 ; + dst.m_shape.N4 = src.m_shape.N5 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 * dst.m_shape.N4 ; + dst.m_ptr_on_device = &src(i0,0,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-5 array from LayoutRight Rank-7 array. 
*/ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const ALL & , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 7 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 5 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 5 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N2 ; + dst.m_shape.N1 = src.m_shape.N3 ; + dst.m_shape.N2 = src.m_shape.N4 ; + dst.m_shape.N3 = src.m_shape.N5 ; + dst.m_shape.N4 = src.m_shape.N6 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 * dst.m_shape.N4 ; + dst.m_ptr_on_device = &src(i0,i1,0,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Extract Rank-5 array from LayoutRight Rank-8 array. */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const unsigned i0 , + const unsigned i1 , + const unsigned i2 , + const ALL & , + const ALL & , + const ALL & , + const ALL & , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout , LayoutRight >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 8 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank == 5 ) + && + ( ViewTraits<DT,DL,DD,DM>::rank_dynamic == 5 ) + ), ALL >::type & ) + { + //typedef ViewTraits<DT,DL,DD,DM> traits_type ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + dst.m_shape.N0 = src.m_shape.N3 ; + dst.m_shape.N1 = src.m_shape.N4 ; + dst.m_shape.N2 = src.m_shape.N5 ; + dst.m_shape.N3 = src.m_shape.N6 ; + dst.m_shape.N4 = src.m_shape.N7 ; + dst.m_stride.value = dst.m_shape.N1 * dst.m_shape.N2 * + dst.m_shape.N3 * dst.m_shape.N4 ; + dst.m_ptr_on_device = &src(i0,i1,i2,0,0,0,0,0); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 1 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 1 }; + + size_t str[2] = {0,0}; + + src.m_offset_map.stride( str ); + + const size_t offset = ViewOffsetRange< Type0 >::begin( arg0 ) * str[0] ; + + LayoutStride spec ; + + // Collapse dimension for non-ranges + if ( ViewOffsetRange< Type0 >::is_range ) { + spec.dimension[0] = ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ); + spec.stride[0] = str[0] ; + } + else { + spec.dimension[0] = 1 ; + spec.stride[0] = 1 ; + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 2 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 2 }; + + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + }; + + const unsigned begin[ src_rank ] = + { static_cast<unsigned>(ViewOffsetRange< Type0 >::begin( arg0 )) + , static_cast<unsigned>(ViewOffsetRange< Type1 >::begin( arg1 )) + }; + + size_t stride[9] ; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + spec.dimension[0] = ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ); + spec.dimension[1] = ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 ); + spec.stride[0] = stride[0] ; + spec.stride[1] = stride[1] ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = spec.dimension[i] ; + spec.stride[j] = spec.stride[i] ; + offset += begin[i] * spec.stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + , class Type2 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const Type2 & arg2 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 3 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type2 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 3 }; + + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + , ViewOffsetRange< Type2 >::is_range + }; + + const unsigned begin[ src_rank ] = + { ViewOffsetRange< Type0 >::begin( arg0 ) + , ViewOffsetRange< Type1 >::begin( arg1 ) + , ViewOffsetRange< Type2 >::begin( arg2 ) + }; + + unsigned dim[ src_rank ] = + { ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ) + , ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 ) + , ViewOffsetRange< Type2 >::dimension( src.m_offset_map.N2 , arg2 ) + }; + + size_t stride[9] = {0,0,0,0,0,0,0,0,0}; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = dim[i] ; + spec.stride[j] = stride[i] ; + offset += begin[i] * stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + , class Type2 + , class Type3 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const Type2 & arg2 , + const Type3 & arg3 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 4 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type2 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type3 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 4 }; + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + , ViewOffsetRange< Type2 >::is_range + , ViewOffsetRange< Type3 >::is_range + }; + + const unsigned begin[ src_rank ] = + { static_cast<unsigned>(ViewOffsetRange< Type0 >::begin( arg0 )) + , static_cast<unsigned>(ViewOffsetRange< Type1 >::begin( arg1 )) + , static_cast<unsigned>(ViewOffsetRange< Type2 >::begin( arg2 )) + , static_cast<unsigned>(ViewOffsetRange< Type3 >::begin( arg3 )) + }; + + unsigned dim[ src_rank ] = + { static_cast<unsigned>(ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 )) + , static_cast<unsigned>(ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 )) + , static_cast<unsigned>(ViewOffsetRange< Type2 >::dimension( src.m_offset_map.N2 , arg2 )) + , static_cast<unsigned>(ViewOffsetRange< Type3 >::dimension( src.m_offset_map.N3 , arg3 )) + }; + + size_t stride[9] = {0,0,0,0,0,0,0,0,0}; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = dim[i] ; + spec.stride[j] = stride[i] ; + offset += begin[i] * stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + , class Type2 + , class Type3 + , class Type4 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const Type2 & arg2 , + const Type3 & arg3 , + const Type4 & arg4 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 5 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type2 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type3 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type4 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 5 }; + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + , ViewOffsetRange< Type2 >::is_range + , ViewOffsetRange< Type3 >::is_range + , ViewOffsetRange< Type4 >::is_range + }; + + const unsigned begin[ src_rank ] = + { ViewOffsetRange< Type0 >::begin( arg0 ) + , ViewOffsetRange< Type1 >::begin( arg1 ) + , ViewOffsetRange< Type2 >::begin( arg2 ) + , ViewOffsetRange< Type3 >::begin( arg3 ) + , ViewOffsetRange< Type4 >::begin( arg4 ) + }; + + unsigned dim[ src_rank ] = + { ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ) + , ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 ) + , ViewOffsetRange< Type2 >::dimension( src.m_offset_map.N2 , arg2 ) + , ViewOffsetRange< Type3 >::dimension( src.m_offset_map.N3 , arg3 ) + , ViewOffsetRange< Type4 >::dimension( src.m_offset_map.N4 , arg4 ) + }; + + size_t stride[9] = {0,0,0,0,0,0,0,0,0}; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = dim[i] ; + spec.stride[j] = stride[i] ; + offset += begin[i] * stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + , class Type2 + , class Type3 + , class Type4 + , class Type5 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const Type2 & arg2 , + const Type3 & arg3 , + const Type4 & arg4 , + const Type5 & arg5 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 6 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type2 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type3 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type4 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type5 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 6 }; + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + , ViewOffsetRange< Type2 >::is_range + , ViewOffsetRange< Type3 >::is_range + , ViewOffsetRange< Type4 >::is_range + , ViewOffsetRange< Type5 >::is_range + }; + + const unsigned begin[ src_rank ] = + { ViewOffsetRange< Type0 >::begin( arg0 ) + , ViewOffsetRange< Type1 >::begin( arg1 ) + , ViewOffsetRange< Type2 >::begin( arg2 ) + , ViewOffsetRange< Type3 >::begin( arg3 ) + , ViewOffsetRange< Type4 >::begin( arg4 ) + , ViewOffsetRange< Type5 >::begin( arg5 ) + }; + + unsigned dim[ src_rank ] = + { ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ) + , ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 ) + , ViewOffsetRange< Type2 >::dimension( src.m_offset_map.N2 , arg2 ) + , ViewOffsetRange< Type3 >::dimension( src.m_offset_map.N3 , arg3 ) + , ViewOffsetRange< Type4 >::dimension( src.m_offset_map.N4 , arg4 ) + , ViewOffsetRange< Type5 >::dimension( src.m_offset_map.N5 , arg5 ) + }; + + size_t stride[9] = {0,0,0,0,0,0,0,0,0}; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = dim[i] ; + spec.stride[j] = stride[i] ; + offset += begin[i] * stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + , class Type2 + , class Type3 + , class Type4 + , class Type5 + , class Type6 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const Type2 & arg2 , + const Type3 & arg3 , + const Type4 & arg4 , + const Type5 & arg5 , + const Type6 & arg6 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 7 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type2 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type3 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type4 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type5 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type6 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 7 }; + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + , ViewOffsetRange< Type2 >::is_range + , ViewOffsetRange< Type3 >::is_range + , ViewOffsetRange< Type4 >::is_range + , ViewOffsetRange< Type5 >::is_range + , ViewOffsetRange< Type6 >::is_range + }; + + const unsigned begin[ src_rank ] = + { ViewOffsetRange< Type0 >::begin( arg0 ) + , ViewOffsetRange< Type1 >::begin( arg1 ) + , ViewOffsetRange< Type2 >::begin( arg2 ) + , ViewOffsetRange< Type3 >::begin( arg3 ) + , ViewOffsetRange< Type4 >::begin( arg4 ) + , ViewOffsetRange< Type5 >::begin( arg5 ) + , ViewOffsetRange< Type6 >::begin( arg6 ) + }; + + unsigned dim[ src_rank ] = + { ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ) + , ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 ) + , ViewOffsetRange< Type2 >::dimension( src.m_offset_map.N2 , arg2 ) + , ViewOffsetRange< Type3 >::dimension( src.m_offset_map.N3 , arg3 ) + , ViewOffsetRange< Type4 >::dimension( src.m_offset_map.N4 , arg4 ) + , ViewOffsetRange< Type5 >::dimension( src.m_offset_map.N5 , arg5 ) + , ViewOffsetRange< Type6 >::dimension( src.m_offset_map.N6 , arg6 ) + }; + + size_t stride[9] = {0,0,0,0,0,0,0,0,0}; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = dim[i] ; + spec.stride[j] = stride[i] ; + offset += begin[i] * stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + dst.m_tracking = src.m_tracking ; + dst.m_offset_map.assign( spec ); + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + template< class DT , class DL , class DD , class DM + , class ST , class SL , class SD , class SM + , class Type0 + , class Type1 + , class Type2 + , class Type3 + , class Type4 + , class Type5 + , class Type6 + , class Type7 + > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const Type0 & arg0 , + const Type1 & arg1 , + const Type2 & arg2 , + const Type3 & arg3 , + const Type4 & arg4 , + const Type5 & arg5 , + const Type6 & arg6 , + const Type7 & arg7 , + const typename enable_if< ( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::assignable_value + && + is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , LayoutStride >::value + && + ( ViewTraits<ST,SL,SD,SM>::rank == 8 ) + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == + ( ViewOffsetRange< Type0 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type1 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type2 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type3 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type4 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type5 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type6 >::is_range ? 1u : 0 ) + + ( ViewOffsetRange< Type7 >::is_range ? 
1u : 0 ) ) + )>::type * = 0 ) + { + enum { src_rank = 8 }; + + const bool is_range[ src_rank ] = + { ViewOffsetRange< Type0 >::is_range + , ViewOffsetRange< Type1 >::is_range + , ViewOffsetRange< Type2 >::is_range + , ViewOffsetRange< Type3 >::is_range + , ViewOffsetRange< Type4 >::is_range + , ViewOffsetRange< Type5 >::is_range + , ViewOffsetRange< Type6 >::is_range + , ViewOffsetRange< Type7 >::is_range + }; + + const unsigned begin[ src_rank ] = + { ViewOffsetRange< Type0 >::begin( arg0 ) + , ViewOffsetRange< Type1 >::begin( arg1 ) + , ViewOffsetRange< Type2 >::begin( arg2 ) + , ViewOffsetRange< Type3 >::begin( arg3 ) + , ViewOffsetRange< Type4 >::begin( arg4 ) + , ViewOffsetRange< Type5 >::begin( arg5 ) + , ViewOffsetRange< Type6 >::begin( arg6 ) + , ViewOffsetRange< Type7 >::begin( arg7 ) + }; + + unsigned dim[ src_rank ] = + { ViewOffsetRange< Type0 >::dimension( src.m_offset_map.N0 , arg0 ) + , ViewOffsetRange< Type1 >::dimension( src.m_offset_map.N1 , arg1 ) + , ViewOffsetRange< Type2 >::dimension( src.m_offset_map.N2 , arg2 ) + , ViewOffsetRange< Type3 >::dimension( src.m_offset_map.N3 , arg3 ) + , ViewOffsetRange< Type4 >::dimension( src.m_offset_map.N4 , arg4 ) + , ViewOffsetRange< Type5 >::dimension( src.m_offset_map.N5 , arg5 ) + , ViewOffsetRange< Type6 >::dimension( src.m_offset_map.N6 , arg6 ) + , ViewOffsetRange< Type7 >::dimension( src.m_offset_map.N7 , arg7 ) + }; + + size_t stride[9] = {0,0,0,0,0,0,0,0,0}; + + src.m_offset_map.stride( stride ); + + LayoutStride spec ; + + size_t offset = 0 ; + + // Collapse dimension for non-ranges + for ( int i = 0 , j = 0 ; i < int(src_rank) ; ++i ) { + spec.dimension[j] = dim[i] ; + spec.stride[j] = stride[i] ; + offset += begin[i] * stride[i] ; + if ( is_range[i] ) { ++j ; } + } + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_tracking = src.m_tracking ; + + dst.m_offset_map.assign( spec ); + + dst.m_ptr_on_device = src.m_ptr_on_device + offset ; + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Deep copy data from compatible value type, layout, rank, and specialization. + * Check the dimensions and allocation lengths at runtime. 
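+   *
+   *  Usage sketch (editorial illustration): this is the check-and-raw-copy
+   *  that ultimately backs Kokkos::deep_copy for two views of identical value
+   *  type, layout, and rank, e.g. copying between a device view and its host
+   *  mirror.  "Device" is a placeholder space.
+   *
+   *  \code
+   *  Kokkos::View<double**, Kokkos::LayoutRight, Device> a( "a" , 100 , 8 );
+   *  Kokkos::View<double**, Kokkos::LayoutRight, Device>::HostMirror h_a =
+   *    Kokkos::create_mirror_view( a );
+   *
+   *  Kokkos::deep_copy( h_a , a );  // shapes asserted equal, then one byte-wise copy
+   *  \endcode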
+ */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + inline static + void deep_copy( const View<DT,DL,DD,DM,Specialize> & dst , + const View<ST,SL,SD,SM,Specialize> & src , + const typename Impl::enable_if<( + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::value_type , + typename ViewTraits<ST,SL,SD,SM>::non_const_value_type >::value + && + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , + typename ViewTraits<ST,SL,SD,SM>::array_layout >::value + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == unsigned(ViewTraits<ST,SL,SD,SM>::rank) ) + )>::type * = 0 ) + { + typedef typename ViewTraits<DT,DL,DD,DM>::memory_space dst_memory_space ; + typedef typename ViewTraits<ST,SL,SD,SM>::memory_space src_memory_space ; + + if ( dst.m_ptr_on_device != src.m_ptr_on_device ) { + + Impl::assert_shapes_are_equal( dst.m_offset_map , src.m_offset_map ); + + const size_t nbytes = dst.m_offset_map.scalar_size * dst.m_offset_map.capacity(); + + DeepCopy< dst_memory_space , src_memory_space >( dst.m_ptr_on_device , src.m_ptr_on_device , nbytes ); + } + } +}; + +//---------------------------------------------------------------------------- + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWDEFAULT_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp new file mode 100644 index 000000000..565f85da9 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp @@ -0,0 +1,1074 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWOFFSET_HPP +#define KOKKOS_VIEWOFFSET_HPP + +#include <Kokkos_Pair.hpp> +#include <impl/Kokkos_Traits.hpp> +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +struct ALL ; +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { namespace Impl { + +template < class ShapeType , class LayoutType , typename Enable = void> +struct ViewOffset ; + +//---------------------------------------------------------------------------- + +template< class T , unsigned R , typename Enable = void > +struct AssignViewOffsetDimension ; + +template< class T > +struct AssignViewOffsetDimension< T , 0 , typename enable_if<( 0 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N0 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 1 , typename enable_if<( 1 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N1 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 2 , typename enable_if<( 2 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N2 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 3 , typename enable_if<( 3 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N3 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 4 , typename enable_if<( 4 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N4 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 5 , typename enable_if<( 5 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N5 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 6 , typename enable_if<( 6 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N6 = n ; } }; + +template< class T > +struct AssignViewOffsetDimension< T , 7 , typename enable_if<( 7 < T::rank_dynamic )>::type > +{ AssignViewOffsetDimension( T & s , unsigned n ) { s.N7 = n ; } }; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : has padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutLeft + , typename enable_if<( 1 >= ShapeType::rank + || + 0 == ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type ; + typedef ShapeType shape_type ; + typedef LayoutLeft array_layout ; + + enum { has_padding = false }; + + 
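+  // With a fully static shape (or rank <= 1) no padding is inserted, so
+  // capacity() equals cardinality() and indexing is plain column-major.
+  // For illustration only (assuming a static 4 x 3 shape, i.e. N0 == 4,
+  // N1 == 3): element (i0,i1) maps to offset i0 + 4*i1, and stride() fills
+  // {1, 4, 12}, the last entry being the total length.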
template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( unsigned n ) + { AssignViewOffsetDimension< ViewOffset , R >( *this , n ); } + + KOKKOS_INLINE_FUNCTION + void assign( unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 + , unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 + , unsigned = 0 ) + { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( 1 == int(ShapeRHS::rank) + && + 1 == int(shape_type::rank) + && + 1 == int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + KOKKOS_INLINE_FUNCTION + void set_padding() {} + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + // Stride with [ rank ] value is the total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + s[0] = 1 ; + if ( 0 < shape_type::rank ) { s[1] = shape_type::N0 ; } + if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; } + if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; } + if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; } + if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; } + if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; } + if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; } + if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; } + } + + KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; } + KOKKOS_INLINE_FUNCTION size_type stride_1() const { return shape_type::N0 ; } + KOKKOS_INLINE_FUNCTION size_type stride_2() const { return shape_type::N0 * shape_type::N1 ; } + KOKKOS_INLINE_FUNCTION size_type stride_3() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; } + + // rank 1 + template< typename I0 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const & i0 ) const { return i0 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const & i0 , I1 const & i1 )
const + { return i0 + shape_type::N0 * i1 ; } + + //rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0 + , I1 const& i1 + , I2 const& i2 + ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * i2 ); + } + + //rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * i3 )); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * i4 ))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * i5 )))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * i6 ))))); + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7) const + { + return i0 + shape_type::N0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * ( + i6 + shape_type::N6 * i7 )))))); + } +}; + +//---------------------------------------------------------------------------- +// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutLeft + , typename enable_if<( 1 < ShapeType::rank + && + 0 < ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type ; + typedef ShapeType shape_type ; + typedef LayoutLeft array_layout ; + + enum { has_padding = true }; + + unsigned S0 ; + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( unsigned n ) + { AssignViewOffsetDimension< ViewOffset , R >( *this , n ); } + + KOKKOS_INLINE_FUNCTION + void assign( unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 + , unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 + , unsigned = 0 ) + { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); S0 = shape_type::N0 ; } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) == 0 + )>::type * = 0 ) + { + 
shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + S0 = shape_type::N0 ; + } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) > 0 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + S0 = rhs.S0 ; + } + + KOKKOS_INLINE_FUNCTION + void set_padding() + { + enum { div = MEMORY_ALIGNMENT / shape_type::scalar_size }; + enum { mod = MEMORY_ALIGNMENT % shape_type::scalar_size }; + enum { align = 0 == mod ? div : 0 }; + + if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < S0 ) { + + const unsigned count_mod = S0 % ( div ? div : 1 ); + + if ( count_mod ) { S0 += align - count_mod ; } + } + } + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + // Stride with [ rank ] as total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + s[0] = 1 ; + if ( 0 < shape_type::rank ) { s[1] = S0 ; } + if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; } + if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; } + if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; } + if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; } + if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; } + if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; } + if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; } + } + + KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; } + KOKKOS_INLINE_FUNCTION size_type stride_1() const { return S0 ; } + KOKKOS_INLINE_FUNCTION size_type stride_2() const { return S0 * shape_type::N1 ; } + KOKKOS_INLINE_FUNCTION size_type stride_3() const { return S0 * shape_type::N1 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const + { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const + { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const + { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const + { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; } + + // rank 2 + template < typename I0 , typename I1 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const & i0 , I1 const & i1) const + { return i0 + S0 * i1 ; } + + //rank 3 + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * i2 ); + } + + //rank 4 + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const + { + return i0 + S0 * ( + i1 +
shape_type::N1 * ( + i2 + shape_type::N2 * i3 )); + } + + //rank 5 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * i4 ))); + } + + //rank 6 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * i5 )))); + } + + //rank 7 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * i6 ))))); + } + + //rank 8 + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i0 + S0 * ( + i1 + shape_type::N1 * ( + i2 + shape_type::N2 * ( + i3 + shape_type::N3 * ( + i4 + shape_type::N4 * ( + i5 + shape_type::N5 * ( + i6 + shape_type::N6 * i7 )))))); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 >= rank OR 1 >= rank_dynamic ) : no padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutRight + , typename enable_if<( 1 >= ShapeType::rank + || + 1 >= ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type; + typedef ShapeType shape_type; + typedef LayoutRight array_layout ; + + enum { has_padding = false }; + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( unsigned n ) + { AssignViewOffsetDimension< ViewOffset , R >( *this , n ); } + + KOKKOS_INLINE_FUNCTION + void assign( unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 + , unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 + , unsigned = 0 ) + { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs + , typename enable_if<( 1 == int(ShapeRHS::rank) + && + 1 == int(shape_type::rank) + && + 1 == int(shape_type::rank_dynamic) + )>::type * = 0 ) + { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); } + + KOKKOS_INLINE_FUNCTION + void set_padding() {} + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * 
shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + size_type stride_R() const + { + return shape_type::N1 * shape_type::N2 * shape_type::N3 * + shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; + }; + + // Stride with [rank] as total length + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + size_type n = 1 ; + if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; } + if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; } + if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; } + if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; } + if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; } + if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; } + if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; } + if ( 0 < shape_type::rank ) { s[0] = n ; } + s[shape_type::rank] = n * shape_type::N0 ; + } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const { return 1 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const { return shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const { return shape_type::N7 * shape_type::N6 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_3() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_2() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_1() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_0() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 * shape_type::N1 ; } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1 ) const + { + return i1 + shape_type::N1 * i0 ; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 )); + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const + { + return i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 ))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const + { + return i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 )))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + 
shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 ))))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 )))))); + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i7 + shape_type::N7 * ( + i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( + i1 + shape_type::N1 * ( i0 ))))))); + } +}; + +//---------------------------------------------------------------------------- +// LayoutRight AND ( 1 < rank AND 1 < rank_dynamic ) : has padding / striding +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutRight + , typename enable_if<( 1 < ShapeType::rank + && + 1 < ShapeType::rank_dynamic + )>::type > + : public ShapeType +{ + typedef size_t size_type; + typedef ShapeType shape_type; + typedef LayoutRight array_layout ; + + enum { has_padding = true }; + + unsigned SR ; + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( unsigned n ) + { AssignViewOffsetDimension< ViewOffset , R >( *this , n ); } + + KOKKOS_INLINE_FUNCTION + void assign( unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 + , unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 + , unsigned = 0 ) + { + shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); + SR = shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; + } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) <= 1 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + SR = shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; + } + + template< class ShapeRHS > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) + && + int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic) + && + int(ShapeRHS::rank_dynamic) > 1 + )>::type * = 0 ) + { + shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + SR = rhs.SR ; + } + + KOKKOS_INLINE_FUNCTION + void set_padding() + { + enum { div = MEMORY_ALIGNMENT / shape_type::scalar_size }; + enum { mod = MEMORY_ALIGNMENT % shape_type::scalar_size }; + enum { align = 0 == mod ? div : 0 }; + + if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < SR ) { + + const unsigned count_mod = SR % ( div ? 
div : 1 ); + + if ( count_mod ) { SR += align - count_mod ; } + } + } + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const { return shape_type::N0 * SR ; } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { + size_type n = 1 ; + if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; } + if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; } + if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; } + if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; } + if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; } + if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; } + if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; } + if ( 0 < shape_type::rank ) { s[0] = SR ; } + s[shape_type::rank] = SR * shape_type::N0 ; + } + + KOKKOS_INLINE_FUNCTION + size_type stride_7() const { return 1 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_6() const { return shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_5() const { return shape_type::N7 * shape_type::N6 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_4() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_3() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_2() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_1() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; } + + KOKKOS_INLINE_FUNCTION + size_type stride_0() const { return SR ; } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1 ) const + { + return i1 + i0 * SR ; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i2 + shape_type::N2 * ( i1 ) + + i0 * SR ; + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const + { + return i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 )) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const + { + return i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 ))) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 )))) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& 
i4, I5 const& i5, I6 const& i6 ) const + { + return i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 ))))) + + i0 * SR ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i7 + shape_type::N7 * ( + i6 + shape_type::N6 * ( + i5 + shape_type::N5 * ( + i4 + shape_type::N4 * ( + i3 + shape_type::N3 * ( + i2 + shape_type::N2 * ( i1 )))))) + + i0 * SR ; + } +}; + +//---------------------------------------------------------------------------- +// LayoutStride : +template < class ShapeType > +struct ViewOffset< ShapeType , LayoutStride + , typename enable_if<( 0 < ShapeType::rank )>::type > + : public ShapeType +{ + typedef size_t size_type; + typedef ShapeType shape_type; + typedef LayoutStride array_layout ; + + size_type S[ shape_type::rank + 1 ]; + + template< unsigned R > + KOKKOS_INLINE_FUNCTION + void assign( unsigned n ) + { AssignViewOffsetDimension< ViewOffset , R >( *this , n ); } + + template< class ShapeRHS , class Layout > + KOKKOS_INLINE_FUNCTION + void assign( const ViewOffset<ShapeRHS,Layout> & rhs + , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) )>::type * = 0 ) + { + rhs.stride(S); + shape_type::assign( *this, rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); + } + + KOKKOS_INLINE_FUNCTION + void assign( const LayoutStride & layout ) + { + size_type max = 0 ; + for ( int i = 0 ; i < shape_type::rank ; ++i ) { + S[i] = layout.stride[i] ; + const size_type m = layout.dimension[i] * S[i] ; + if ( max < m ) { max = m ; } + } + S[ shape_type::rank ] = max ; + shape_type::assign( *this, layout.dimension[0], layout.dimension[1], + layout.dimension[2], layout.dimension[3], + layout.dimension[4], layout.dimension[5], + layout.dimension[6], layout.dimension[7] ); + } + + KOKKOS_INLINE_FUNCTION + void assign( size_t s0 , size_t s1 , size_t s2 , size_t s3 + , size_t s4 , size_t s5 , size_t s6 , size_t s7 + , size_t s8 ) + { + const size_t str[9] = { s0, s1, s2, s3, s4, s5, s6, s7, s8 }; + + // Last argument is the total length. + // Total length must be non-zero. + // All strides must be non-zero and less than total length. + bool ok = 0 < str[ shape_type::rank ] ; + + for ( int i = 0 ; ( i < shape_type::rank ) && + ( ok = 0 < str[i] && str[i] < str[ shape_type::rank ] ); ++i ); + + if ( ok ) { + size_t dim[8] = { 1,1,1,1,1,1,1,1 }; + int iorder[9] = { 0,0,0,0,0,0,0,0,0 }; + + // Ordering of strides smallest to largest. + for ( int i = 1 ; i < shape_type::rank ; ++i ) { + int j = i ; + for ( ; 0 < j && str[i] < str[ iorder[j-1] ] ; --j ) { + iorder[j] = iorder[j-1] ; + } + iorder[j] = i ; + } + + // Last argument is the total length. + iorder[ shape_type::rank ] = shape_type::rank ; + + // Determine dimension associated with each stride. 
+ // Guarantees non-overlap by truncating dimension + // if ( 0 != str[ iorder[i+1] ] % str[ iorder[i] ] ) + for ( int i = 0 ; i < shape_type::rank ; ++i ) { + dim[ iorder[i] ] = str[ iorder[i+1] ] / str[ iorder[i] ] ; + } + + // Assign dimensions and strides: + shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] ); + for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = str[i] ; } + } + else { + shape_type::assign(*this,0,0,0,0,0,0,0,0); + for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = 0 ; } + } + } + + KOKKOS_INLINE_FUNCTION + size_type cardinality() const + { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; } + + KOKKOS_INLINE_FUNCTION + size_type capacity() const { return S[ shape_type::rank ]; } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + void stride( iType * const s ) const + { for ( int i = 0 ; i <= shape_type::rank ; ++i ) { s[i] = S[i] ; } } + + // rank 1 + template <typename I0 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0 ) const + { + return i0 * S[0] ; + } + + // rank 2 + template <typename I0, typename I1> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1 ) const + { + return i0 * S[0] + i1 * S[1] ; + } + + template <typename I0, typename I1, typename I2> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] ; + } + + template <typename I0, typename I1, typename I2, typename I3> + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] ; + } + + template < typename I0, typename I1, typename I2, typename I3 + ,typename I4, typename I5, typename I6, typename I7 > + KOKKOS_FORCEINLINE_FUNCTION + size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const + { + return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] + i7 * S[7] ; + } +}; + +//---------------------------------------------------------------------------- + +template< class T /* assume an integral type */ > +struct ViewOffsetRange { + enum { is_range = false }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const , T const & ) { return 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( T 
const & i ) { return size_t(i) ; } +}; + +template<> +struct ViewOffsetRange<ALL> { + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , ALL const & ) { return n ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( ALL const & ) { return 0 ; } +}; + +template< typename iType > +struct ViewOffsetRange< std::pair<iType,iType> > { + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , std::pair<iType,iType> const & r ) + { return ( 0 <= int(r.first) && r.first < r.second && size_t(r.second) < n ) ? r.second - r.first : 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( std::pair<iType,iType> const & r ) { return r.first ; } +}; + +template< typename iType > +struct ViewOffsetRange< Kokkos::pair<iType,iType> > { + enum { is_range = true }; + + KOKKOS_INLINE_FUNCTION static + size_t dimension( size_t const n , Kokkos::pair<iType,iType> const & r ) + { return ( 0 <= int(r.first) && r.first < r.second && size_t(r.second) < n ) ? r.second - r.first : 0 ; } + + KOKKOS_INLINE_FUNCTION static + size_t begin( Kokkos::pair<iType,iType> const & r ) { return r.first ; } +}; + +}} // namespace Kokkos::Impl + +#endif //KOKKOS_VIEWOFFSET_HPP + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp new file mode 100644 index 000000000..522a7a7db --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp @@ -0,0 +1,317 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWSUPPORT_HPP +#define KOKKOS_VIEWSUPPORT_HPP + +#include <impl/Kokkos_Shape.hpp> + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief Evaluate if LHS = RHS view assignment is allowed. */ +template< class ViewLHS , class ViewRHS > +struct ViewAssignable +{ + // Same memory space. + // Same value type. + // Compatible 'const' qualifier + // Cannot assign managed = unmannaged + enum { assignable_value = + ( is_same< typename ViewLHS::value_type , + typename ViewRHS::value_type >::value + || + is_same< typename ViewLHS::value_type , + typename ViewRHS::const_value_type >::value ) + && + is_same< typename ViewLHS::memory_space , + typename ViewRHS::memory_space >::value + && + ( ! ( ViewLHS::is_managed && ! ViewRHS::is_managed ) ) + }; + + enum { assignable_shape = + // Compatible shape and matching layout: + ( ShapeCompatible< typename ViewLHS::shape_type , + typename ViewRHS::shape_type >::value + && + is_same< typename ViewLHS::array_layout , + typename ViewRHS::array_layout >::value ) + || + // Matching layout, same rank, and LHS dynamic rank + ( is_same< typename ViewLHS::array_layout , + typename ViewRHS::array_layout >::value + && + int(ViewLHS::rank) == int(ViewRHS::rank) + && + int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) ) + || + // Both rank-0, any shape and layout + ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 ) + || + // Both rank-1 and LHS is dynamic rank-1, any shape and layout + ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 && + int(ViewLHS::rank_dynamic) == 1 ) + }; + + enum { value = assignable_value && assignable_shape }; +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/** \brief View tracking increment/decrement only happens when + * view memory is managed and executing in the host space. + */ +template< class ViewTraits , class Enable = void > +struct ViewTracking { + KOKKOS_INLINE_FUNCTION void increment( const void * ) const {} + KOKKOS_INLINE_FUNCTION void decrement( const void * ) const {} + + KOKKOS_INLINE_FUNCTION + ViewTracking & operator = ( const ViewTracking & ) { return *this ; } + + template< class T > + KOKKOS_INLINE_FUNCTION + ViewTracking & operator = ( const ViewTracking<T> & ) { return *this ; } + + KOKKOS_INLINE_FUNCTION + ViewTracking & operator = ( const bool ) { return *this ; } + + KOKKOS_INLINE_FUNCTION + operator bool() const { return false ; } +}; + +template< class ViewTraits > +struct ViewTracking< ViewTraits , typename enable_if< ViewTraits::is_managed >::type > +{ +private: + + enum { is_host_space = is_same< HostSpace , ExecutionSpace >::value }; + + bool m_flag ; + + struct NoType {}; + +public: + + typedef typename ViewTraits::memory_space memory_space ; + + template< class T > + KOKKOS_INLINE_FUNCTION + void increment( const T * ptr + , typename enable_if<( ! is_same<T,NoType>::value && is_host_space )>::type * = 0 ) const + { if ( m_flag ) memory_space::increment( ptr ); } + + template< class T > + KOKKOS_INLINE_FUNCTION + void increment( const T * + , typename enable_if<( ! is_same<T,NoType>::value && ! 
is_host_space )>::type * = 0 ) const + {} + + template< class T > + KOKKOS_INLINE_FUNCTION + void decrement( const T * ptr + , typename enable_if<( ! is_same<T,NoType>::value && is_host_space )>::type * = 0 ) const + { if ( m_flag ) memory_space::decrement( ptr ); } + + template< class T > + KOKKOS_INLINE_FUNCTION + void decrement( const T * + , typename enable_if<( ! is_same<T,NoType>::value && ! is_host_space )>::type * = 0 ) const + {} + + KOKKOS_INLINE_FUNCTION + ViewTracking() : m_flag( true ) {} + + template< class T > + KOKKOS_INLINE_FUNCTION + ViewTracking & operator = ( const ViewTracking & rhs ) { m_flag = rhs.m_flag ; return *this ; } + + template< class T > + KOKKOS_INLINE_FUNCTION + ViewTracking & operator = ( const ViewTracking<T> & rhs ) { m_flag = rhs.operator bool(); return *this ; } + + KOKKOS_INLINE_FUNCTION + ViewTracking & operator = ( const bool rhs ) { m_flag = rhs ; return *this ; } + + KOKKOS_INLINE_FUNCTION + operator bool() const { return m_flag ; } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template< class OutputView , class InputView , unsigned Rank = OutputView::Rank > +struct ViewRemap +{ + typedef typename OutputView::device_type device_type ; + typedef typename device_type::size_type size_type ; + + const OutputView output ; + const InputView input ; + const size_type n0 ; + const size_type n1 ; + const size_type n2 ; + const size_type n3 ; + const size_type n4 ; + const size_type n5 ; + const size_type n6 ; + const size_type n7 ; + + ViewRemap( const OutputView & arg_out , const InputView & arg_in ) + : output( arg_out ), input( arg_in ) + , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) + , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) ) + , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) + , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) + , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) + , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) + , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) + , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) + { + parallel_for( n0 , *this ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type i0 ) const + { + for ( size_type i1 = 0 ; i1 < n1 ; ++i1 ) { + for ( size_type i2 = 0 ; i2 < n2 ; ++i2 ) { + for ( size_type i3 = 0 ; i3 < n3 ; ++i3 ) { + for ( size_type i4 = 0 ; i4 < n4 ; ++i4 ) { + for ( size_type i5 = 0 ; i5 < n5 ; ++i5 ) { + for ( size_type i6 = 0 ; i6 < n6 ; ++i6 ) { + for ( size_type i7 = 0 ; i7 < n7 ; ++i7 ) { + output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input.at(i0,i1,i2,i3,i4,i5,i6,i7); + }}}}}}} + } +}; + +template< class OutputView , class InputView > +struct ViewRemap< OutputView , InputView , 0 > +{ + typedef typename OutputView::value_type value_type ; + typedef typename OutputView::memory_space dst_space ; + typedef typename InputView ::memory_space src_space ; + + ViewRemap( const OutputView & arg_out , const InputView & arg_in ) + { + DeepCopy< dst_space , src_space >( arg_out.ptr_on_device() , + arg_in.ptr_on_device() , + sizeof(value_type) ); + } +}; + +//---------------------------------------------------------------------------- + 
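+// ViewFill assigns a single value to every entry of an output view via one
+// parallel_for over the leading dimension; the rank-0 specialization below
+// reduces to a single-element DeepCopy.  For illustration only (assuming
+// 'v' is a rank-1 View of double and view_type is its type), a call would
+// look like:
+//   ViewFill< view_type > fill( v , 0.0 );
+// which is the same pattern the tiled-view allocator in this patch uses to
+// default-initialize freshly allocated memory.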
+template< class OutputView , unsigned Rank = OutputView::Rank > +struct ViewFill +{ + typedef typename OutputView::device_type device_type ; + typedef typename OutputView::const_value_type const_value_type ; + typedef typename device_type::size_type size_type ; + + const OutputView output ; + const_value_type input ; + + ViewFill( const OutputView & arg_out , const_value_type & arg_in ) + : output( arg_out ), input( arg_in ) + { + parallel_for( output.dimension_0() , *this ); + device_type::fence(); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const size_type i0 ) const + { + for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) { + for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) { + for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) { + for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) { + for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) { + for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) { + for ( size_type i7 = 0 ; i7 < output.dimension_7() ; ++i7 ) { + output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input ; + }}}}}}} + } +}; + +template< class OutputView > +struct ViewFill< OutputView , 0 > +{ + typedef typename OutputView::device_type device_type ; + typedef typename OutputView::const_value_type const_value_type ; + typedef typename OutputView::memory_space dst_space ; + + ViewFill( const OutputView & arg_out , const_value_type & arg_in ) + { + DeepCopy< dst_space , dst_space >( arg_out.ptr_on_device() , & arg_in , + sizeof(const_value_type) ); + } +}; + +} // namespace Impl +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */ + + diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp new file mode 100644 index 000000000..e1a20e6be --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp @@ -0,0 +1,409 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_VIEWTILELEFT_HPP +#define KOKKOS_VIEWTILELEFT_HPP + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +struct ViewTileLeftFast ; +struct ViewTileLeftSlow ; + +template< class ValueType , unsigned N0 , unsigned N1 , bool B , class MemorySpace , class MemoryTraits > +struct ViewSpecialize< ValueType , void , + LayoutTileLeft<N0,N1,B> , + MemorySpace , MemoryTraits > +{ typedef typename if_c< B , ViewTileLeftFast , ViewTileLeftSlow >::type type ; }; + +//---------------------------------------------------------------------------- + +template<> +struct ViewAssignment< ViewTileLeftFast , void , void > +{ +private: + + template< class DT , class DL , class DD , class DM > + inline + void allocate( View<DT,DL,DD,DM,ViewTileLeftFast> & dst , const std::string label ) + { + typedef View<DT,DL,DD,DM,ViewTileLeftFast> DstViewType ; + typedef typename DstViewType::memory_space memory_space ; + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + dst.m_ptr_on_device = (typename DstViewType::value_type *) + memory_space::allocate( label , + typeid(typename DstViewType::value_type) , + sizeof(typename DstViewType::value_type) , + dst.capacity() ); + + ViewFill< DstViewType > init( dst , typename DstViewType::value_type() ); + } + +public: + + template< class DT , class DL , class DD , class DM > + inline + ViewAssignment( View<DT,DL,DD,DM,ViewTileLeftFast> & dst , + const typename enable_if< ViewTraits<DT,DL,DD,DM>::is_managed , std::string >::type & label , + const size_t n0 , + const size_t n1 , + const size_t = 0 , + const size_t = 0 , + const size_t = 0 , + const size_t = 0 , + const size_t = 0 , + const size_t = 0 ) + { + typedef View<DT,DL,DD,DM,ViewTileLeftFast> DstViewType ; + + dst.m_shape.N0 = n0 ; + dst.m_shape.N1 = n1 ; + dst.m_tile_N0 = ( n0 + DstViewType::MASK_0 ) >> DstViewType::SHIFT_0 ; + + allocate( dst , label ); + } + + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + ViewAssignment( View<DT,DL,DD,DM,ViewTileLeftFast> & dst , + const View<ST,SL,SD,SM,ViewTileLeftFast> & src , + typename enable_if< + is_same< View<DT,DL,DD,DM,ViewTileLeftFast> , + typename View<ST,SL,SD,SM,ViewTileLeftFast>::HostMirror >::value + >::type * = 0 ) + { + dst.m_shape = src.m_shape ; + dst.m_tile_N0 = src.m_tile_N0 ; + allocate( dst , "mirror" ); + } +}; + +//---------------------------------------------------------------------------- + +template<> +struct ViewAssignment< ViewTileLeftFast , ViewTileLeftFast, void > +{ + /** \brief Assign compatible views */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + 
ViewAssignment( View<DT,DL,DD,DM,ViewTileLeftFast> & dst , + const View<ST,SL,SD,SM,ViewTileLeftFast> & src , + const typename enable_if<( + ViewAssignable< ViewTraits<DT,DL,DD,DM> , ViewTraits<ST,SL,SD,SM> >::value + )>::type * = 0 ) + { + typedef View<DT,DL,DD,DM,ViewTileLeftFast> DstViewType ; + typedef typename DstViewType::shape_type shape_type ; + //typedef typename DstViewType::memory_space memory_space ; // unused + //typedef typename DstViewType::memory_traits memory_traits ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + shape_type::assign( dst.m_shape, src.m_shape.N0 , src.m_shape.N1 ); + + dst.m_tracking = src.m_tracking ; + dst.m_tile_N0 = src.m_tile_N0 ; + dst.m_ptr_on_device = src.m_ptr_on_device ; + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } + + //------------------------------------ + /** \brief Deep copy data from compatible value type, layout, rank, and specialization. + * Check the dimensions and allocation lengths at runtime. + */ + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + inline static + void deep_copy( const View<DT,DL,DD,DM,Impl::ViewTileLeftFast> & dst , + const View<ST,SL,SD,SM,Impl::ViewTileLeftFast> & src , + const typename Impl::enable_if<( + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::value_type , + typename ViewTraits<ST,SL,SD,SM>::non_const_value_type >::value + && + Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout , + typename ViewTraits<ST,SL,SD,SM>::array_layout >::value + && + ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == unsigned(ViewTraits<ST,SL,SD,SM>::rank) ) + )>::type * = 0 ) + { + typedef ViewTraits<DT,DL,DD,DM> dst_traits ; + typedef ViewTraits<ST,SL,SD,SM> src_traits ; + + if ( dst.m_ptr_on_device != src.m_ptr_on_device ) { + + Impl::assert_shapes_are_equal( dst.m_shape , src.m_shape ); + + const size_t n_dst = sizeof(typename dst_traits::value_type) * dst.capacity(); + const size_t n_src = sizeof(typename src_traits::value_type) * src.capacity(); + + Impl::assert_counts_are_equal( n_dst , n_src ); + + DeepCopy< typename dst_traits::memory_space , + typename src_traits::memory_space >( dst.m_ptr_on_device , src.m_ptr_on_device , n_dst ); + } + } +}; + +//---------------------------------------------------------------------------- + +template<> +struct ViewAssignment< ViewDefault , ViewTileLeftFast, void > +{ + /** \brief Extracting a single tile from a tiled view */ + + template< class DT , class DL , class DD , class DM , + class ST , class SL , class SD , class SM > + KOKKOS_INLINE_FUNCTION + ViewAssignment( View<DT,DL,DD,DM,ViewDefault> & dst , + const View<ST,SL,SD,SM,ViewTileLeftFast> & src , + const unsigned i0 , + const typename enable_if<( + is_same< View<DT,DL,DD,DM,ViewDefault> , + typename View<ST,SL,SD,SM,ViewTileLeftFast>::tile_type >::value + ), unsigned >::type i1 ) + { + //typedef View<DT,DL,DD,DM,ViewDefault> DstViewType ; // unused + //typedef typename DstViewType::shape_type shape_type ; // unused + //typedef typename DstViewType::memory_space memory_space ; // unused + //typedef typename DstViewType::memory_traits memory_traits ; // unused + + dst.m_tracking.decrement( dst.m_ptr_on_device ); + + enum { N0 = SL::N0 }; + enum { N1 = SL::N1 }; + enum { SHIFT_0 = power_of_two<N0>::value }; + enum { MASK_0 = N0 - 1 }; + enum { SHIFT_1 = power_of_two<N1>::value }; + + const unsigned NT0 = ( src.dimension_0() + MASK_0 ) >> SHIFT_0 ; + + dst.m_tracking = src.m_tracking ; + dst.m_ptr_on_device = src.m_ptr_on_device + (( i0 + 
i1 * NT0 ) << ( SHIFT_0 + SHIFT_1 )); + + dst.m_tracking.increment( dst.m_ptr_on_device ); + } +}; + +} /* namespace Impl */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template< class DataType , class Arg1Type , class Arg2Type , class Arg3Type > +class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewTileLeftFast > + : public ViewTraits< DataType , Arg1Type , Arg2Type , Arg3Type > +{ +private: + template< class , class , class > friend struct Impl::ViewAssignment ; + + typedef ViewTraits< DataType , Arg1Type , Arg2Type , Arg3Type > traits ; + + typedef Impl::ViewAssignment<Impl::ViewTileLeftFast> alloc ; + + typedef Impl::ViewAssignment<Impl::ViewTileLeftFast, + Impl::ViewTileLeftFast> assign ; + + typename traits::value_type * m_ptr_on_device ; + typename traits::shape_type m_shape ; + unsigned m_tile_N0 ; + Impl::ViewTracking< traits > m_tracking ; + + typedef typename traits::array_layout layout ; + + enum { SHIFT_0 = Impl::power_of_two<layout::N0>::value }; + enum { SHIFT_1 = Impl::power_of_two<layout::N1>::value }; + enum { MASK_0 = layout::N0 - 1 }; + enum { MASK_1 = layout::N1 - 1 }; + +public: + + typedef Impl::ViewTileLeftFast specialize ; + + typedef View< typename traits::const_data_type , + typename traits::array_layout , + typename traits::device_type , + typename traits::memory_traits > const_type ; + + typedef View< typename traits::non_const_data_type , + typename traits::array_layout , + typename traits::device_type::host_mirror_device_type , + void > HostMirror ; + + enum { Rank = 2 }; + + KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_shape ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_shape.N0 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_shape.N1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return 1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return 1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return 1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return 1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return 1 ; } + KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return 1 ; } + + KOKKOS_INLINE_FUNCTION + View() : m_ptr_on_device(0) {} + + KOKKOS_INLINE_FUNCTION + ~View() { m_tracking.decrement( m_ptr_on_device ); } + + KOKKOS_INLINE_FUNCTION + View( const View & rhs ) : m_ptr_on_device(0) { (void)assign( *this , rhs ); } + + KOKKOS_INLINE_FUNCTION + View & operator = ( const View & rhs ) { (void)assign( *this , rhs ); return *this ; } + + //------------------------------------ + // Array allocator and member access operator: + + View( const std::string & label , const size_t n0 , const size_t n1 ) + : m_ptr_on_device(0) { (void)alloc( *this , label , n0 , n1 ); } + + template< typename iType0 , typename iType1 > + KOKKOS_INLINE_FUNCTION + typename traits::value_type & operator()( const iType0 & i0 , const iType1 & i1 ) const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); + + // Use care to insert necessary parentheses as the + // shift operators have lower precedence than the arithmatic operators. 
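+      // For illustration (assuming 2x2 tiles, so SHIFT_0 == SHIFT_1 == 1 and
+      // MASK_0 == MASK_1 == 1, with dimension_0 == 4 giving m_tile_N0 == 2):
+      // (i0,i1) == (3,2) falls in tile (1,1); the tile offset is
+      // (1 + 2*1) << 2 == 12 and the in-tile offset is 1 + (0 << 1) == 1,
+      // so the element at flat index 13 is returned.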
+ + return m_ptr_on_device[ + // ( ( Tile offset ) * ( Tile size ) ) + + ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) + // ( Offset within tile ) + + ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ] ; + } + + //------------------------------------ + // Accept but ignore extra indices, they should be zero. + + template< typename iType0 , typename iType1 > + KOKKOS_INLINE_FUNCTION + typename traits::value_type & + at( const iType0 & i0 , const iType1 & i1 , const int , const int , + const int , const int , const int , const int ) const + { + KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); + KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); + + // Use care to insert necessary parentheses as the + // shift operators have lower precedence than the arithmatic operators. + + return m_ptr_on_device[ + // ( ( Tile offset ) * ( Tile size ) ) + + ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) + // ( Offset within tile ) + + ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ] ; + } + + //------------------------------------ + // Tile specialization specific declarations and functions: + + typedef View< typename traits::value_type [ layout::N0 ][ layout::N1 ] , + LayoutLeft , + typename traits::device_type , + MemoryUnmanaged > + tile_type ; + + KOKKOS_INLINE_FUNCTION + typename traits::value_type * ptr_on_device() const { return m_ptr_on_device ; } + + KOKKOS_INLINE_FUNCTION + size_t tiles_in_dimension_0() const { return m_tile_N0 ; } + + KOKKOS_INLINE_FUNCTION + size_t tiles_in_dimension_1() const { return ( m_shape.N1 + MASK_1 ) >> SHIFT_1 ; } + + + template< typename iType > + KOKKOS_INLINE_FUNCTION + size_t global_to_tile_index_0( const iType & global_i0 ) const + { return global_i0 >> SHIFT_0 ; } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + size_t global_to_tile_index_1( const iType & global_i1 ) const + { return global_i1 >> SHIFT_1 ; } + + + template< typename iType > + KOKKOS_INLINE_FUNCTION + size_t global_to_local_tile_index_0( const iType & global_i0 ) const + { return global_i0 & MASK_0 ; } + + template< typename iType > + KOKKOS_INLINE_FUNCTION + size_t global_to_local_tile_index_1( const iType & global_i1 ) const + { return global_i1 & MASK_1 ; } + + + //------------------------------------ + + KOKKOS_INLINE_FUNCTION + typename traits::size_type capacity() const + { + return ( m_tile_N0 * ( ( m_shape.N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 ); + } +}; + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp new file mode 100644 index 000000000..ea349e7ab --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp @@ -0,0 +1,242 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_VOLATILE_LOAD ) +#define KOKKOS_VOLATILE_LOAD + +#if defined( __GNUC__ ) /* GNU C */ || \ + defined( __GNUG__ ) /* GNU C++ */ || \ + defined( __clang__ ) + +#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__)) + +#else + +#define KOKKOS_MAY_ALIAS + +#endif + +namespace Kokkos { + +//---------------------------------------------------------------------------- + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +T volatile_load(T const volatile * const src_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const volatile * const ptr; + T64 const volatile * const ptr64; + T32 const volatile * const ptr32; + T16 const volatile * const ptr16; + T8 const volatile * const ptr8; + } src = {src_ptr}; + + T result; + + union { + T * const ptr; + T64 * const ptr64; + T32 * const ptr32; + T16 * const ptr16; + T8 * const ptr8; + } dst = {&result}; + + for (int i=0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2]; + } + + return result; +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const volatile * const ptr; + T64 const volatile * const ptr64; + T32 const volatile * const ptr32; + T16 const volatile * const ptr16; + T8 const volatile 
* const ptr8; + } src = {src_ptr}; + + union { + T volatile * const ptr; + T64 volatile * const ptr64; + T32 volatile * const ptr32; + T16 volatile * const ptr16; + T8 volatile * const ptr8; + } dst = {dst_ptr}; + + for (int i=0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2]; + } +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * const dst_ptr, T const * const src_ptr) +{ + typedef uint64_t KOKKOS_MAY_ALIAS T64; + typedef uint32_t KOKKOS_MAY_ALIAS T32; + typedef uint16_t KOKKOS_MAY_ALIAS T16; + typedef uint8_t KOKKOS_MAY_ALIAS T8; + + enum { + NUM_8 = sizeof(T), + NUM_16 = NUM_8 / 2, + NUM_32 = NUM_8 / 4, + NUM_64 = NUM_8 / 8 + }; + + union { + T const * const ptr; + T64 const * const ptr64; + T32 const * const ptr32; + T16 const * const ptr16; + T8 const * const ptr8; + } src = {src_ptr}; + + union { + T volatile * const ptr; + T64 volatile * const ptr64; + T32 volatile * const ptr32; + T16 volatile * const ptr16; + T8 volatile * const ptr8; + } dst = {dst_ptr}; + + for (int i=0; i < NUM_64; ++i) { + dst.ptr64[i] = src.ptr64[i]; + } + + if ( NUM_64*2 < NUM_32 ) { + dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2]; + } + + if ( NUM_32*2 < NUM_16 ) { + dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2]; + } + + if ( NUM_16*2 < NUM_8 ) { + dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2]; + } +} + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * dst_ptr, T const volatile & src) +{ volatile_store(dst_ptr, &src); } + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +void volatile_store(T volatile * dst_ptr, T const & src) +{ volatile_store(dst_ptr, &src); } + +template <typename T> +KOKKOS_FORCEINLINE_FUNCTION +T safe_load(T const * const ptr) +{ +#if !defined( __MIC__ ) + return *ptr; +#else + return volatile_load(ptr); +#endif +} + +} // namespace kokkos + +#undef KOKKOS_MAY_ALIAS + +#endif + + + diff --git a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp new file mode 100644 index 000000000..bfb3bb2da --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp @@ -0,0 +1,700 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define DEBUG_PRINT 0 + +#include <iostream> +#include <sstream> + +#include <KokkosCore_config.h> +#include <Kokkos_hwloc.hpp> +#include <impl/Kokkos_Error.hpp> + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace hwloc { + +/* Return 0 if asynchronous, 1 if synchronous and include process. */ +unsigned thread_mapping( const char * const label , + const bool allow_async , + unsigned & thread_count , + unsigned & use_numa_count , + unsigned & use_cores_per_numa , + std::pair<unsigned,unsigned> threads_coord[] ) +{ + const bool hwloc_avail = Kokkos::hwloc::available(); + const unsigned avail_numa_count = hwloc_avail ? hwloc::get_available_numa_count() : 1 ; + const unsigned avail_cores_per_numa = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ; + const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ; + + // (numa,core) coordinate of the process: + const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate(); + + //------------------------------------------------------------------------ + // Defaults for unspecified inputs: + + if ( ! use_numa_count ) { + // Default to use all NUMA regions + use_numa_count = ! thread_count ? avail_numa_count : ( + thread_count < avail_numa_count ? thread_count : avail_numa_count ); + } + + if ( ! use_cores_per_numa ) { + // Default to use all but one core if asynchronous, all cores if synchronous. + const unsigned threads_per_numa = thread_count / use_numa_count ; + + use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - ( allow_async ? 1 : 0 ) : ( + threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa ); + } + + if ( ! thread_count ) { + thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ; + } + + //------------------------------------------------------------------------ + // Input verification: + + const bool valid_numa = use_numa_count <= avail_numa_count ; + const bool valid_cores = use_cores_per_numa && + use_cores_per_numa <= avail_cores_per_numa ; + const bool valid_threads = thread_count && + thread_count <= use_numa_count * use_cores_per_numa * avail_threads_per_core ; + const bool balanced_numa = ! ( thread_count % use_numa_count ); + const bool balanced_cores = ! 
( thread_count % ( use_numa_count * use_cores_per_numa ) ); + + const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ; + + if ( ! valid_input ) { + + std::ostringstream msg ; + + msg << label << " HWLOC ERROR(s)" ; + + if ( ! valid_threads ) { + msg << " : thread_count(" << thread_count + << ") exceeds capacity(" + << use_numa_count * use_cores_per_numa * avail_threads_per_core + << ")" ; + } + if ( ! valid_numa ) { + msg << " : use_numa_count(" << use_numa_count + << ") exceeds capacity(" << avail_numa_count << ")" ; + } + if ( ! valid_cores ) { + msg << " : use_cores_per_numa(" << use_cores_per_numa + << ") exceeds capacity(" << avail_cores_per_numa << ")" ; + } + if ( ! balanced_numa ) { + msg << " : thread_count(" << thread_count + << ") imbalanced among numa(" << use_numa_count << ")" ; + } + if ( ! balanced_cores ) { + msg << " : thread_count(" << thread_count + << ") imbalanced among cores(" << use_numa_count * use_cores_per_numa << ")" ; + } + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } + + const unsigned thread_spawn_synchronous = + ( allow_async && + 1 < thread_count && + ( use_numa_count < avail_numa_count || + use_cores_per_numa < avail_cores_per_numa ) ) + ? 0 /* asyncronous */ + : 1 /* synchronous, threads_coord[0] is process core */ ; + + // Determine binding coordinates for to-be-spawned threads so that + // threads may be bound to cores as they are spawned. + + const unsigned threads_per_core = thread_count / ( use_numa_count * use_cores_per_numa ); + + if ( thread_spawn_synchronous ) { + // Working synchronously and include process core as threads_coord[0]. + // Swap the NUMA coordinate of the process core with 0 + // Swap the CORE coordinate of the process core with 0 + for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) { + const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma ); + for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) { + const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore ); + for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) { + threads_coord[i].first = numa_coord ; + threads_coord[i].second = core_coord ; + } + } + } + } + else if ( use_numa_count < avail_numa_count ) { + // Working asynchronously and omit the process' NUMA region from the pool. + // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 ) + const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ; + for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) { + const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ; + for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) { + const unsigned core_coord = icore ; + for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) { + threads_coord[i].first = numa_coord ; + threads_coord[i].second = core_coord ; + } + } + } + } + else if ( use_cores_per_numa < avail_cores_per_numa ) { + // Working asynchronously and omit the process' core from the pool. 
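// Illustrative trace of this branch (assumed values, not from the patch):
// with avail_cores_per_numa = 4, use_cores_per_numa = 3, and the process
// bound to core 2, core_coord_swap = (4-3)-1 = 0 and icore runs over
// {1,2,3}; the spawned threads land on cores {1,0,3}, leaving core 2 free
// for the asynchronous process.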
+ // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 ) + const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ; + for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) { + const unsigned numa_coord = inuma ; + for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) { + const unsigned core_coord = proc_coord.second == icore ? core_coord_swap : icore ; + for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) { + threads_coord[i].first = numa_coord ; + threads_coord[i].second = core_coord ; + } + } + } + } + + return thread_spawn_synchronous ; +} + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +/*--------------------------------------------------------------------------*/ +/*--------------------------------------------------------------------------*/ + +#if defined( KOKKOS_HAVE_HWLOC ) + +#include <iostream> +#include <sstream> +#include <stdexcept> + +/*--------------------------------------------------------------------------*/ +/* Third Party Libraries */ + +/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */ +#include <hwloc.h> + +#define REQUIRED_HWLOC_API_VERSION 0x000010300 + +#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION +#error "Requires http://www.open-mpi.org/projects/hwloc/ Version 1.3 or greater" +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace hwloc { +namespace { + +inline +void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap ) +{ + s << "{" ; + for ( int i = hwloc_bitmap_first( bitmap ) ; + -1 != i ; i = hwloc_bitmap_next( bitmap , i ) ) { + s << " " << i ; + } + s << " }" ; +} + +enum { MAX_CORE = 1024 }; + +std::pair<unsigned,unsigned> s_core_topology(0,0); +unsigned s_core_capacity(0); +hwloc_topology_t s_hwloc_topology(0); +hwloc_bitmap_t s_hwloc_location(0); +hwloc_bitmap_t s_process_binding(0); +hwloc_bitmap_t s_core[ MAX_CORE ]; + +struct Sentinel { + ~Sentinel(); + Sentinel(); +}; + +bool sentinel() +{ + static Sentinel self ; + + if ( 0 == s_hwloc_topology ) { + std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ; + std::cerr.flush(); + } + + return 0 != s_hwloc_topology ; +} + +Sentinel::~Sentinel() +{ + hwloc_topology_destroy( s_hwloc_topology ); + hwloc_bitmap_free( s_process_binding ); + hwloc_bitmap_free( s_hwloc_location ); + + s_core_topology.first = 0 ; + s_core_topology.second = 0 ; + s_core_capacity = 0 ; + s_hwloc_topology = 0 ; + s_hwloc_location = 0 ; + s_process_binding = 0 ; +} + +Sentinel::Sentinel() +{ +#if defined(__MIC__) + static const bool remove_core_0 = true ; +#else + static const bool remove_core_0 = false ; +#endif + + s_core_topology = std::pair<unsigned,unsigned>(0,0); + s_core_capacity = 0 ; + s_hwloc_topology = 0 ; + s_hwloc_location = 0 ; + s_process_binding = 0 ; + + for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ; + + hwloc_topology_init( & s_hwloc_topology ); + hwloc_topology_load( s_hwloc_topology ); + + s_hwloc_location = hwloc_bitmap_alloc(); + s_process_binding = hwloc_bitmap_alloc(); + + hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS ); + + if ( remove_core_0 ) { + + const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 ); + + if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { + + 
hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc(); + + hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset ); + + bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology , + s_process_no_core_zero , + HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT ); + + if ( ok ) { + hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS ); + + ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero ); + } + + hwloc_bitmap_free( s_process_no_core_zero ); + + if ( ! ok ) { + std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ; + } + } + } + + // Choose a hwloc object type for the NUMA level, which may not exist. + + hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ; + + { + // Object types to search, in order. + static const hwloc_obj_type_t candidate_root_type[] = + { HWLOC_OBJ_NODE /* NUMA region */ + , HWLOC_OBJ_SOCKET /* hardware socket */ + , HWLOC_OBJ_MACHINE /* local machine */ + }; + + enum { CANDIDATE_ROOT_TYPE_COUNT = + sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) }; + + for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) { + if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) { + root_type = candidate_root_type[k] ; + } + } + } + + // Determine which of these 'root' types are available to this process. + // The process may have been bound (e.g., by MPI) to a subset of these root types. + // Determine current location of the master (calling) process> + + hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc(); + + hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD ); + + const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type ); + + unsigned root_base = max_root ; + unsigned root_count = 0 ; + unsigned core_per_root = 0 ; + unsigned pu_per_core = 0 ; + bool symmetric = true ; + + for ( unsigned i = 0 ; i < max_root ; ++i ) { + + const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i ); + + if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) { + + ++root_count ; + + // Remember which root (NUMA) object the master thread is running on. + // This will be logical NUMA rank #0 for this process. + + if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) { + root_base = i ; + } + + // Count available cores: + + const unsigned max_core = + hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE ); + + unsigned core_count = 0 ; + + for ( unsigned j = 0 ; j < max_core ; ++j ) { + + const hwloc_obj_t core = + hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE , j ); + + // If process' cpuset intersects core's cpuset then process can access this core. + // Must use intersection instead of inclusion because the Intel-Phi + // MPI may bind the process to only one of the core's hyperthreads. + // + // Assumption: if the process can access any hyperthread of the core + // then it has ownership of the entire core. + // This assumes that it would be performance-detrimental + // to spawn more than one MPI process per core and use nested threading. 
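// Illustrative example of intersection vs. inclusion (assumed cpusets, not
// from the patch): if MPI binds the process to the single hyperthread PU #5
// of a core whose cpuset is { 4 5 }, then
//   hwloc_bitmap_isincluded( core->allowed_cpuset , s_process_binding ) == 0
//   hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) == 1
// so the intersection test below still claims the whole core for this process.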
+ + if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { + + ++core_count ; + + const unsigned pu_count = + hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , + core->allowed_cpuset , + HWLOC_OBJ_PU ); + + if ( pu_per_core == 0 ) pu_per_core = pu_count ; + + // Enforce symmetry by taking the minimum: + + pu_per_core = std::min( pu_per_core , pu_count ); + + if ( pu_count != pu_per_core ) symmetric = false ; + } + } + + if ( 0 == core_per_root ) core_per_root = core_count ; + + // Enforce symmetry by taking the minimum: + + core_per_root = std::min( core_per_root , core_count ); + + if ( core_count != core_per_root ) symmetric = false ; + } + } + + s_core_topology.first = root_count ; + s_core_topology.second = core_per_root ; + s_core_capacity = pu_per_core ; + + // Fill the 's_core' array for fast mapping from a core coordinate to the + // hwloc cpuset object required for thread location querying and binding. + + for ( unsigned i = 0 ; i < max_root ; ++i ) { + + const unsigned root_rank = ( i + root_base ) % max_root ; + + const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank ); + + if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) { + + const unsigned max_core = + hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE ); + + unsigned core_count = 0 ; + + for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) { + + const hwloc_obj_t core = + hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology , + root->allowed_cpuset , + HWLOC_OBJ_CORE , j ); + + if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { + + s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ; + + ++core_count ; + } + } + } + } + + hwloc_bitmap_free( proc_cpuset_location ); + + if ( ! symmetric ) { + std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology." + << std::endl ; + } +} + + +} // namespace + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +bool available() +{ return true ; } + +unsigned get_available_numa_count() +{ sentinel(); return s_core_topology.first ; } + +unsigned get_available_cores_per_numa() +{ sentinel(); return s_core_topology.second ; } + +unsigned get_available_threads_per_core() +{ sentinel(); return s_core_capacity ; } + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +unsigned bind_this_thread( + const unsigned coordinate_count , + std::pair<unsigned,unsigned> coordinate[] ) +{ + unsigned i = 0 ; + + try { + const std::pair<unsigned,unsigned> current = get_this_thread_coordinate(); + + // Match one of the requests: + for ( i = 0 ; i < coordinate_count && current != coordinate[i] ; ++i ); + + if ( coordinate_count == i ) { + // Match the first request (typically NUMA): + for ( i = 0 ; i < coordinate_count && current.first != coordinate[i].first ; ++i ); + } + + if ( coordinate_count == i ) { + // Match any unclaimed request: + for ( i = 0 ; i < coordinate_count && ~0u == coordinate[i].first ; ++i ); + } + + if ( coordinate_count == i || ! 
bind_this_thread( coordinate[i] ) ) { + // Failed to bind: + i = ~0u ; + } + + if ( i < coordinate_count ) { + +#if DEBUG_PRINT + if ( current != coordinate[i] ) { + std::cout << " bind_this_thread: rebinding from (" + << current.first << "," + << current.second + << ") to (" + << coordinate[i].first << "," + << coordinate[i].second + << ")" << std::endl ; + } +#endif + + coordinate[i].first = ~0u ; + coordinate[i].second = ~0u ; + } + } + catch( ... ) { + i = ~0u ; + } + + return i ; +} + + +bool bind_this_thread( const std::pair<unsigned,unsigned> coord ) +{ + if ( ! sentinel() ) return false ; + +#if DEBUG_PRINT + + std::cout << "Kokkos::bind_this_thread() at " ; + + hwloc_get_last_cpu_location( s_hwloc_topology , + s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + print_bitmap( std::cout , s_hwloc_location ); + + std::cout << " to " ; + + print_bitmap( std::cout , s_core[ coord.second + coord.first * s_core_topology.second ] ); + + std::cout << std::endl ; + +#endif + + // As safe and fast as possible. + // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'. + return coord.first < s_core_topology.first && + coord.second < s_core_topology.second && + 0 == hwloc_set_cpubind( s_hwloc_topology , + s_core[ coord.second + coord.first * s_core_topology.second ] , + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT ); +} + +bool unbind_this_thread() +{ + if ( ! sentinel() ) return false ; + +#define HWLOC_DEBUG_PRINT 0 + +#if HWLOC_DEBUG_PRINT + + std::cout << "Kokkos::unbind_this_thread() from " ; + + hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + print_bitmap( std::cout , s_hwloc_location ); + +#endif + + const bool result = + s_hwloc_topology && + 0 == hwloc_set_cpubind( s_hwloc_topology , + s_process_binding , + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT ); + +#if HWLOC_DEBUG_PRINT + + std::cout << " to " ; + + hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + print_bitmap( std::cout , s_hwloc_location ); + + std::cout << std::endl ; + +#endif + + return result ; + +#undef HWLOC_DEBUG_PRINT + +} + +//---------------------------------------------------------------------------- + +std::pair<unsigned,unsigned> get_this_thread_coordinate() +{ + std::pair<unsigned,unsigned> coord(0u,0u); + + if ( ! sentinel() ) return coord ; + + const unsigned n = s_core_topology.first * s_core_topology.second ; + + // Using the pre-allocated 's_hwloc_location' to avoid memory + // allocation by this thread. This call is NOT thread-safe. + hwloc_get_last_cpu_location( s_hwloc_topology , + s_hwloc_location , HWLOC_CPUBIND_THREAD ); + + unsigned i = 0 ; + + while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ; + + if ( i < n ) { + coord.first = i / s_core_topology.second ; + coord.second = i % s_core_topology.second ; + } + + return coord ; +} + +//---------------------------------------------------------------------------- + +} /* namespace hwloc */ +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#else /* ! 
defined( KOKKOS_HAVE_HWLOC ) */ + +namespace Kokkos { +namespace hwloc { + +bool available() { return false ; } + +unsigned get_available_numa_count() { return 1 ; } +unsigned get_available_cores_per_numa() { return 1 ; } +unsigned get_available_threads_per_core() { return 1 ; } + +unsigned bind_this_thread( const unsigned , std::pair<unsigned,unsigned>[] ) +{ return ~0 ; } + +bool bind_this_thread( const std::pair<unsigned,unsigned> ) +{ return false ; } + +bool unbind_this_thread() +{ return true ; } + +std::pair<unsigned,unsigned> get_this_thread_coordinate() +{ return std::pair<unsigned,unsigned>(0,0); } + +} // namespace hwloc +} // namespace Kokkos + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif + + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp new file mode 100644 index 000000000..316f3094f --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp @@ -0,0 +1,90 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Macros.hpp> +#include <impl/Kokkos_spinwait.hpp> + +/*--------------------------------------------------------------------------*/ + +#if ! 
defined( KOKKOS_DISABLE_ASM ) && \ + ( defined( __GNUC__ ) || \ + defined( __GNUG__ ) || \ + defined( __INTEL_COMPILER ) ) + +#ifndef __arm__ +/* Pause instruction to prevent excess processor bus usage */ +#define YIELD asm volatile("pause\n":::"memory") +#else +/* No-operation instruction to idle the thread. */ +#define YIELD asm volatile("nop") +#endif + +#elif ! defined( KOKKOS_HAVE_WINTHREAD ) + +#include <sched.h> + +#define YIELD sched_yield() + +#else + +#include <process.h> + +#define YIELD Sleep(0) + +#endif + +/*--------------------------------------------------------------------------*/ + +namespace Kokkos { +namespace Impl { + +void spinwait( volatile int & flag , const int value ) +{ + while ( value == flag ) { + YIELD ; + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp new file mode 100644 index 000000000..f2b42e921 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp @@ -0,0 +1,59 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos: Manycore Performance-Portable Multidimensional Arrays +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#ifndef KOKKOS_SPINWAIT_HPP +#define KOKKOS_SPINWAIT_HPP + +namespace Kokkos { +namespace Impl { + +void spinwait( volatile int & flag , const int value ); + +} /* namespace Impl */ +} /* namespace Kokkos */ + +#undef KOKKOS_YIELD + +#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ + diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh new file mode 100644 index 000000000..9378eccfc --- /dev/null +++ b/src/KOKKOS/Install.sh @@ -0,0 +1,71 @@ +# Install/unInstall package files in LAMMPS +# mode = 0/1/2 for uninstall/install/update + +mode=$1 + +# arg1 = file, arg2 = file it depends on + +action () { + if (test $mode = 0) then + rm -f ../$1 + elif (! cmp -s $1 ../$1) then + if (test -z "$2" || test -e ../$2) then + cp $1 .. + if (test $mode = 2) then + echo " updating src/$1" + fi + fi + elif (test -n "$2") then + if (test ! -e ../$2) then + rm -f ../$1 + fi + fi +} + +# force rebuild of files with LMP_KOKKOS switch + +touch ../accelerator_kokkos.h +touch ../memory.h + +# all package files with no dependencies + +for file in *.cpp *.h; do + action $file +done + +# edit 2 Makefile.package files to include/exclude package info + +if (test $1 = 1) then + + if (test -e ../Makefile.package) then + sed -i -e 's/[^ \t]*kokkos[^ \t]* //g' ../Makefile.package + sed -i -e 's/[^ \t]*KOKKOS[^ \t]* //g' ../Makefile.package + sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/kokkos\/core\/src -I../../lib/kokkos/containers/src -DLMP_KOKKOS |' ../Makefile.package + sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/kokkos\/core\/src |' ../Makefile.package + sed -i -e 's|^PKG_LIB =[ \t]*|&-lkokkoscore |' ../Makefile.package + sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(kokkos_SYSINC) |' ../Makefile.package + sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(kokkos_SYSLIB) |' ../Makefile.package + sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(kokkos_SYSPATH) |' ../Makefile.package + fi + + if (test -e ../Makefile.package.settings) then + sed -i -e '/^include.*kokkos.*$/d' ../Makefile.package.settings + # multiline form needed for BSD sed on Macs + sed -i -e '4 i \ +include ..\/..\/lib\/kokkos\/Makefile.lammps +' ../Makefile.package.settings + + fi + +elif (test $1 = 0) then + + if (test -e ../Makefile.package) then + sed -i -e 's/[^ \t]*kokkos[^ \t]* //g' ../Makefile.package + sed -i -e 's/[^ \t]*KOKKOS[^ \t]* //g' ../Makefile.package + fi + + if (test -e ../Makefile.package.settings) then + sed -i -e '/^include.*kokkos.*$/d' ../Makefile.package.settings + fi + +fi diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp new file mode 100644 index 000000000..e36a5a926 --- /dev/null +++ b/src/KOKKOS/atom_kokkos.cpp @@ -0,0 +1,190 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "atom_kokkos.h" +#include "atom_vec.h" +#include "atom_vec_kokkos.h" +#include "comm_kokkos.h" +#include "update.h" +#include "domain.h" +#include "atom_masks.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp) +{ + // set CommKokkos pointer to Atom class, since CommKokkos allocated first + + ((CommKokkos *) comm)->atomKK = this; +} + +/* ---------------------------------------------------------------------- */ + +AtomKokkos::~AtomKokkos() +{ + k_tag = DAT::tdual_int_1d(); + k_mask = DAT::tdual_int_1d(); + k_type = DAT::tdual_int_1d(); + k_image = DAT::tdual_int_1d(); + k_molecule = DAT::tdual_int_1d(); + + k_x = DAT::tdual_x_array(); + k_v = DAT::tdual_v_array(); + k_f = DAT::tdual_f_array(); + + k_mass = DAT::tdual_float_1d(); + + tag = NULL; + mask = NULL; + type = NULL; + image = NULL; + molecule = NULL; + mass = NULL; + + memory->sfree(x); + memory->sfree(v); + memory->sfree(f); + x = NULL; + v = NULL; + f = NULL; +} + +/* ---------------------------------------------------------------------- */ + +void AtomKokkos::sync(const ExecutionSpace space, unsigned int mask) +{ + ((AtomVecKokkos *) avec)->sync(space,mask); +} + +/* ---------------------------------------------------------------------- */ + +void AtomKokkos::modified(const ExecutionSpace space, unsigned int mask) +{ + ((AtomVecKokkos *) avec)->modified(space,mask); +} + +/* ---------------------------------------------------------------------- */ + +void AtomKokkos::allocate_type_arrays() +{ + if (avec->mass_type) { + k_mass = DAT::tdual_float_1d("Mass",ntypes+1); + mass = k_mass.h_view.ptr_on_device(); + mass_setflag = new int[ntypes+1]; + for (int itype = 1; itype <= ntypes; itype++) mass_setflag[itype] = 0; + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomKokkos::sort() +{ + int i,m,n,ix,iy,iz,ibin,empty; + + sync(Host,ALL_MASK); + modified(Host,ALL_MASK); + + // set next timestep for sorting to take place + + nextsort = (update->ntimestep/sortfreq)*sortfreq + sortfreq; + + // re-setup sort bins if needed + + if (domain->box_change) setup_sort_bins(); + if (nbins == 1) return; + + // reallocate per-atom vectors if needed + + if (nlocal > maxnext) { + memory->destroy(next); + memory->destroy(permute); + maxnext = atom->nmax; + memory->create(next,maxnext,"atom:next"); + memory->create(permute,maxnext,"atom:permute"); + } + + // insure there is one extra atom location at end of arrays for swaps + + if (nlocal == nmax) avec->grow(0); + + // bin atoms in reverse order so linked list will be in forward order + + for (i = 0; i < nbins; i++) binhead[i] = -1; + + HAT::t_x_array_const h_x = k_x.view<LMPHostType>(); + for (i = nlocal-1; i >= 0; i--) { + ix = static_cast<int> ((h_x(i,0)-bboxlo[0])*bininvx); + iy = static_cast<int> ((h_x(i,1)-bboxlo[1])*bininvy); + iz = static_cast<int> ((h_x(i,2)-bboxlo[2])*bininvz); + ix = MAX(ix,0); + iy = MAX(iy,0); + iz = MAX(iz,0); + ix = MIN(ix,nbinx-1); + iy = MIN(iy,nbiny-1); + iz = MIN(iz,nbinz-1); + ibin = iz*nbiny*nbinx + iy*nbinx + ix; + next[i] = binhead[ibin]; + binhead[ibin] = i; + } + + // permute = desired permutation of atoms + // permute[I] = J means Ith new atom will be Jth old atom + + n = 0; + for (m = 0; m < nbins; m++) { + i = binhead[m]; + while (i >= 0) { + permute[n++] = 
i; + i = next[i]; + } + } + + // current = current permutation, just reuse next vector + // current[I] = J means Ith current atom is Jth old atom + + int *current = next; + for (i = 0; i < nlocal; i++) current[i] = i; + + // reorder local atom list, when done, current = permute + // perform "in place" using copy() to extra atom location at end of list + // inner while loop processes one cycle of the permutation + // copy before inner-loop moves an atom to end of atom list + // copy after inner-loop moves atom at end of list back into list + // empty = location in atom list that is currently empty + + for (i = 0; i < nlocal; i++) { + if (current[i] == permute[i]) continue; + avec->copy(i,nlocal,0); + empty = i; + while (permute[empty] != i) { + avec->copy(permute[empty],empty,0); + empty = current[empty] = permute[empty]; + } + avec->copy(nlocal,empty,0); + current[empty] = permute[empty]; + } + + // sanity check that current = permute + + //int flag = 0; + //for (i = 0; i < nlocal; i++) + // if (current[i] != permute[i]) flag = 1; + //int flagall; + //MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world); + //if (flagall) error->all(FLERR,"Atom sort did not operate correctly"); +} diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h new file mode 100644 index 000000000..594bf80e5 --- /dev/null +++ b/src/KOKKOS/atom_kokkos.h @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "atom.h" +#include "kokkos_type.h" + +#ifndef LMP_ATOM_KOKKOS_H +#define LMP_ATOM_KOKKOS_H + +namespace LAMMPS_NS { + +class AtomKokkos : public Atom { + public: + DAT::tdual_int_1d k_tag, k_type, k_mask, k_molecule; + DAT::tdual_tagint_1d k_image; + DAT::tdual_x_array k_x; + DAT::tdual_v_array k_v; + DAT::tdual_f_array k_f; + + DAT::tdual_float_1d k_mass; + + AtomKokkos(class LAMMPS *); + ~AtomKokkos(); + + virtual void allocate_type_arrays(); + void sync(const ExecutionSpace space, unsigned int mask); + void modified(const ExecutionSpace space, unsigned int mask); + virtual void sort(); +}; + +template<class ViewType, class IndexView> +class SortFunctor { + typedef typename ViewType::device_type device_type; + ViewType source; + Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type> dest; + IndexView index; + SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==1,IndexView>::type ind):source(src),index(ind){ + dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0()); + } + SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==2,IndexView>::type ind):source(src),index(ind){ + dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0(),src.dimension_1()); + } + SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==3,IndexView>::type ind):source(src),index(ind){ + dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0(),src.dimension_1(),src.dimension_2()); + } + SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==4,IndexView>::type ind):source(src),index(ind){ + dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0(),src.dimension_1(),src.dimension_2(),src.dimension_3()); + } + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==1, int>::type& i) { + dest(i) = source(index(i)); + } + void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==2, int>::type& i) { + for(int j=0;j<source.dimension_1();j++) + dest(i,j) = source(index(i),j); + } + void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==3, int>::type& i) { + for(int j=0;j<source.dimension_1();j++) + for(int k=0;k<source.dimension_2();k++) + dest(i,j,k) = source(index(i),j,k); + } + void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==4, int>::type& i) { + for(int j=0;j<source.dimension_1();j++) + for(int k=0;k<source.dimension_2();k++) + for(int l=0;l<source.dimension_3();l++) + dest(i,j,k,l) = source(index(i),j,k,l); + } +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp new file mode 100644 index 000000000..1db293cd4 --- /dev/null +++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp @@ -0,0 +1,1371 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale AtomicKokkos/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "stdlib.h" +#include "atom_vec_atomic_kokkos.h" +#include "atom_kokkos.h" +#include "comm_kokkos.h" +#include "domain.h" +#include "modify.h" +#include "fix.h" +#include "atom_masks.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define DELTA 10000 + +/* ---------------------------------------------------------------------- */ + +AtomVecAtomicKokkos::AtomVecAtomicKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp) +{ + molecular = 0; + mass_type = 1; + + comm_x_only = comm_f_only = 1; + size_forward = 3; + size_reverse = 3; + size_border = 6; + size_velocity = 3; + size_data_atom = 5; + size_data_vel = 4; + xcol_data = 3; + + k_count = DAT::tdual_int_1d("atom::k_count",1); + atomKK = (AtomKokkos *) atom; + commKK = (CommKokkos *) comm; +} + +/* ---------------------------------------------------------------------- + grow atom arrays + n = 0 grows arrays by DELTA + n > 0 allocates arrays to size n +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::grow(int n) +{ + if (n == 0) nmax += DELTA; + else nmax = n; + atomKK->nmax = nmax; + if (nmax < 0 || nmax > MAXSMALLINT) + error->one(FLERR,"Per-processor system is too big"); + + sync(Device,ALL_MASK); + modified(Device,ALL_MASK); + + memory->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag"); + memory->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type"); + memory->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask"); + memory->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image"); + + memory->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x"); + memory->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v"); + memory->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f"); + + grow_reset(); + sync(Host,ALL_MASK); + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax); +} + +/* ---------------------------------------------------------------------- + reset local array ptrs +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::grow_reset() +{ + tag = atomKK->tag; + d_tag = atomKK->k_tag.d_view; + h_tag = atomKK->k_tag.h_view; + + type = atomKK->type; + d_type = atomKK->k_type.d_view; + h_type = atomKK->k_type.h_view; + mask = atomKK->mask; + d_mask = atomKK->k_mask.d_view; + h_mask = atomKK->k_mask.h_view; + image = atomKK->image; + d_image = atomKK->k_image.d_view; + h_image = atomKK->k_image.h_view; + + x = atomKK->x; + d_x = atomKK->k_x.d_view; + h_x = atomKK->k_x.h_view; + v = atomKK->v; + d_v = atomKK->k_v.d_view; + h_v = atomKK->k_v.h_view; + f = atomKK->f; + d_f = atomKK->k_f.d_view; + h_f = atomKK->k_f.h_view; +} + +/* ---------------------------------------------------------------------- + copy atom I info to atom J +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::copy(int i, int j, int delflag) +{ + h_tag[j] = h_tag[i]; + h_type[j] = h_type[i]; + mask[j] = mask[i]; + h_image[j] = h_image[i]; + h_x(j,0) = h_x(i,0); + h_x(j,1) = h_x(i,1); + h_x(j,2) = h_x(i,2); + h_v(j,0) = h_v(i,0); + h_v(j,1) = h_v(i,1); 
+ h_v(j,2) = h_v(i,2); + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType,int PBC_FLAG,int TRICLINIC> +struct AtomVecAtomicKokkos_PackComm { + typedef DeviceType device_type; + + typename ArrayTypes<DeviceType>::t_x_array_randomread _x; + typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf; + typename ArrayTypes<DeviceType>::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecAtomicKokkos_PackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3; + const size_t elements = 3; + buffer_view<DeviceType>(_buf,buf,maxsend,elements); + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _buf(i,0) = _x(j,0); + _buf(i,1) = _x(j,1); + _buf(i,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _buf(i,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n, + const DAT::tdual_int_2d &list, + const int & iswap, + const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, + const int* const pbc) +{ + // Check whether to always run forward communication on the host + // Choose correct forward PackComm kernel + + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + LMPHostType::fence(); + } else { + sync(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + 
domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + LMPDeviceType::fence(); + } + + return n*size_forward; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType,int PBC_FLAG,int TRICLINIC> +struct AtomVecAtomicKokkos_PackCommSelf { + typedef DeviceType device_type; + + typename ArrayTypes<DeviceType>::t_x_array_randomread _x; + typename ArrayTypes<DeviceType>::t_x_array _xw; + int _nfirst; + typename ArrayTypes<DeviceType>::t_int_2d_const _list; + const int _iswap; + X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; + X_FLOAT _pbc[6]; + + AtomVecAtomicKokkos_PackCommSelf( + const typename DAT::tdual_x_array &x, + const int &nfirst, + const typename DAT::tdual_int_2d &list, + const int & iswap, + const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd, + const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc): + _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap), + _xprd(xprd),_yprd(yprd),_zprd(zprd), + _xy(xy),_xz(xz),_yz(yz) { + _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2]; + _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5]; + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _xw(i+_nfirst,0) = _x(j,0); + _xw(i+_nfirst,1) = _x(j,1); + _xw(i+_nfirst,2) = _x(j,2); + } else { + if (TRICLINIC == 0) { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } else { + _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz; + _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz; + _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd; + } + } + + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap, + const int nfirst, const int &pbc_flag, const int* const pbc) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct 
AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + LMPHostType::fence(); + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + if(pbc_flag) { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } else { + if(domain->triclinic) { + struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } else { + struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap, + domain->xprd,domain->yprd,domain->zprd, + domain->xy,domain->xz,domain->yz,pbc); + Kokkos::parallel_for(n,f); + } + } + LMPDeviceType::fence(); + } + return n*3; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_UnpackComm { + typedef DeviceType device_type; + + typename ArrayTypes<DeviceType>::t_x_array _x; + typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf; + int _first; + + AtomVecAtomicKokkos_UnpackComm( + const typename DAT::tdual_x_array &x, + const typename DAT::tdual_xfloat_2d &buf, + const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()), + _first(first) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _x(i+_first,0) = _buf(i,0); + _x(i+_first,1) = _buf(i,1); + _x(i+_first,2) = _buf(i,2); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first, + const DAT::tdual_xfloat_2d &buf ) { + if(commKK->forward_comm_on_host) { + sync(Host,X_MASK); + modified(Host,X_MASK); + struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } else { + sync(Device,X_MASK); + modified(Device,X_MASK); + struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf, + int pbc_flag, int 
*pbc) +{ + int i,j,m; + double dx,dy,dz,dvx,dvy,dvz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + if (!deform_vremap) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; + dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; + dvz = pbc[2]*h_rate[2]; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + if (mask[i] & deform_groupbit) { + buf[m++] = h_v(j,0) + dvx; + buf[m++] = h_v(j,1) + dvy; + buf[m++] = h_v(j,2) + dvz; + } else { + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_v(i,0) = buf[m++]; + h_v(i,1) = buf[m++]; + h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = f[i][0]; + buf[m++] = f[i][1]; + buf[m++] = f[i][2]; + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf) +{ + int i,j,m; + + m = 0; + for (i = 0; i < n; i++) { + j = list[i]; + f[j][0] += buf[m++]; + f[j][1] += buf[m++]; + f[j][2] += buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType,int PBC_FLAG> +struct AtomVecAtomicKokkos_PackBorder { + typedef DeviceType device_type; + + typename ArrayTypes<DeviceType>::t_xfloat_2d _buf; + const typename ArrayTypes<DeviceType>::t_int_2d_const _list; + const typename ArrayTypes<DeviceType>::t_x_array_randomread _x; + const typename ArrayTypes<DeviceType>::t_tagint_1d _tag; + const typename ArrayTypes<DeviceType>::t_int_1d _type; + const typename ArrayTypes<DeviceType>::t_int_1d _mask; + const int _iswap; + X_FLOAT _dx,_dy,_dz; + + AtomVecAtomicKokkos_PackBorder( + const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf, + const typename ArrayTypes<DeviceType>::t_int_2d_const &list, + const int & iswap, + const typename ArrayTypes<DeviceType>::t_x_array &x, + const typename ArrayTypes<DeviceType>::t_tagint_1d &tag, + const typename ArrayTypes<DeviceType>::t_int_1d 
&type, + const typename ArrayTypes<DeviceType>::t_int_1d &mask, + const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz): + _buf(buf),_list(list),_iswap(iswap), + _x(x),_tag(tag),_type(type),_mask(mask), + _dx(dx),_dy(dy),_dz(dz) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _buf(i,0) = _x(j,0); + _buf(i,1) = _x(j,1); + _buf(i,2) = _x(j,2); + _buf(i,3) = _tag(j); + _buf(i,4) = _type(j); + _buf(i,5) = _mask(j); + } else { + _buf(i,0) = _x(j,0) + _dx; + _buf(i,1) = _x(j,1) + _dy; + _buf(i,2) = _x(j,2) + _dz; + _buf(i,3) = _tag(j); + _buf(i,4) = _type(j); + _buf(i,5) = _mask(j); + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, + int pbc_flag, int *pbc, ExecutionSpace space) +{ + X_FLOAT dx,dy,dz; + + if (pbc_flag != 0) { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + if(space==Host) { + AtomVecAtomicKokkos_PackBorder<LMPHostType,1> f( + buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(), + iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPHostType::fence(); + } else { + AtomVecAtomicKokkos_PackBorder<LMPDeviceType,1> f( + buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(), + iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } + + } else { + dx = dy = dz = 0; + if(space==Host) { + AtomVecAtomicKokkos_PackBorder<LMPHostType,0> f( + buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(), + iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPHostType::fence(); + } else { + AtomVecAtomicKokkos_PackBorder<LMPDeviceType,0> f( + buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(), + iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } + } + return n*6; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_border(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_border_vel(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz,dvx,dvy,dvz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = 
pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + if (!deform_vremap) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; + dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; + dvz = pbc[2]*h_rate[2]; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + if (mask[i] & deform_groupbit) { + buf[m++] = h_v(j,0) + dvx; + buf[m++] = h_v(j,1) + dvy; + buf[m++] = h_v(j,2) + dvz; + } else { + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_UnpackBorder { + typedef DeviceType device_type; + + const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf; + typename ArrayTypes<DeviceType>::t_x_array _x; + typename ArrayTypes<DeviceType>::t_tagint_1d _tag; + typename ArrayTypes<DeviceType>::t_int_1d _type; + typename ArrayTypes<DeviceType>::t_int_1d _mask; + int _first; + + + AtomVecAtomicKokkos_UnpackBorder( + const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf, + typename ArrayTypes<DeviceType>::t_x_array &x, + typename ArrayTypes<DeviceType>::t_tagint_1d &tag, + typename ArrayTypes<DeviceType>::t_int_1d &type, + typename ArrayTypes<DeviceType>::t_int_1d &mask, + const int& first): + _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),_first(first){ + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _x(i+_first,0) = _buf(i,0); + _x(i+_first,1) = _buf(i,1); + _x(i+_first,2) = _buf(i,2); + _tag(i+_first) = static_cast<int> (_buf(i,3)); + _type(i+_first) = static_cast<int> (_buf(i,4)); + _mask(i+_first) = static_cast<int> (_buf(i,5)); +// printf("%i %i %lf %lf %lf %i BORDER\n",_tag(i+_first),i+_first,_x(i+_first,0),_x(i+_first,1),_x(i+_first,2),_type(i+_first)); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_border_kokkos(const int &n, const int &first, + const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) { + modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); + while (first+n >= nmax) grow(0); + modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); + if(space==Host) { + struct AtomVecAtomicKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,first); + Kokkos::parallel_for(n,f); + LMPHostType::fence(); + } else { + struct AtomVecAtomicKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,first); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_border(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + if (i == nmax) grow(0); + modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_tag[i] = static_cast<int> (buf[m++]); + h_type[i] = static_cast<int> (buf[m++]); + h_mask[i] = static_cast<int> 
(buf[m++]); + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_border_vel(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + if (i == nmax) grow(0); + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_tag[i] = static_cast<int> (buf[m++]); + h_type[i] = static_cast<int> (buf[m++]); + h_mask[i] = static_cast<int> (buf[m++]); + h_v(i,0) = buf[m++]; + h_v(i,1) = buf[m++]; + h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_PackExchangeFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT _lo,_hi; + typename AT::t_x_array_randomread _x; + typename AT::t_v_array_randomread _v; + typename AT::t_tagint_1d_randomread _tag; + typename AT::t_int_1d_randomread _type; + typename AT::t_int_1d_randomread _mask; + typename AT::t_int_1d_randomread _image; + typename AT::t_x_array _xw; + typename AT::t_v_array _vw; + typename AT::t_tagint_1d _tagw; + typename AT::t_int_1d _typew; + typename AT::t_int_1d _maskw; + typename AT::t_int_1d _imagew; + + typename AT::t_xfloat_2d_um _buf; + int _nlocal,_dim; + typename AT::t_int_1d_const _sendlist; + typename AT::t_int_1d_const _copylist; + + AtomVecAtomicKokkos_PackExchangeFunctor( + const AtomKokkos* atom, + const typename AT::tdual_xfloat_2d buf, + typename AT::tdual_int_1d sendlist, + typename AT::tdual_int_1d copylist,int nlocal, int dim, + X_FLOAT lo, X_FLOAT hi): + _x(atom->k_x.view<DeviceType>()), + _v(atom->k_v.view<DeviceType>()), + _tag(atom->k_tag.view<DeviceType>()), + _type(atom->k_type.view<DeviceType>()), + _mask(atom->k_mask.view<DeviceType>()), + _image(atom->k_image.view<DeviceType>()), + _xw(atom->k_x.view<DeviceType>()), + _vw(atom->k_v.view<DeviceType>()), + _tagw(atom->k_tag.view<DeviceType>()), + _typew(atom->k_type.view<DeviceType>()), + _maskw(atom->k_mask.view<DeviceType>()), + _imagew(atom->k_image.view<DeviceType>()), + _sendlist(sendlist.template view<DeviceType>()), + _copylist(copylist.template view<DeviceType>()), + _nlocal(nlocal),_dim(dim), + _lo(lo),_hi(hi){ + const size_t elements = 11; + const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements; + + buffer_view<DeviceType>(_buf,buf,maxsendlist,elements); + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &mysend) const { + const int i = _sendlist(mysend); + _buf(mysend,0) = 11; + _buf(mysend,1) = _x(i,0); + _buf(mysend,2) = _x(i,1); + _buf(mysend,3) = _x(i,2); + _buf(mysend,4) = _v(i,0); + _buf(mysend,5) = _v(i,1); + _buf(mysend,6) = _v(i,2); + _buf(mysend,7) = _tag[i]; + _buf(mysend,8) = _type[i]; + _buf(mysend,9) = _mask[i]; + _buf(mysend,10) = _image[i]; + const int j = _copylist(mysend); + + if(j>-1) { + _xw(i,0) = _x(j,0); + _xw(i,1) = _x(j,1); + _xw(i,2) = _x(j,2); + _vw(i,0) = _v(j,0); + _vw(i,1) = _v(j,1); + _vw(i,2) = _v(j,2); + _tagw[i] = _tag(j); + _typew[i] = _type(j); + _maskw[i] = _mask(j); + _imagew[i] = _image(j); + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi ) +{ + if(nsend > 
(k_buf.view<LMPHostType>().dimension_0()*k_buf.view<LMPHostType>().dimension_1())/11) { + int newsize = nsend*11/k_buf.view<LMPHostType>().dimension_1()+1; + k_buf.resize(newsize,k_buf.view<LMPHostType>().dimension_1()); + } + if(space == Host) { + AtomVecAtomicKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi); + Kokkos::parallel_for(nsend,f); + LMPHostType::fence(); + return nsend*11; + } else { + AtomVecAtomicKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi); + Kokkos::parallel_for(nsend,f); + LMPDeviceType::fence(); + return nsend*11; + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_exchange(int i, double *buf) +{ + int m = 1; + buf[m++] = h_x(i,0); + buf[m++] = h_x(i,1); + buf[m++] = h_x(i,2); + buf[m++] = h_v(i,0); + buf[m++] = h_v(i,1); + buf[m++] = h_v(i,2); + buf[m++] = h_tag[i]; + buf[m++] = h_type[i]; + buf[m++] = h_mask[i]; + *((tagint *) &buf[m++]) = h_image[i]; + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]); + + buf[0] = m; + return m; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_UnpackExchangeFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT _lo,_hi; + typename AT::t_x_array _x; + typename AT::t_v_array _v; + typename AT::t_tagint_1d _tag; + typename AT::t_int_1d _type; + typename AT::t_int_1d _mask; + typename AT::t_int_1d _image; + + typename AT::t_xfloat_2d_um _buf; + int _dim; + typename AT::t_int_1d _nlocal; + + AtomVecAtomicKokkos_UnpackExchangeFunctor( + const AtomKokkos* atom, + const typename AT::tdual_xfloat_2d buf, + typename AT::tdual_int_1d nlocal, + int dim, X_FLOAT lo, X_FLOAT hi): + _x(atom->k_x.view<DeviceType>()), + _v(atom->k_v.view<DeviceType>()), + _tag(atom->k_tag.view<DeviceType>()), + _type(atom->k_type.view<DeviceType>()), + _mask(atom->k_mask.view<DeviceType>()), + _image(atom->k_image.view<DeviceType>()), + _nlocal(nlocal.template view<DeviceType>()),_dim(dim), + _lo(lo),_hi(hi){ + const size_t elements = 11; + const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements; + + buffer_view<DeviceType>(_buf,buf,maxsendlist,elements); + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &myrecv) const { + X_FLOAT x = _buf(myrecv,_dim+1); + if (x >= _lo && x < _hi) { + int i = Kokkos::atomic_fetch_add(&_nlocal(0),1); + _x(i,0) = _buf(myrecv,1); + _x(i,1) = _buf(myrecv,2); + _x(i,2) = _buf(myrecv,3); + _v(i,0) = _buf(myrecv,4); + _v(i,1) = _buf(myrecv,5); + _v(i,2) = _buf(myrecv,6); + _tag[i] = _buf(myrecv,7); + _type[i] = _buf(myrecv,8); + _mask[i] = _buf(myrecv,9); + _image[i] = _buf(myrecv,10); + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) { + if(space == Host) { + k_count.h_view(0) = nlocal; + AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi); + Kokkos::parallel_for(nrecv/11,f); + LMPHostType::fence(); + return k_count.h_view(0); + } else { + k_count.h_view(0) = nlocal; + k_count.modify<LMPHostType>(); + 
k_count.sync<LMPDeviceType>(); + AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi); + Kokkos::parallel_for(nrecv/11,f); + LMPDeviceType::fence(); + k_count.modify<LMPDeviceType>(); + k_count.sync<LMPHostType>(); + + return k_count.h_view(0); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::unpack_exchange(double *buf) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) grow(0); + modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK | + MASK_MASK | IMAGE_MASK); + + int m = 1; + h_x(nlocal,0) = buf[m++]; + h_x(nlocal,1) = buf[m++]; + h_x(nlocal,2) = buf[m++]; + h_v(nlocal,0) = buf[m++]; + h_v(nlocal,1) = buf[m++]; + h_v(nlocal,2) = buf[m++]; + h_tag[nlocal] = static_cast<int> (buf[m++]); + h_type[nlocal] = static_cast<int> (buf[m++]); + h_mask[nlocal] = static_cast<int> (buf[m++]); + h_image[nlocal] = static_cast<int> (buf[m++]); + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]-> + unpack_exchange(nlocal,&buf[m]); + + atom->nlocal++; + return m; +} + +/* ---------------------------------------------------------------------- + size of restart data for all atoms owned by this proc + include extra data stored by fixes +------------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::size_restart() +{ + int i; + + int nlocal = atom->nlocal; + int n = 11 * nlocal; + + if (atom->nextra_restart) + for (int iextra = 0; iextra < atom->nextra_restart; iextra++) + for (i = 0; i < nlocal; i++) + n += modify->fix[atom->extra_restart[iextra]]->size_restart(i); + + return n; +} + +/* ---------------------------------------------------------------------- + pack atom I's data for restart file including extra quantities + xyz must be 1st 3 values, so that read_restart can test on them + molecular types may be negative, but write as positive +------------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_restart(int i, double *buf) +{ + int m = 1; + buf[m++] = h_x(i,0); + buf[m++] = h_x(i,1); + buf[m++] = h_x(i,2); + buf[m++] = h_tag[i]; + buf[m++] = h_type[i]; + buf[m++] = h_mask[i]; + buf[m++] = h_image[i]; + buf[m++] = h_v(i,0); + buf[m++] = h_v(i,1); + buf[m++] = h_v(i,2); + + if (atom->nextra_restart) + for (int iextra = 0; iextra < atom->nextra_restart; iextra++) + m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]); + + buf[0] = m; + return m; +} + +/* ---------------------------------------------------------------------- + unpack data for one atom from restart file including extra quantities +------------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::unpack_restart(double *buf) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) { + grow(0); + if (atom->nextra_store) + memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra"); + } + + int m = 1; + h_x(nlocal,0) = buf[m++]; + h_x(nlocal,1) = buf[m++]; + h_x(nlocal,2) = buf[m++]; + h_tag[nlocal] = static_cast<int> (buf[m++]); + h_type[nlocal] = static_cast<int> (buf[m++]); + h_mask[nlocal] = static_cast<int> (buf[m++]); + h_image[nlocal] = *((tagint *) &buf[m++]); + h_v(nlocal,0) = buf[m++]; + h_v(nlocal,1) = buf[m++]; + h_v(nlocal,2) = buf[m++]; + + double **extra = atom->extra; + if (atom->nextra_store) { + int size = static_cast<int> (buf[0]) - m; + for (int i = 0; i < size; i++) extra[nlocal][i] = 
buf[m++]; + } + + atom->nlocal++; + return m; +} + +/* ---------------------------------------------------------------------- + create one atom of itype at coord + set other values to defaults +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::create_atom(int itype, double *coord) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) { + //if(nlocal>2) printf("typeA: %i %i\n",type[0],type[1]); + atomKK->modified(Host,ALL_MASK); + grow(0); + //if(nlocal>2) printf("typeB: %i %i\n",type[0],type[1]); + } + atomKK->modified(Host,ALL_MASK); + + tag[nlocal] = 0; + type[nlocal] = itype; + h_x(nlocal,0) = coord[0]; + h_x(nlocal,1) = coord[1]; + h_x(nlocal,2) = coord[2]; + h_mask[nlocal] = 1; + h_image[nlocal] = ((tagint) IMGMAX << IMG2BITS) | + ((tagint) IMGMAX << IMGBITS) | IMGMAX; + h_v(nlocal,0) = 0.0; + h_v(nlocal,1) = 0.0; + h_v(nlocal,2) = 0.0; + + atom->nlocal++; +} + +/* ---------------------------------------------------------------------- + unpack one line from Atoms section of data file + initialize other atom quantities +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::data_atom(double *coord, tagint imagetmp, + char **values) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) grow(0); + + h_tag[nlocal] = atoi(values[0]); + if (tag[nlocal] <= 0) + error->one(FLERR,"Invalid atom ID in Atoms section of data file"); + + h_type[nlocal] = atoi(values[1]); + if (type[nlocal] <= 0 || type[nlocal] > atom->ntypes) + error->one(FLERR,"Invalid atom type in Atoms section of data file"); + + h_x(nlocal,0) = coord[0]; + h_x(nlocal,1) = coord[1]; + h_x(nlocal,2) = coord[2]; + + h_image[nlocal] = imagetmp; + + h_mask[nlocal] = 1; + h_v(nlocal,0) = 0.0; + h_v(nlocal,1) = 0.0; + h_v(nlocal,2) = 0.0; + + atom->nlocal++; +} + +/* ---------------------------------------------------------------------- + pack atom info for data file including 3 image flags +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::pack_data(double **buf) +{ + int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + buf[i][0] = h_tag[i]; + buf[i][1] = h_type[i]; + buf[i][2] = h_x(i,0); + buf[i][3] = h_x(i,1); + buf[i][4] = h_x(i,2); + buf[i][5] = (h_image[i] & IMGMASK) - IMGMAX; + buf[i][6] = (h_image[i] >> IMGBITS & IMGMASK) - IMGMAX; + buf[i][7] = (h_image[i] >> IMG2BITS) - IMGMAX; + } +} + +/* ---------------------------------------------------------------------- + write atom info to data file including 3 image flags +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::write_data(FILE *fp, int n, double **buf) +{ + for (int i = 0; i < n; i++) + fprintf(fp,"%d %d %-1.16e %-1.16e %-1.16e %d %d %d\n", + (int) buf[i][0],(int) buf[i][1],buf[i][2],buf[i][3],buf[i][4], + (int) buf[i][5],(int) buf[i][6],(int) buf[i][7]); +} + +/* ---------------------------------------------------------------------- + return # of bytes of allocated memory +------------------------------------------------------------------------- */ + +bigint AtomVecAtomicKokkos::memory_usage() +{ + bigint bytes = 0; + + if (atom->memcheck("tag")) bytes += memory->usage(tag,nmax); + if (atom->memcheck("type")) bytes += memory->usage(type,nmax); + if (atom->memcheck("mask")) bytes += memory->usage(mask,nmax); + if (atom->memcheck("image")) bytes += memory->usage(image,nmax); + if (atom->memcheck("x")) bytes += memory->usage(x,nmax,3); + if 
(atom->memcheck("v")) bytes += memory->usage(v,nmax,3); + if (atom->memcheck("f")) bytes += memory->usage(f,nmax*commKK->nthreads,3); + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::sync(ExecutionSpace space, unsigned int mask) +{ + if (space == Device) { + if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>(); + if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>(); + if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>(); + if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>(); + if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>(); + if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>(); + if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>(); + } else { + if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>(); + if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>(); + if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>(); + if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>(); + if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>(); + if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>(); + if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>(); + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::modified(ExecutionSpace space, unsigned int mask) +{ + if (space == Device) { + if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>(); + if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>(); + if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>(); + if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>(); + if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>(); + if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>(); + if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>(); + } else { + if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>(); + if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>(); + if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>(); + if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>(); + if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>(); + if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>(); + if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>(); + } +} diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h new file mode 100644 index 000000000..dc96cbb65 --- /dev/null +++ b/src/KOKKOS/atom_vec_atomic_kokkos.h @@ -0,0 +1,111 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale AtomicKokkos/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef ATOM_CLASS + +AtomStyle(atomic/kk,AtomVecAtomicKokkos) + +#else + +#ifndef LMP_ATOM_VEC_ATOMIC_KOKKOS_H +#define LMP_ATOM_VEC_ATOMIC_KOKKOS_H + +#include "atom_vec_kokkos.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class AtomVecAtomicKokkos : public AtomVecKokkos { + public: + AtomVecAtomicKokkos(class LAMMPS *); + virtual ~AtomVecAtomicKokkos() {} + void grow(int); + void copy(int, int, int); + int pack_comm(int, int *, double *, int, int *); + int pack_comm_vel(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + void unpack_comm_vel(int, int, double *); + int pack_reverse(int, int, double *); + void unpack_reverse(int, int *, double *); + int pack_border(int, int *, double *, int, int *); + int pack_border_vel(int, int *, double *, int, int *); + void unpack_border(int, int, double *); + void unpack_border_vel(int, int, double *); + int pack_exchange(int, double *); + int unpack_exchange(double *); + int size_restart(); + int pack_restart(int, double *); + int unpack_restart(double *); + void create_atom(int, double *); + void data_atom(double *, tagint, char **); + void pack_data(double **); + void write_data(FILE *, int, double **); + bigint memory_usage(); + + void grow_reset(); + int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, + const int & iswap, + const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, const int pbc[]); + void unpack_comm_kokkos(const int &n, const int &nfirst, + const DAT::tdual_xfloat_2d &buf); + int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const int nfirst, + const int &pbc_flag, const int pbc[]); + int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, + DAT::tdual_xfloat_2d buf,int iswap, + int pbc_flag, int *pbc, ExecutionSpace space); + void unpack_border_kokkos(const int &n, const int &nfirst, + const DAT::tdual_xfloat_2d &buf, + ExecutionSpace space); + int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf, + DAT::tdual_int_1d k_sendlist, + DAT::tdual_int_1d k_copylist, + ExecutionSpace space, int dim, + X_FLOAT lo, X_FLOAT hi); + int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, + int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, + ExecutionSpace space); + + void sync(ExecutionSpace space, unsigned int mask); + void modified(ExecutionSpace space, unsigned int mask); + + protected: + int *tag,*type,*mask; + tagint *image; + double **x,**v,**f; + + DAT::t_int_1d d_tag, d_type, d_mask; + HAT::t_int_1d h_tag, h_type, h_mask; + + DAT::t_tagint_1d d_image; + HAT::t_tagint_1d h_image; + + DAT::t_x_array d_x; + DAT::t_v_array d_v; + DAT::t_f_array d_f; + HAT::t_x_array h_x; + HAT::t_v_array h_v; + HAT::t_f_array h_f; + + DAT::tdual_int_1d k_count; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp new file mode 100644 index 000000000..1d9174196 --- /dev/null +++ b/src/KOKKOS/atom_vec_kokkos.cpp @@ -0,0 +1,23 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "atom_vec_kokkos.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp) +{ + kokkosable = 1; +} diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h new file mode 100644 index 000000000..ac651b0b5 --- /dev/null +++ b/src/KOKKOS/atom_vec_kokkos.h @@ -0,0 +1,76 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_ATOM_VEC_KOKKOS_H +#define LMP_ATOM_VEC_KOKKOS_H + +#include "atom_vec.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class AtomVecKokkos : public AtomVec { + public: + AtomVecKokkos(class LAMMPS *); + virtual ~AtomVecKokkos() {} + + virtual void sync(ExecutionSpace space, unsigned int mask) {}; + virtual void modified(ExecutionSpace space, unsigned int mask) {}; + + virtual int + pack_comm_self(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const int nfirst, + const int &pbc_flag, const int pbc[]) + {return 0;} + virtual int + pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, const int pbc[]) + {return 0;} + virtual void + unpack_comm_kokkos(const int &n, const int &nfirst, + const DAT::tdual_xfloat_2d &buf) {}; + virtual int + pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, + DAT::tdual_xfloat_2d buf,int iswap, + int pbc_flag, int *pbc, ExecutionSpace space) + {return 0;}; + virtual void + unpack_border_kokkos(const int &n, const int &nfirst, + const DAT::tdual_xfloat_2d &buf, + ExecutionSpace space) {}; + + virtual int + pack_exchange_kokkos(const int &nsend, DAT::tdual_xfloat_2d &buf, + DAT::tdual_int_1d k_sendlist, + DAT::tdual_int_1d k_copylist, + ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) + {return 0;}; + virtual int + unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, + int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, + ExecutionSpace space) + {return 0;}; + + protected: + class AtomKokkos *atomKK; + class CommKokkos *commKK; +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp new file mode 100644 index 000000000..5211d11a0 --- /dev/null +++ b/src/KOKKOS/comm_kokkos.cpp @@ -0,0 +1,820 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "comm_kokkos.h" +#include "kokkos.h" +#include "atom.h" +#include "atom_kokkos.h" +#include "atom_vec.h" +#include "atom_vec_kokkos.h" +#include "domain.h" +#include "atom_masks.h" +#include "error.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +#define BUFFACTOR 1.5 +#define BUFMIN 10000 +#define BUFEXTRA 1000 + +enum{SINGLE,MULTI}; + +/* ---------------------------------------------------------------------- + setup MPI and allocate buffer space +------------------------------------------------------------------------- */ + +CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) +{ + sendlist = NULL; // need to free this since parent allocated? + k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d(); + + // error check for disallow of OpenMP threads? + + // initialize comm buffers & exchange memory + + maxsend = BUFMIN; + k_buf_send = ArrayTypes<LMPDeviceType>:: + tdual_xfloat_2d("comm:k_buf_send",(maxsend+BUFEXTRA+5)/6,6); + buf_send = k_buf_send.view<LMPHostType>().ptr_on_device(); + + maxrecv = BUFMIN; + k_buf_recv = ArrayTypes<LMPDeviceType>:: + tdual_xfloat_2d("comm:k_buf_recv",(maxrecv+5)/6,6); + buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device(); + + k_exchange_sendlist = ArrayTypes<LMPDeviceType>:: + tdual_int_1d("comm:k_exchange_sendlist",100); + k_exchange_copylist = ArrayTypes<LMPDeviceType>:: + tdual_int_1d("comm:k_exchange_copylist",100); + k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1); + k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100); + + // next line is bogus? 
+ + memory->create(maxsendlist,maxswap,"comm:maxsendlist"); + for (int i = 0; i < maxswap; i++) { + maxsendlist[i] = BUFMIN; + } + memory->create_kokkos(k_sendlist,sendlist,maxswap,BUFMIN,"comm:sendlist"); +} + +/* ---------------------------------------------------------------------- */ + +CommKokkos::~CommKokkos() +{ + memory->destroy_kokkos(k_sendlist,sendlist); + memory->destroy_kokkos(k_buf_send,buf_send); + memory->destroy_kokkos(k_buf_recv,buf_recv); +} + +/* ---------------------------------------------------------------------- */ + +void CommKokkos::init() +{ + atomKK = (AtomKokkos *) atom; + exchange_comm_classic = lmp->kokkos->exchange_comm_classic; + forward_comm_classic = lmp->kokkos->forward_comm_classic; + exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; + forward_comm_on_host = lmp->kokkos->forward_comm_on_host; + + CommBrick::init(); +} + +/* ---------------------------------------------------------------------- + forward communication of atom coords every timestep + other per-atom attributes may also be sent via pack/unpack routines +------------------------------------------------------------------------- */ + +void CommKokkos::forward_comm(int dummy) +{ + + if (!forward_comm_classic) { + if (forward_comm_on_host) forward_comm_device<LMPHostType>(dummy); + else forward_comm_device<LMPDeviceType>(dummy); + return; + } + + k_sendlist.sync<LMPHostType>(); + + if (comm_x_only) { + atomKK->sync(Host,X_MASK); + atomKK->modified(Host,X_MASK); + } else if (ghost_velocity) { + atomKK->sync(Host,X_MASK | V_MASK); + atomKK->modified(Host,X_MASK | V_MASK); + } else { + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + } + + CommBrick::forward_comm(dummy); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void CommKokkos::forward_comm_device(int dummy) +{ + int n; + MPI_Request request; + MPI_Status status; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + double **x = atom->x; + double *buf; + + // exchange data with another proc + // if other proc is self, just copy + // if comm_x_only set, exchange or copy directly to x, don't unpack + + k_sendlist.sync<DeviceType>(); + + for (int iswap = 0; iswap < nswap; iswap++) { + + if (sendproc[iswap] != me) { + if (comm_x_only) { + atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK); + if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]]; + else buf = NULL; + + if (size_forward_recv[iswap]) { + buf = atomKK->k_x.view<DeviceType>().ptr_on_device() + + firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1(); + MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + } + n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist, + iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]); + + if (n) { + MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(), + n,MPI_DOUBLE,sendproc[iswap],0,world); + } + + if (size_forward_recv[iswap]) MPI_Wait(&request,&status); + atomKK->modified(ExecutionSpaceFromDevice<DeviceType>:: + space,X_MASK); + } else if (ghost_velocity) { + error->all(FLERR,"Ghost velocity forward comm not yet " + "implemented with Kokkos"); + if (size_forward_recv[iswap]) + MPI_Irecv(k_buf_recv.view<LMPHostType>().ptr_on_device(), + size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world); + if (size_forward_recv[iswap]) 
MPI_Wait(&request,&status); + avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv); + } else { + if (size_forward_recv[iswap]) + MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(), + size_forward_recv[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap, + k_buf_send,pbc_flag[iswap],pbc[iswap]); + if (n) + MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n, + MPI_DOUBLE,sendproc[iswap],0,world); + if (size_forward_recv[iswap]) MPI_Wait(&request,&status); + avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv); + } + + } else { + if (!ghost_velocity) { + if (sendnum[iswap]) + n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap, + firstrecv[iswap],pbc_flag[iswap],pbc[iswap]); + } else if (ghost_velocity) { + error->all(FLERR,"Ghost velocity forward comm not yet " + "implemented with Kokkos"); + n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap], + buf_send,pbc_flag[iswap],pbc[iswap]); + avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send); + } + } + } +} + +/* ---------------------------------------------------------------------- + exchange: move atoms to correct processors + atoms exchanged with all 6 stencil neighbors + send out atoms that have left my box, receive ones entering my box + atoms will be lost if not inside some proc's box + can happen if atom moves outside of non-periodic boundary + or if atom moves more than one proc away + this routine called before every reneighboring + for triclinic, atoms must be in lamda coords (0-1) before exchange is called +------------------------------------------------------------------------- */ + +void CommKokkos::exchange() +{ + if (!exchange_comm_classic) { + if (exchange_comm_on_host) exchange_device<LMPHostType>(); + else exchange_device<LMPDeviceType>(); + return; + } + + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + + CommBrick::exchange(); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct BuildExchangeListFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT _lo,_hi; + typename AT::t_x_array _x; + + int _nlocal,_dim; + typename AT::t_int_1d _nsend; + typename AT::t_int_1d _sendlist; + typename AT::t_int_1d _sendflag; + + + BuildExchangeListFunctor( + const typename AT::tdual_x_array x, + const typename AT::tdual_int_1d sendlist, + typename AT::tdual_int_1d nsend, + typename AT::tdual_int_1d sendflag,int nlocal, int dim, + X_FLOAT lo, X_FLOAT hi): + _x(x.template view<DeviceType>()), + _sendlist(sendlist.template view<DeviceType>()), + _nsend(nsend.template view<DeviceType>()), + _sendflag(sendflag.template view<DeviceType>()), + _nlocal(nlocal),_dim(dim), + _lo(lo),_hi(hi){ + } + + KOKKOS_INLINE_FUNCTION + void operator() (int i) const { + if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) { + const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1); + if(mysend<_sendlist.dimension_0()) { + _sendlist(mysend) = i; + _sendflag(i) = 1; + } + } else + _sendflag(i) = 0; + } +}; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void CommKokkos::exchange_device() +{ + int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal; + double lo,hi,value; + double **x; + double *sublo,*subhi,*buf; + MPI_Request request; + MPI_Status status; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + + // clear global->local map for owned and ghost atoms + // b/c atoms migrate to 
new procs in exchange() and + // new ghosts are created in borders() + // map_set() is done at end of borders() + // clear ghost count and any ghost bonus data internal to AtomVec + + if (map_style) atom->map_clear(); + atom->nghost = 0; + atom->avec->clear_bonus(); + + // subbox bounds for orthogonal or triclinic + + if (triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK); + + // loop over dimensions + for (int dim = 0; dim < 3; dim++) { + + // fill buffer with atoms leaving my box, using < and >= + // when atom is deleted, fill it in with last atom + + x = atom->x; + lo = sublo[dim]; + hi = subhi[dim]; + nlocal = atom->nlocal; + i = nsend = 0; + + if (true) { + if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal); + k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0(); + while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { + k_count.h_view(0) = 0; + k_count.modify<LMPHostType>(); + k_count.sync<DeviceType>(); + + BuildExchangeListFunctor<DeviceType> + f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag, + nlocal,dim,lo,hi); + Kokkos::parallel_for(nlocal,f); + DeviceType::fence(); + k_exchange_sendlist.modify<DeviceType>(); + k_sendflag.modify<DeviceType>(); + k_count.modify<DeviceType>(); + + k_count.sync<LMPHostType>(); + if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { + k_exchange_sendlist.resize(k_count.h_view(0)*1.1); + k_exchange_copylist.resize(k_count.h_view(0)*1.1); + k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0(); + } + } + k_exchange_sendlist.sync<LMPHostType>(); + k_sendflag.sync<LMPHostType>(); + + int sendpos = nlocal-1; + nlocal -= k_count.h_view(0); + for(int i = 0; i < k_count.h_view(0); i++) { + if (k_exchange_sendlist.h_view(i)<nlocal) { + while (k_sendflag.h_view(sendpos)) sendpos--; + k_exchange_copylist.h_view(i) = sendpos; + sendpos--; + } else + k_exchange_copylist.h_view(i) = -1; + } + + k_exchange_copylist.modify<LMPHostType>(); + k_exchange_copylist.sync<DeviceType>(); + nsend = + avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send, + k_exchange_sendlist,k_exchange_copylist, + ExecutionSpaceFromDevice<DeviceType>:: + space,dim,lo,hi); + DeviceType::fence(); + + } else { + while (i < nlocal) { + if (x[i][dim] < lo || x[i][dim] >= hi) { + if (nsend > maxsend) grow_send_kokkos(nsend,1); + nsend += avec->pack_exchange(i,&buf_send[nsend]); + avec->copy(nlocal-1,i,1); + nlocal--; + } else i++; + } + } + atom->nlocal = nlocal; + + // send/recv atoms in both directions + // if 1 proc in dimension, no send/recv, set recv buf to send buf + // if 2 procs in dimension, single send/recv + // if more than 2 procs in dimension, send/recv to both neighbors + + if (procgrid[dim] == 1) { + nrecv = nsend; + buf = buf_send; + if (nrecv) { + atom->nlocal=avec-> + unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi, + ExecutionSpaceFromDevice<DeviceType>::space); + DeviceType::fence(); + } + } else { + MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0, + &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status); + nrecv = nrecv1; + if (procgrid[dim] > 2) { + MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0, + &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status); + nrecv += nrecv2; + } + if (nrecv > maxrecv) grow_recv_kokkos(nrecv); + + MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),nrecv1, + MPI_DOUBLE,procneigh[dim][1],0, + 
world,&request); + MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),nsend, + MPI_DOUBLE,procneigh[dim][0],0,world); + MPI_Wait(&request,&status); + + if (procgrid[dim] > 2) { + MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device()+nrecv1, + nrecv2,MPI_DOUBLE,procneigh[dim][0],0, + world,&request); + MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),nsend, + MPI_DOUBLE,procneigh[dim][1],0,world); + MPI_Wait(&request,&status); + } + + buf = buf_recv; + if (nrecv) { + atom->nlocal = avec-> + unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi, + ExecutionSpaceFromDevice<DeviceType>::space); + DeviceType::fence(); + } + } + + // check incoming atoms to see if they are in my box + // if so, add to my list + + } + + atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK); + + if (atom->firstgroupname) { + /* this is not yet implemented with Kokkos */ + atomKK->sync(Host,ALL_MASK); + atom->first_reorder(); + atomKK->modified(Host,ALL_MASK); + } +} + +/* ---------------------------------------------------------------------- + borders: list nearby atoms to send to neighboring procs at every timestep + one list is created for every swap that will be made + as list is made, actually do swaps + this does equivalent of a communicate, so don't need to explicitly + call communicate routine on reneighboring timestep + this routine is called before every reneighboring + for triclinic, atoms must be in lamda coords (0-1) before borders is called +------------------------------------------------------------------------- */ + +void CommKokkos::borders() +{ + if (!exchange_comm_classic) { + if (exchange_comm_on_host) borders_device<LMPHostType>(); + else borders_device<LMPDeviceType>(); + return; + } + + atomKK->sync(Host,ALL_MASK); + k_sendlist.modify<LMPHostType>(); + atomKK->modified(Host,ALL_MASK); + + CommBrick::borders(); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct BuildBorderListFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT lo,hi; + typename AT::t_x_array x; + int iswap,maxsendlist; + int nfirst,nlast,dim; + typename AT::t_int_2d sendlist; + typename AT::t_int_1d nsend; + + BuildBorderListFunctor(typename AT::tdual_x_array _x, + typename AT::tdual_int_2d _sendlist, + typename AT::tdual_int_1d _nsend,int _nfirst, + int _nlast, int _dim, + X_FLOAT _lo, X_FLOAT _hi, int _iswap, + int _maxsendlist): + x(_x.template view<DeviceType>()), + sendlist(_sendlist.template view<DeviceType>()), + nsend(_nsend.template view<DeviceType>()), + nfirst(_nfirst),nlast(_nlast),dim(_dim), + lo(_lo),hi(_hi),iswap(_iswap),maxsendlist(_maxsendlist){} + + + KOKKOS_INLINE_FUNCTION + void operator() (DeviceType dev) const { + const int chunk = ((nlast - nfirst + dev.league_size() - 1 ) / + dev.league_size()); + const int teamstart = chunk*dev.league_rank() + nfirst; + const int teamend = (teamstart + chunk) < nlast?(teamstart + chunk):nlast; + int mysend = 0; + for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) { + if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++; + } + const int my_store_pos = dev.team_scan(mysend,&nsend(0)); + + if (my_store_pos+mysend < maxsendlist) { + mysend = my_store_pos; + for(int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()){ + if (x(i,dim) >= lo && x(i,dim) <= hi) { + sendlist(iswap,mysend++) = i; + } + } + } + } + + size_t shmem_size() const { return 1000u;} +}; + +/* 
---------------------------------------------------------------------- */ + +template<class DeviceType> +void CommKokkos::borders_device() { + int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax; + int nsend,nrecv,sendflag,nfirst,nlast,ngroup; + double lo,hi; + int *type; + double **x; + double *buf,*mlo,*mhi; + MPI_Request request; + MPI_Status status; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + + ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space; + k_sendlist.modify<DeviceType>(); + atomKK->sync(exec_space,ALL_MASK); + + // do swaps over all 3 dimensions + + iswap = 0; + smax = rmax = 0; + + for (dim = 0; dim < 3; dim++) { + nlast = 0; + twoneed = 2*maxneed[dim]; + for (ineed = 0; ineed < twoneed; ineed++) { + + // find atoms within slab boundaries lo/hi using <= and >= + // check atoms between nfirst and nlast + // for first swaps in a dim, check owned and ghost + // for later swaps in a dim, only check newly arrived ghosts + // store sent atom indices in list for use in future timesteps + + x = atom->x; + if (style == SINGLE) { + lo = slablo[iswap]; + hi = slabhi[iswap]; + } else { + type = atom->type; + mlo = multilo[iswap]; + mhi = multihi[iswap]; + } + if (ineed % 2 == 0) { + nfirst = nlast; + nlast = atom->nlocal + atom->nghost; + } + + nsend = 0; + + // sendflag = 0 if I do not send on this swap + // sendneed test indicates receiver no longer requires data + // e.g. due to non-PBC or non-uniform sub-domains + + if (ineed/2 >= sendneed[dim][ineed % 2]) sendflag = 0; + else sendflag = 1; + + // find send atoms according to SINGLE vs MULTI + // all atoms eligible versus atoms in bordergroup + // only need to limit loop to bordergroup for first sends (ineed < 2) + // on these sends, break loop in two: owned (in group) and ghost + + if (sendflag) { + if (!bordergroup || ineed >= 2) { + if (style == SINGLE) { + typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1); + total_send.h_view(0) = 0; + if(exec_space == Device) { + total_send.template modify<DeviceType>(); + total_send.template sync<LMPDeviceType>(); + } + BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist, + total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + Kokkos::ParallelWorkRequest config((nlast-nfirst+127)/128,128); + Kokkos::parallel_for(config,f); + DeviceType::fence(); + total_send.template modify<DeviceType>(); + total_send.template sync<LMPHostType>(); + + if(total_send.h_view(0) >= maxsendlist[iswap]) { + grow_list(iswap,total_send.h_view(0)); + total_send.h_view(0) = 0; + if(exec_space == Device) { + total_send.template modify<LMPHostType>(); + total_send.template sync<LMPDeviceType>(); + } + BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist, + total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + Kokkos::ParallelWorkRequest config((nlast-nfirst+127)/128,128); + Kokkos::parallel_for(config,f); + DeviceType::fence(); + total_send.template modify<DeviceType>(); + total_send.template sync<LMPHostType>(); + } + nsend = total_send.h_view(0); + } else { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + for (i = nfirst; i < nlast; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } + } + + } else { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + if (style == SINGLE) { + ngroup = atom->nfirst; + for (i = 0; i < ngroup; i++) + if (x[i][dim] >= 
lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + for (i = atom->nlocal; i < nlast; i++) + if (x[i][dim] >= lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } else { + ngroup = atom->nfirst; + for (i = 0; i < ngroup; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } + for (i = atom->nlocal; i < nlast; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } + } + } + } + + // pack up list of border atoms + + if (nsend*size_border > maxsend) + grow_send_kokkos(nsend*size_border,0); + if (ghost_velocity) { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + n = avec->pack_border_vel(nsend,sendlist[iswap],buf_send, + pbc_flag[iswap],pbc[iswap]); + } + else + n = avec-> + pack_border_kokkos(nsend,k_sendlist,k_buf_send,iswap, + pbc_flag[iswap],pbc[iswap],exec_space); + + // swap atoms with other proc + // no MPI calls except SendRecv if nsend/nrecv = 0 + // put incoming ghosts at end of my atom arrays + // if swapping with self, simply copy, no messages + + if (sendproc[iswap] != me) { + MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, + &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); + if (nrecv*size_border > maxrecv) grow_recv_kokkos(nrecv*size_border); + if (nrecv) MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(), + nrecv*size_border,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + if (n) MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n, + MPI_DOUBLE,sendproc[iswap],0,world); + if (nrecv) MPI_Wait(&request,&status); + buf = buf_recv; + } else { + nrecv = nsend; + buf = buf_send; + } + + // unpack buffer + + if (ghost_velocity) { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf); + } + else + if (sendproc[iswap] != me) + avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, + k_buf_recv,exec_space); + else + avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, + k_buf_send,exec_space); + + // set all pointers & counters + + smax = MAX(smax,nsend); + rmax = MAX(rmax,nrecv); + sendnum[iswap] = nsend; + recvnum[iswap] = nrecv; + size_forward_recv[iswap] = nrecv*size_forward; + size_reverse_send[iswap] = nrecv*size_reverse; + size_reverse_recv[iswap] = nsend*size_reverse; + firstrecv[iswap] = atom->nlocal + atom->nghost; + atom->nghost += nrecv; + iswap++; + } + } + + // insure send/recv buffers are long enough for all forward & reverse comm + + int max = MAX(maxforward*smax,maxreverse*rmax); + if (max > maxsend) grow_send_kokkos(max,0); + max = MAX(maxforward*rmax,maxreverse*smax); + if (max > maxrecv) grow_recv_kokkos(max); + + // reset global->local map + + if (map_style) atom->map_set(); + if (exec_space == Host) k_sendlist.sync<LMPDeviceType>(); + atomKK->modified(exec_space,ALL_MASK); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA + if flag = 1, realloc + if flag = 0, don't need to realloc with copy, just free/malloc +------------------------------------------------------------------------- */ + +void 
CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space) +{ + maxsend = static_cast<int> (BUFFACTOR * n); + int maxsend_border = (maxsend+BUFEXTRA+5)/atom->avec->size_border + 2; + if (flag) { + if(space == Device) + k_buf_send.modify<LMPDeviceType>(); + else + k_buf_send.modify<LMPHostType>(); + + k_buf_send.resize(maxsend_border,atom->avec->size_border); + buf_send = k_buf_send.view<LMPHostType>().ptr_on_device(); + } + else { + k_buf_send = ArrayTypes<LMPDeviceType>:: + tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border); + buf_send = k_buf_send.view<LMPHostType>().ptr_on_device(); + } +} + +/* ---------------------------------------------------------------------- + free/malloc the size of the recv buffer as needed with BUFFACTOR +------------------------------------------------------------------------- */ + +void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space) +{ + maxrecv = static_cast<int> (BUFFACTOR * n); + int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2; + k_buf_recv = ArrayTypes<LMPDeviceType>:: + tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border); + buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device(); +} + +/* ---------------------------------------------------------------------- + realloc the size of the iswap sendlist as needed with BUFFACTOR +------------------------------------------------------------------------- */ + +void CommKokkos::grow_list(int iswap, int n) +{ + int size = static_cast<int> (BUFFACTOR * n); + + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); + + for(int i=0;i<maxswap;i++) { + maxsendlist[i]=size; sendlist[i]=&k_sendlist.view<LMPHostType>()(i,0); + } +} + +/* ---------------------------------------------------------------------- + realloc the buffers needed for swaps +------------------------------------------------------------------------- */ + +void CommKokkos::grow_swap(int n) +{ + free_swap(); + allocate_swap(n); + if (style == MULTI) { + free_multi(); + allocate_multi(n); + } + + maxswap = n; + int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN); + + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); + + memory->grow(maxsendlist,n,"comm:maxsendlist"); + for (int i=0;i<maxswap;i++) maxsendlist[i]=size; +} diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h new file mode 100644 index 000000000..46d3552d2 --- /dev/null +++ b/src/KOKKOS/comm_kokkos.h @@ -0,0 +1,63 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_COMM_KOKKOS_H +#define LMP_COMM_KOKKOS_H + +#include "comm_brick.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class CommKokkos : public CommBrick { + public: + class AtomKokkos *atomKK; + + bool exchange_comm_classic; + bool forward_comm_classic; + bool exchange_comm_on_host; + bool forward_comm_on_host; + + CommKokkos(class LAMMPS *); + ~CommKokkos(); + void init(); + + void forward_comm(int dummy = 0); // forward comm of atom coords + void exchange(); // move atoms to new procs + void borders(); // setup list of atoms to comm + + template<class DeviceType> void forward_comm_device(int dummy); + template<class DeviceType> void exchange_device(); + template<class DeviceType> void borders_device(); + + protected: + DAT::tdual_int_2d k_sendlist; + DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; + DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag; + DAT::tdual_int_1d k_count; + //double *buf_send; // send buffer for all comm + //double *buf_recv; // recv buffer for all comm + + void grow_send_kokkos(int, int, ExecutionSpace space = Host); + void grow_recv_kokkos(int, ExecutionSpace space = Host); + void grow_list(int, int); + void grow_swap(int); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/domain_kokkos.cpp b/src/KOKKOS/domain_kokkos.cpp new file mode 100644 index 000000000..c2214b611 --- /dev/null +++ b/src/KOKKOS/domain_kokkos.cpp @@ -0,0 +1,207 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "domain_kokkos.h" +#include "atom_kokkos.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +DomainKokkos::DomainKokkos(LAMMPS *lmp) : Domain(lmp) {} + +/* ---------------------------------------------------------------------- */ + +void DomainKokkos::init() +{ + atomKK = (AtomKokkos *) atom; + Domain::init(); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType, int PERIODIC, int DEFORM_VREMAP> +struct DomainPBCFunctor { + typedef DeviceType device_type; + double lo[3],hi[3],period[3]; + typename ArrayTypes<DeviceType>::t_x_array x; + typename ArrayTypes<DeviceType>::t_v_array v; + typename ArrayTypes<DeviceType>::t_int_1d mask; + typename ArrayTypes<DeviceType>::t_int_1d image; + int deform_groupbit; + double h_rate[6]; + int xperiodic,yperiodic,zperiodic; + + DomainPBCFunctor(double* _lo, double* _hi, double* _period, + DAT::tdual_x_array _x, DAT::tdual_v_array _v, + DAT::tdual_int_1d _mask, DAT::tdual_int_1d _image, + int _deform_groupbit, double* _h_rate, + int _xperiodic, int _yperiodic, int _zperiodic): + x(_x.view<DeviceType>()), v(_v.view<DeviceType>()), + mask(_mask.view<DeviceType>()), image(_image.view<DeviceType>()), + deform_groupbit(_deform_groupbit), + xperiodic(_xperiodic), yperiodic(_yperiodic), zperiodic(_zperiodic){ + lo[0]=_lo[0]; lo[1]=_lo[1]; lo[2]=_lo[2]; + hi[0]=_hi[0]; hi[1]=_hi[1]; hi[2]=_hi[2]; + period[0]=_period[0]; period[1]=_period[1]; period[2]=_period[2]; + h_rate[0]=_h_rate[0]; h_rate[1]=_h_rate[1]; h_rate[2]=_h_rate[2]; + h_rate[3]=_h_rate[3]; h_rate[4]=_h_rate[4]; h_rate[5]=_h_rate[5]; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + if (PERIODIC && xperiodic) { + if (x(i,0) < lo[0]) { + x(i,0) += period[0]; + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) v(i,0) += h_rate[0]; + int idim = image[i] & IMGMASK; + const int otherdims = image[i] ^ idim; + idim--; + idim &= IMGMASK; + image[i] = otherdims | idim; + } + if (x(i,0) >= hi[0]) { + x(i,0) -= period[0]; + x(i,0) = MAX(x(i,0),lo[0]); + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) v(i,0) -= h_rate[0]; + int idim = image[i] & IMGMASK; + const int otherdims = image[i] ^ idim; + idim++; + idim &= IMGMASK; + image[i] = otherdims | idim; + } + } + + if (PERIODIC && yperiodic) { + if (x(i,1) < lo[1]) { + x(i,1) += period[1]; + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) += h_rate[5]; + v(i,1) += h_rate[1]; + } + int idim = (image[i] >> IMGBITS) & IMGMASK; + const int otherdims = image[i] ^ (idim << IMGBITS); + idim--; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMGBITS); + } + if (x(i,1) >= hi[1]) { + x(i,1) -= period[1]; + x(i,1) = MAX(x(i,1),lo[1]); + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) -= h_rate[5]; + v(i,1) -= h_rate[1]; + } + int idim = (image[i] >> IMGBITS) & IMGMASK; + const int otherdims = image[i] ^ (idim << IMGBITS); + idim++; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMGBITS); + } + } + + if (PERIODIC && zperiodic) { + if (x(i,2) < lo[2]) { + x(i,2) += period[2]; + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) += h_rate[4]; + v(i,1) += h_rate[3]; + v(i,2) += h_rate[2]; + } + int idim = image[i] >> IMG2BITS; + const int otherdims = image[i] ^ (idim << IMG2BITS); + idim--; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMG2BITS); + } + if (x(i,2) >= 
hi[2]) { + x(i,2) -= period[2]; + x(i,2) = MAX(x(i,2),lo[2]); + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) -= h_rate[4]; + v(i,1) -= h_rate[3]; + v(i,2) -= h_rate[2]; + } + int idim = image[i] >> IMG2BITS; + const int otherdims = image[i] ^ (idim << IMG2BITS); + idim++; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMG2BITS); + } + } + } +}; + +/* ---------------------------------------------------------------------- + enforce PBC and modify box image flags for each atom + called every reneighboring and by other commands that change atoms + resulting coord must satisfy lo <= coord < hi + MAX is important since coord - prd < lo can happen when coord = hi + if fix deform, remap velocity of fix group atoms by box edge velocities + for triclinic, atoms must be in lamda coords (0-1) before pbc is called + image = 10 bits for each dimension + increment/decrement in wrap-around fashion +------------------------------------------------------------------------- */ + +void DomainKokkos::pbc() +{ + double *lo,*hi,*period; + int nlocal = atomKK->nlocal; + + if (triclinic == 0) { + lo = boxlo; + hi = boxhi; + period = prd; + } else { + lo = boxlo_lamda; + hi = boxhi_lamda; + period = prd_lamda; + } + + atomKK->sync(Device,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK); + atomKK->modified(Device,X_MASK|V_MASK); + + if (xperiodic || yperiodic || zperiodic) { + if (deform_vremap) { + DomainPBCFunctor<LMPDeviceType,1,1> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } else { + DomainPBCFunctor<LMPDeviceType,1,0> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } + } else { + if (deform_vremap) { + DomainPBCFunctor<LMPDeviceType,0,1> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } else { + DomainPBCFunctor<LMPDeviceType,0,0> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } + } + + LMPDeviceType::fence(); +} + diff --git a/src/KOKKOS/domain_kokkos.h b/src/KOKKOS/domain_kokkos.h new file mode 100644 index 000000000..36e0aa4aa --- /dev/null +++ b/src/KOKKOS/domain_kokkos.h @@ -0,0 +1,38 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_DOMAIN_KOKKOS_H +#define LMP_DOMAIN_KOKKOS_H + +#include "domain.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class DomainKokkos : public Domain { + public: + class AtomKokkos *atomKK; + + DomainKokkos(class LAMMPS *); + ~DomainKokkos() {} + void init(); + void pbc(); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/fix_nve_kokkos.cpp b/src/KOKKOS/fix_nve_kokkos.cpp new file mode 100644 index 000000000..3076dca4f --- /dev/null +++ b/src/KOKKOS/fix_nve_kokkos.cpp @@ -0,0 +1,177 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "stdio.h" +#include "string.h" +#include "fix_nve_kokkos.h" +#include "atom_masks.h" +#include "atom_kokkos.h" +#include "force.h" +#include "update.h" +#include "respa.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +FixNVEKokkos<DeviceType>::FixNVEKokkos(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + + datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | RMASS_MASK | TYPE_MASK; + datamask_modify = X_MASK | V_MASK | F_MASK; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::init() +{ + FixNVE::init(); + + atomKK->k_mass.modify<LMPHostType>(); + atomKK->k_mass.sync<LMPDeviceType>(); +} + +/* ---------------------------------------------------------------------- + allow for both per-type and per-atom mass +------------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::initial_integrate(int vflag) +{ + atomKK->sync(execution_space,datamask_read); + atomKK->modified(execution_space,datamask_modify); + + x = atomKK->k_x.view<DeviceType>(); + v = atomKK->k_v.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + rmass = atomKK->rmass; + mass = atomKK->k_mass.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + mask = atomKK->k_mask.view<DeviceType>(); + int nlocal = atomKK->nlocal; + if (igroup == atomKK->firstgroup) nlocal = atomKK->nfirst; + + if (rmass) { + FixNVEKokkosInitialIntegrateFunctor<DeviceType,1> functor(this); + Kokkos::parallel_for(nlocal,functor); + } else { + FixNVEKokkosInitialIntegrateFunctor<DeviceType,0> functor(this); + Kokkos::parallel_for(nlocal,functor); + } + DeviceType::fence(); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::initial_integrate_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / mass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + x(i,0) += dtv * v(i,0); + x(i,1) += dtv * 
v(i,1); + x(i,2) += dtv * v(i,2); + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::initial_integrate_rmass_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / rmass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + x(i,0) += dtv * v(i,0); + x(i,1) += dtv * v(i,1); + x(i,2) += dtv * v(i,2); + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::final_integrate() +{ + atomKK->sync(execution_space,datamask_read); + atomKK->modified(execution_space,datamask_modify); + + v = atomKK->k_v.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + rmass = atomKK->rmass; + mass = atomKK->k_mass.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + mask = atomKK->k_mask.view<DeviceType>(); + int nlocal = atomKK->nlocal; + if (igroup == atomKK->firstgroup) nlocal = atomKK->nfirst; + + if (rmass) { + FixNVEKokkosFinalIntegrateFunctor<DeviceType,1> functor(this); + Kokkos::parallel_for(nlocal,functor); + } else { + FixNVEKokkosFinalIntegrateFunctor<DeviceType,0> functor(this); + Kokkos::parallel_for(nlocal,functor); + } + DeviceType::fence(); + + // debug + //atomKK->sync(Host,datamask_read); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::final_integrate_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / mass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::final_integrate_rmass_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / rmass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::cleanup_copy() +{ + id = style = NULL; + vatom = NULL; +} + +template class FixNVEKokkos<LMPDeviceType>; +#if DEVICE==2 +template class FixNVEKokkos<LMPHostType>; +#endif diff --git a/src/KOKKOS/fix_nve_kokkos.h b/src/KOKKOS/fix_nve_kokkos.h new file mode 100644 index 000000000..bd9ec4d81 --- /dev/null +++ b/src/KOKKOS/fix_nve_kokkos.h @@ -0,0 +1,110 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/kk,FixNVEKokkos<LMPDeviceType>) +FixStyle(nve/kk/device,FixNVEKokkos<LMPDeviceType>) +FixStyle(nve/kk/host,FixNVEKokkos<LMPHostType>) + +#else + +#ifndef LMP_FIX_NVE_KOKKOS_H +#define LMP_FIX_NVE_KOKKOS_H + +#include "fix_nve.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +template<class DeviceType> +class FixNVEKokkos; + +template <class DeviceType, int RMass> +class FixNVEKokkosInitialIntegrateFunctor; +template <class DeviceType, int RMass> +class FixNVEKokkosFinalIntegrateFunctor; + +template<class DeviceType> +class FixNVEKokkos : public FixNVE { + public: + FixNVEKokkos(class LAMMPS *, int, char **); + ~FixNVEKokkos() {} + void cleanup_copy(); + void init(); + void initial_integrate(int); + void final_integrate(); + + KOKKOS_INLINE_FUNCTION + void initial_integrate_item(int) const; + KOKKOS_INLINE_FUNCTION + void initial_integrate_rmass_item(int) const; + KOKKOS_INLINE_FUNCTION + void final_integrate_item(int) const; + KOKKOS_INLINE_FUNCTION + void final_integrate_rmass_item(int) const; + + private: + class AtomKokkos *atomKK; + + typename ArrayTypes<DeviceType>::t_x_array x; + typename ArrayTypes<DeviceType>::t_v_array v; + typename ArrayTypes<DeviceType>::t_f_array_const f; + double *rmass; + typename ArrayTypes<DeviceType>::t_float_1d_randomread mass; + typename ArrayTypes<DeviceType>::t_int_1d type; + typename ArrayTypes<DeviceType>::t_int_1d mask; +}; + +template <class DeviceType, int RMass> +struct FixNVEKokkosInitialIntegrateFunctor { + typedef DeviceType device_type ; + FixNVEKokkos<DeviceType> c; + + FixNVEKokkosInitialIntegrateFunctor(FixNVEKokkos<DeviceType>* c_ptr): + c(*c_ptr) {c.cleanup_copy();}; + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (RMass) c.initial_integrate_rmass_item(i); + else c.initial_integrate_item(i); + } +}; + +template <class DeviceType, int RMass> +struct FixNVEKokkosFinalIntegrateFunctor { + typedef DeviceType device_type ; + FixNVEKokkos<DeviceType> c; + + FixNVEKokkosFinalIntegrateFunctor(FixNVEKokkos<DeviceType>* c_ptr): + c(*c_ptr) {c.cleanup_copy();}; + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (RMass) c.final_integrate_rmass_item(i); + else c.final_integrate_item(i); + } +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +*/ diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp new file mode 100644 index 000000000..4f6031f22 --- /dev/null +++ b/src/KOKKOS/kokkos.cpp @@ -0,0 +1,220 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "stdio.h" +#include "string.h" +#include "stdlib.h" +#include "ctype.h" +#include "kokkos.h" +#include "lammps.h" +#include "neighbor_kokkos.h" +#include "neigh_list_kokkos.h" +#include "error.h" + +using namespace LAMMPS_NS; + +enum{FULL,HALFTHREAD,HALF}; + +/* ---------------------------------------------------------------------- */ + +KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) +{ + kokkos_exists = 1; + lmp->kokkos = this; + + // process any command-line args that invoke Kokkos settings + + int device = 0; + int num_threads = 1; + int numa = 1; + + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg],"d") == 0 || strcmp(arg[iarg],"device") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args"); + device = atoi(arg[iarg+1]); + iarg += 2; + + } else if (strcmp(arg[iarg],"g") == 0 || + strcmp(arg[iarg],"gpus") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args"); + int ngpu = atoi(arg[iarg+1]); + iarg += 2; + + int skip_gpu = 9999; + if (iarg+2 < narg && isdigit(arg[iarg+2][0])) { + skip_gpu = atoi(arg[iarg+2]); + iarg++; + } + + char *str; + if (str = getenv("SLURM_LOCALID")) { + int local_rank = atoi(str); + device = local_rank % ngpu; + if (device >= skip_gpu) device++; + } + if (str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) { + int local_rank = atoi(str); + device = local_rank % ngpu; + if (device >= skip_gpu) device++; + } + if (str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) { + int local_rank = atoi(str); + device = local_rank % ngpu; + if (device >= skip_gpu) device++; + } + + } else if (strcmp(arg[iarg],"t") == 0 || + strcmp(arg[iarg],"threads") == 0) { + num_threads = atoi(arg[iarg+1]); + iarg += 2; + + } else if (strcmp(arg[iarg],"n") == 0 || + strcmp(arg[iarg],"numa") == 0) { + numa = atoi(arg[iarg+1]); + iarg += 2; + + } else error->all(FLERR,"Invalid Kokkos command-line args"); + } + + // initialize Kokkos + +#if DEVICE==2 + Kokkos::Cuda::host_mirror_device_type::initialize(num_threads,numa); + Kokkos::Cuda::SelectDevice select_device(device); + Kokkos::Cuda::initialize(select_device); +#else + LMPHostType::initialize(num_threads,numa); +#endif + + // default settings for package kokkos command + + neighflag = FULL; + exchange_comm_classic = 0; + forward_comm_classic = 0; + exchange_comm_on_host = 1; + forward_comm_on_host = 1; +} + +/* ---------------------------------------------------------------------- */ + +KokkosLMP::~KokkosLMP() +{ + // finalize Kokkos + +#if DEVICE==2 + Kokkos::Cuda::finalize(); + Kokkos::Cuda::host_mirror_device_type::finalize(); +#else + LMPHostType::finalize(); +#endif +} + +/* ---------------------------------------------------------------------- + invoked by package kokkos command +------------------------------------------------------------------------- */ + +void KokkosLMP::accelerator(int narg, char **arg) +{ + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg],"neigh") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package command"); + if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL; + else if (strcmp(arg[iarg+1],"half/thread") == 0) neighflag = HALFTHREAD; + else if (strcmp(arg[iarg+1],"half") == 0) neighflag = HALF; + else if (strcmp(arg[iarg+1],"n2") == 0) neighflag = N2; + else if (strcmp(arg[iarg+1],"full/cluster") == 0) neighflag = FULLCLUSTER; + else error->all(FLERR,"Illegal package command"); + iarg += 2; + } else if 
(strcmp(arg[iarg],"comm/exchange") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package command"); + if (strcmp(arg[iarg+1],"no") == 0) exchange_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + exchange_comm_classic = 0; + exchange_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + exchange_comm_classic = 0; + exchange_comm_on_host = 0; + } else error->all(FLERR,"Illegal package command"); + iarg += 2; + } else if (strcmp(arg[iarg],"comm/forward") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package command"); + if (strcmp(arg[iarg+1],"no") == 0) forward_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + forward_comm_classic = 0; + forward_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + forward_comm_classic = 0; + forward_comm_on_host = 0; + } else error->all(FLERR,"Illegal package command"); + iarg += 2; + } else error->all(FLERR,"Illegal package command"); + } +} + +/* ---------------------------------------------------------------------- + called by Finish +------------------------------------------------------------------------- */ + +int KokkosLMP::neigh_list_kokkos(int m) +{ + NeighborKokkos *nk = (NeighborKokkos *) neighbor; + if (nk->lists_host[m] && nk->lists_host[m]->d_numneigh.dimension_0()) + return 1; + if (nk->lists_device[m] && nk->lists_device[m]->d_numneigh.dimension_0()) + return 1; + return 0; +} + +/* ---------------------------------------------------------------------- + called by Finish +------------------------------------------------------------------------- */ + +int KokkosLMP::neigh_count(int m) +{ + int inum; + int nneigh = 0; + + ArrayTypes<LMPHostType>::t_int_1d h_ilist; + ArrayTypes<LMPHostType>::t_int_1d h_numneigh; + + NeighborKokkos *nk = (NeighborKokkos *) neighbor; + if (nk->lists_host[m]) { + inum = nk->lists_host[m]->inum; +#ifndef KOKKOS_USE_UVM + h_ilist = Kokkos::create_mirror_view(nk->lists_host[m]->d_ilist); + h_numneigh = Kokkos::create_mirror_view(nk->lists_host[m]->d_numneigh); +#else + h_ilist = nk->lists_host[m]->d_ilist; + h_numneigh = nk->lists_host[m]->d_numneigh; +#endif + Kokkos::deep_copy(h_ilist,nk->lists_host[m]->d_ilist); + Kokkos::deep_copy(h_numneigh,nk->lists_host[m]->d_numneigh); + } else if (nk->lists_device[m]) { + inum = nk->lists_device[m]->inum; +#ifndef KOKKOS_USE_UVM + h_ilist = Kokkos::create_mirror_view(nk->lists_device[m]->d_ilist); + h_numneigh = Kokkos::create_mirror_view(nk->lists_device[m]->d_numneigh); +#else + h_ilist = nk->lists_device[m]->d_ilist; + h_numneigh = nk->lists_device[m]->d_numneigh; +#endif + Kokkos::deep_copy(h_ilist,nk->lists_device[m]->d_ilist); + Kokkos::deep_copy(h_numneigh,nk->lists_device[m]->d_numneigh); + } + + for (int i = 0; i < inum; i++) nneigh += h_numneigh[h_ilist[i]]; + + return nneigh; +} diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h new file mode 100644 index 000000000..512c76a48 --- /dev/null +++ b/src/KOKKOS/kokkos.h @@ -0,0 +1,40 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef KOKKOS_LMP_H +#define KOKKOS_LMP_H + +#include "pointers.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class KokkosLMP : protected Pointers { + public: + int kokkos_exists; + int neighflag; + int exchange_comm_classic; + int forward_comm_classic; + int exchange_comm_on_host; + int forward_comm_on_host; + + KokkosLMP(class LAMMPS *, int, char **); + ~KokkosLMP(); + void accelerator(int, char **); + int neigh_list_kokkos(int); + int neigh_count(int); +}; + +} + +#endif diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h new file mode 100644 index 000000000..4887b91b1 --- /dev/null +++ b/src/KOKKOS/kokkos_type.h @@ -0,0 +1,617 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_LMPTYPE_KOKKOS_H +#define LMP_LMPTYPE_KOKKOS_H + +#include <Kokkos_View.hpp> +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_DualView.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_Vectorization.hpp> + +#define MAX_TYPES_STACKPARAMS 12 +#define NeighClusterSize 8 +// set LMPHostype and LMPDeviceType + +#ifndef DEVICE +#define DEVICE 1 +#endif + +#if DEVICE==1 + #ifdef KOKKOS_HAVE_OPENMP + #include "Kokkos_OpenMP.hpp" + typedef Kokkos::OpenMP LMPDeviceType; + typedef Kokkos::OpenMP LMPHostType; + #else + #include "Kokkos_Threads.hpp" + typedef Kokkos::Threads LMPDeviceType; + typedef Kokkos::Threads LMPHostType; + #endif + #ifndef __CUDACC__ + struct double2 { + double x, y; + }; + struct float2 { + float x, y; + }; + struct double4 { + double x, y, z, w; + }; + struct float4 { + float x, y, z, w; + }; + #endif +#else + #include "cuda.h" + #include "cuda_runtime.h" + #include "Kokkos_Cuda.hpp" + #include "Kokkos_Threads.hpp" + typedef Kokkos::Cuda LMPDeviceType; + typedef Kokkos::Cuda::host_mirror_device_type LMPHostType; +#endif + +// set ExecutionSpace stuct with variable "space" + +template<class Device> +struct ExecutionSpaceFromDevice; + +#ifdef KOKKOS_HAVE_OPENMP +template<> +struct ExecutionSpaceFromDevice<Kokkos::OpenMP> { + static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Host; +}; +#else +template<> +struct ExecutionSpaceFromDevice<Kokkos::Threads> { + static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Host; +}; +#endif +#if DEVICE==2 +template<> +struct ExecutionSpaceFromDevice<Kokkos::Cuda> { + static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Device; +}; +#endif + +// define precision +// handle global precision, force, energy, positions, kspace separately + +#ifndef PRECISION +#define PRECISION 2 +#endif +#if PRECISION==1 +typedef float LMP_FLOAT; +typedef float2 LMP_FLOAT2; +typedef float4 LMP_FLOAT4; +#else +typedef double LMP_FLOAT; +typedef double2 LMP_FLOAT2; +typedef double4 LMP_FLOAT4; +#endif + +#ifndef PREC_FORCE +#define PREC_FORCE PRECISION +#endif + +#if PREC_FORCE==1 +typedef float F_FLOAT; +typedef float2 F_FLOAT2; +typedef float4 F_FLOAT4; 
+#else +typedef double F_FLOAT; +typedef double2 F_FLOAT2; +typedef double4 F_FLOAT4; +#endif + +#ifndef PREC_ENERGY +#define PREC_ENERGY PRECISION +#endif + +#if PREC_ENERGY==1 +typedef float E_FLOAT; +typedef float2 E_FLOAT2; +typedef float4 E_FLOAT4; +#else +typedef double E_FLOAT; +typedef double2 E_FLOAT2; +typedef double4 E_FLOAT4; +#endif + +struct s_EV_FLOAT { + E_FLOAT evdwl; + E_FLOAT ecoul; + E_FLOAT v[6]; + KOKKOS_INLINE_FUNCTION + s_EV_FLOAT() { + evdwl = 0; + ecoul = 0; + v[0] = 0; v[1] = 0; v[2] = 0; + v[3] = 0; v[4] = 0; v[5] = 0; + } + + KOKKOS_INLINE_FUNCTION + s_EV_FLOAT& operator+=(const s_EV_FLOAT &rhs) { + evdwl += rhs.evdwl; + ecoul += rhs.ecoul; + v[0] += rhs.v[0]; + v[1] += rhs.v[1]; + v[2] += rhs.v[2]; + v[3] += rhs.v[3]; + v[4] += rhs.v[4]; + v[5] += rhs.v[5]; + return *this; + } +}; +typedef struct s_EV_FLOAT EV_FLOAT; + +#ifndef PREC_POS +#define PREC_POS PRECISION +#endif + +#if PREC_POS==1 +typedef float X_FLOAT; +typedef float2 X_FLOAT2; +typedef float4 X_FLOAT4; +#else +typedef double X_FLOAT; +typedef double2 X_FLOAT2; +typedef double4 X_FLOAT4; +#endif + +#ifndef PREC_VELOCITIES +#define PREC_VELOCITIES PRECISION +#endif + +#if PREC_VELOCITIES==1 +typedef float V_FLOAT; +typedef float2 V_FLOAT2; +typedef float4 V_FLOAT4; +#else +typedef double V_FLOAT; +typedef double2 V_FLOAT2; +typedef double4 V_FLOAT4; +#endif + +#if PREC_KSPACE==1 +typedef float K_FLOAT; +typedef float2 K_FLOAT2; +typedef float4 K_FLOAT4; +#else +typedef double K_FLOAT; +typedef double2 K_FLOAT2; +typedef double4 K_FLOAT4; +#endif + +// ------------------------------------------------------------------------ + +// LAMMPS types + +template <class DeviceType> +struct ArrayTypes; + +template <> +struct ArrayTypes<LMPDeviceType> { + +// scalar types + +typedef Kokkos:: + DualView<int, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_scalar; +typedef tdual_int_scalar::t_dev t_int_scalar; +typedef tdual_int_scalar::t_dev_const t_int_scalar_const; +typedef tdual_int_scalar::t_dev_um t_int_scalar_um; +typedef tdual_int_scalar::t_dev_const_um t_int_scalar_const_um; + +typedef Kokkos:: + DualView<LMP_FLOAT, LMPDeviceType::array_layout, LMPDeviceType> + tdual_float_scalar; +typedef tdual_float_scalar::t_dev t_float_scalar; +typedef tdual_float_scalar::t_dev_const t_float_scalar_const; +typedef tdual_float_scalar::t_dev_um t_float_scalar_um; +typedef tdual_float_scalar::t_dev_const_um t_float_scalar_const_um; + +// generic array types + +typedef Kokkos:: + DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_1d; +typedef tdual_int_1d::t_dev t_int_1d; +typedef tdual_int_1d::t_dev_const t_int_1d_const; +typedef tdual_int_1d::t_dev_um t_int_1d_um; +typedef tdual_int_1d::t_dev_const_um t_int_1d_const_um; +typedef tdual_int_1d::t_dev_const_randomread t_int_1d_randomread; + +typedef Kokkos:: + DualView<int**, Kokkos::LayoutRight, LMPDeviceType> tdual_int_2d; +typedef tdual_int_2d::t_dev t_int_2d; +typedef tdual_int_2d::t_dev_const t_int_2d_const; +typedef tdual_int_2d::t_dev_um t_int_2d_um; +typedef tdual_int_2d::t_dev_const_um t_int_2d_const_um; +typedef tdual_int_2d::t_dev_const_randomread t_int_2d_randomread; + +typedef Kokkos:: + DualView<LAMMPS_NS::tagint*, LMPDeviceType::array_layout, LMPDeviceType> + tdual_tagint_1d; +typedef tdual_tagint_1d::t_dev t_tagint_1d; +typedef tdual_tagint_1d::t_dev_const t_tagint_1d_const; +typedef tdual_tagint_1d::t_dev_um t_tagint_1d_um; +typedef tdual_tagint_1d::t_dev_const_um t_tagint_1d_const_um; +typedef 
tdual_tagint_1d::t_dev_const_randomread t_tagint_1d_randomread; + +// 1d float array n + +typedef Kokkos::DualView<LMP_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_float_1d; +typedef tdual_float_1d::t_dev t_float_1d; +typedef tdual_float_1d::t_dev_const t_float_1d_const; +typedef tdual_float_1d::t_dev_um t_float_1d_um; +typedef tdual_float_1d::t_dev_const_um t_float_1d_const_um; +typedef tdual_float_1d::t_dev_const_randomread t_float_1d_randomread; + +//2d float array n +typedef Kokkos::DualView<LMP_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_float_2d; +typedef tdual_float_2d::t_dev t_float_2d; +typedef tdual_float_2d::t_dev_const t_float_2d_const; +typedef tdual_float_2d::t_dev_um t_float_2d_um; +typedef tdual_float_2d::t_dev_const_um t_float_2d_const_um; +typedef tdual_float_2d::t_dev_const_randomread t_float_2d_randomread; + +//Position Types +//1d X_FLOAT array n +typedef Kokkos::DualView<X_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_xfloat_1d; +typedef tdual_xfloat_1d::t_dev t_xfloat_1d; +typedef tdual_xfloat_1d::t_dev_const t_xfloat_1d_const; +typedef tdual_xfloat_1d::t_dev_um t_xfloat_1d_um; +typedef tdual_xfloat_1d::t_dev_const_um t_xfloat_1d_const_um; +typedef tdual_xfloat_1d::t_dev_const_randomread t_xfloat_1d_randomread; + +//2d X_FLOAT array n*m +typedef Kokkos::DualView<X_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_xfloat_2d; +typedef tdual_xfloat_2d::t_dev t_xfloat_2d; +typedef tdual_xfloat_2d::t_dev_const t_xfloat_2d_const; +typedef tdual_xfloat_2d::t_dev_um t_xfloat_2d_um; +typedef tdual_xfloat_2d::t_dev_const_um t_xfloat_2d_const_um; +typedef tdual_xfloat_2d::t_dev_const_randomread t_xfloat_2d_randomread; + +//2d X_FLOAT array n*4 +#ifdef LMP_KOKKOS_NO_LEGACY +typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutLeft, LMPDeviceType> tdual_x_array; +#else +typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_x_array; +#endif +typedef tdual_x_array::t_dev t_x_array; +typedef tdual_x_array::t_dev_const t_x_array_const; +typedef tdual_x_array::t_dev_um t_x_array_um; +typedef tdual_x_array::t_dev_const_um t_x_array_const_um; +typedef tdual_x_array::t_dev_const_randomread t_x_array_randomread; + +//Velocity Types +//1d V_FLOAT array n +typedef Kokkos::DualView<V_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_vfloat_1d; +typedef tdual_vfloat_1d::t_dev t_vfloat_1d; +typedef tdual_vfloat_1d::t_dev_const t_vfloat_1d_const; +typedef tdual_vfloat_1d::t_dev_um t_vfloat_1d_um; +typedef tdual_vfloat_1d::t_dev_const_um t_vfloat_1d_const_um; +typedef tdual_vfloat_1d::t_dev_const_randomread t_vfloat_1d_randomread; + +//2d V_FLOAT array n*m +typedef Kokkos::DualView<V_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_vfloat_2d; +typedef tdual_vfloat_2d::t_dev t_vfloat_2d; +typedef tdual_vfloat_2d::t_dev_const t_vfloat_2d_const; +typedef tdual_vfloat_2d::t_dev_um t_vfloat_2d_um; +typedef tdual_vfloat_2d::t_dev_const_um t_vfloat_2d_const_um; +typedef tdual_vfloat_2d::t_dev_const_randomread t_vfloat_2d_randomread; + +//2d V_FLOAT array n*3 +typedef Kokkos::DualView<V_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_v_array; +//typedef Kokkos::DualView<V_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_v_array; +typedef tdual_v_array::t_dev t_v_array; +typedef tdual_v_array::t_dev_const t_v_array_const; +typedef tdual_v_array::t_dev_um t_v_array_um; +typedef tdual_v_array::t_dev_const_um t_v_array_const_um; +typedef tdual_v_array::t_dev_const_randomread t_v_array_randomread; + +//Force Types +//1d 
F_FLOAT array n + +typedef Kokkos::DualView<F_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_ffloat_1d; +typedef tdual_ffloat_1d::t_dev t_ffloat_1d; +typedef tdual_ffloat_1d::t_dev_const t_ffloat_1d_const; +typedef tdual_ffloat_1d::t_dev_um t_ffloat_1d_um; +typedef tdual_ffloat_1d::t_dev_const_um t_ffloat_1d_const_um; +typedef tdual_ffloat_1d::t_dev_const_randomread t_ffloat_1d_randomread; + +//2d F_FLOAT array n*m + +typedef Kokkos::DualView<F_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_ffloat_2d; +typedef tdual_ffloat_2d::t_dev t_ffloat_2d; +typedef tdual_ffloat_2d::t_dev_const t_ffloat_2d_const; +typedef tdual_ffloat_2d::t_dev_um t_ffloat_2d_um; +typedef tdual_ffloat_2d::t_dev_const_um t_ffloat_2d_const_um; +typedef tdual_ffloat_2d::t_dev_const_randomread t_ffloat_2d_randomread; + +//2d F_FLOAT array n*3 + +typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_f_array; +//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array; +typedef tdual_f_array::t_dev t_f_array; +typedef tdual_f_array::t_dev_const t_f_array_const; +typedef tdual_f_array::t_dev_um t_f_array_um; +typedef tdual_f_array::t_dev_const_um t_f_array_const_um; +typedef tdual_f_array::t_dev_const_randomread t_f_array_randomread; + +//2d F_FLOAT array n*6 (for virial) + +typedef Kokkos::DualView<F_FLOAT*[6], Kokkos::LayoutRight, LMPDeviceType> tdual_virial_array; +typedef tdual_virial_array::t_dev t_virial_array; +typedef tdual_virial_array::t_dev_const t_virial_array_const; +typedef tdual_virial_array::t_dev_um t_virial_array_um; +typedef tdual_virial_array::t_dev_const_um t_virial_array_const_um; +typedef tdual_virial_array::t_dev_const_randomread t_virial_array_randomread; + +//Energy Types +//1d E_FLOAT array n + +typedef Kokkos::DualView<E_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_efloat_1d; +typedef tdual_efloat_1d::t_dev t_efloat_1d; +typedef tdual_efloat_1d::t_dev_const t_efloat_1d_const; +typedef tdual_efloat_1d::t_dev_um t_efloat_1d_um; +typedef tdual_efloat_1d::t_dev_const_um t_efloat_1d_const_um; +typedef tdual_efloat_1d::t_dev_const_randomread t_efloat_1d_randomread; + +//2d E_FLOAT array n*m + +typedef Kokkos::DualView<E_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_efloat_2d; +typedef tdual_efloat_2d::t_dev t_efloat_2d; +typedef tdual_efloat_2d::t_dev_const t_efloat_2d_const; +typedef tdual_efloat_2d::t_dev_um t_efloat_2d_um; +typedef tdual_efloat_2d::t_dev_const_um t_efloat_2d_const_um; +typedef tdual_efloat_2d::t_dev_const_randomread t_efloat_2d_randomread; + +//2d E_FLOAT array n*3 + +typedef Kokkos::DualView<E_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_e_array; +typedef tdual_e_array::t_dev t_e_array; +typedef tdual_e_array::t_dev_const t_e_array_const; +typedef tdual_e_array::t_dev_um t_e_array_um; +typedef tdual_e_array::t_dev_const_um t_e_array_const_um; +typedef tdual_e_array::t_dev_const_randomread t_e_array_randomread; + +//Neighbor Types + +typedef Kokkos::DualView<int**, LMPDeviceType::array_layout, LMPDeviceType> tdual_neighbors_2d; +typedef tdual_neighbors_2d::t_dev t_neighbors_2d; +typedef tdual_neighbors_2d::t_dev_const t_neighbors_2d_const; +typedef tdual_neighbors_2d::t_dev_um t_neighbors_2d_um; +typedef tdual_neighbors_2d::t_dev_const_um t_neighbors_2d_const_um; +typedef tdual_neighbors_2d::t_dev_const_randomread t_neighbors_2d_randomread; + +}; + +#if DEVICE==2 +template <> +struct ArrayTypes<LMPHostType> { + +//Scalar Types + +typedef Kokkos::DualView<int, 
LMPDeviceType::array_layout, LMPDeviceType> tdual_int_scalar; +typedef tdual_int_scalar::t_host t_int_scalar; +typedef tdual_int_scalar::t_host_const t_int_scalar_const; +typedef tdual_int_scalar::t_host_um t_int_scalar_um; +typedef tdual_int_scalar::t_host_const_um t_int_scalar_const_um; + +typedef Kokkos::DualView<LMP_FLOAT, LMPDeviceType::array_layout, LMPDeviceType> tdual_float_scalar; +typedef tdual_float_scalar::t_host t_float_scalar; +typedef tdual_float_scalar::t_host_const t_float_scalar_const; +typedef tdual_float_scalar::t_host_um t_float_scalar_um; +typedef tdual_float_scalar::t_host_const_um t_float_scalar_const_um; + +//Generic ArrayTypes +typedef Kokkos::DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_1d; +typedef tdual_int_1d::t_host t_int_1d; +typedef tdual_int_1d::t_host_const t_int_1d_const; +typedef tdual_int_1d::t_host_um t_int_1d_um; +typedef tdual_int_1d::t_host_const_um t_int_1d_const_um; +typedef tdual_int_1d::t_host_const_randomread t_int_1d_randomread; + +typedef Kokkos::DualView<int**, Kokkos::LayoutRight, LMPDeviceType> tdual_int_2d; +typedef tdual_int_2d::t_host t_int_2d; +typedef tdual_int_2d::t_host_const t_int_2d_const; +typedef tdual_int_2d::t_host_um t_int_2d_um; +typedef tdual_int_2d::t_host_const_um t_int_2d_const_um; +typedef tdual_int_2d::t_host_const_randomread t_int_2d_randomread; + +typedef Kokkos::DualView<LAMMPS_NS::tagint*, LMPDeviceType::array_layout, LMPDeviceType> tdual_tagint_1d; +typedef tdual_tagint_1d::t_host t_tagint_1d; +typedef tdual_tagint_1d::t_host_const t_tagint_1d_const; +typedef tdual_tagint_1d::t_host_um t_tagint_1d_um; +typedef tdual_tagint_1d::t_host_const_um t_tagint_1d_const_um; +typedef tdual_tagint_1d::t_host_const_randomread t_tagint_1d_randomread; + +//1d float array n +typedef Kokkos::DualView<LMP_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_float_1d; +typedef tdual_float_1d::t_host t_float_1d; +typedef tdual_float_1d::t_host_const t_float_1d_const; +typedef tdual_float_1d::t_host_um t_float_1d_um; +typedef tdual_float_1d::t_host_const_um t_float_1d_const_um; +typedef tdual_float_1d::t_host_const_randomread t_float_1d_randomread; + +//2d float array n +typedef Kokkos::DualView<LMP_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_float_2d; +typedef tdual_float_2d::t_host t_float_2d; +typedef tdual_float_2d::t_host_const t_float_2d_const; +typedef tdual_float_2d::t_host_um t_float_2d_um; +typedef tdual_float_2d::t_host_const_um t_float_2d_const_um; +typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread; + +//Position Types +//1d X_FLOAT array n +typedef Kokkos::DualView<X_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_xfloat_1d; +typedef tdual_xfloat_1d::t_host t_xfloat_1d; +typedef tdual_xfloat_1d::t_host_const t_xfloat_1d_const; +typedef tdual_xfloat_1d::t_host_um t_xfloat_1d_um; +typedef tdual_xfloat_1d::t_host_const_um t_xfloat_1d_const_um; +typedef tdual_xfloat_1d::t_host_const_randomread t_xfloat_1d_randomread; + +//2d X_FLOAT array n*m +typedef Kokkos::DualView<X_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_xfloat_2d; +typedef tdual_xfloat_2d::t_host t_xfloat_2d; +typedef tdual_xfloat_2d::t_host_const t_xfloat_2d_const; +typedef tdual_xfloat_2d::t_host_um t_xfloat_2d_um; +typedef tdual_xfloat_2d::t_host_const_um t_xfloat_2d_const_um; +typedef tdual_xfloat_2d::t_host_const_randomread t_xfloat_2d_randomread; + +//2d X_FLOAT array n*3 +typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_x_array; +typedef 
tdual_x_array::t_host t_x_array; +typedef tdual_x_array::t_host_const t_x_array_const; +typedef tdual_x_array::t_host_um t_x_array_um; +typedef tdual_x_array::t_host_const_um t_x_array_const_um; +typedef tdual_x_array::t_host_const_randomread t_x_array_randomread; + +//Velocity Types +//1d V_FLOAT array n +typedef Kokkos::DualView<V_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_vfloat_1d; +typedef tdual_vfloat_1d::t_host t_vfloat_1d; +typedef tdual_vfloat_1d::t_host_const t_vfloat_1d_const; +typedef tdual_vfloat_1d::t_host_um t_vfloat_1d_um; +typedef tdual_vfloat_1d::t_host_const_um t_vfloat_1d_const_um; +typedef tdual_vfloat_1d::t_host_const_randomread t_vfloat_1d_randomread; + +//2d V_FLOAT array n*m +typedef Kokkos::DualView<V_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_vfloat_2d; +typedef tdual_vfloat_2d::t_host t_vfloat_2d; +typedef tdual_vfloat_2d::t_host_const t_vfloat_2d_const; +typedef tdual_vfloat_2d::t_host_um t_vfloat_2d_um; +typedef tdual_vfloat_2d::t_host_const_um t_vfloat_2d_const_um; +typedef tdual_vfloat_2d::t_host_const_randomread t_vfloat_2d_randomread; + +//2d V_FLOAT array n*3 +typedef Kokkos::DualView<V_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_v_array; +//typedef Kokkos::DualView<V_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_v_array; +typedef tdual_v_array::t_host t_v_array; +typedef tdual_v_array::t_host_const t_v_array_const; +typedef tdual_v_array::t_host_um t_v_array_um; +typedef tdual_v_array::t_host_const_um t_v_array_const_um; +typedef tdual_v_array::t_host_const_randomread t_v_array_randomread; + +//Force Types +//1d F_FLOAT array n +typedef Kokkos::DualView<F_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_ffloat_1d; +typedef tdual_ffloat_1d::t_host t_ffloat_1d; +typedef tdual_ffloat_1d::t_host_const t_ffloat_1d_const; +typedef tdual_ffloat_1d::t_host_um t_ffloat_1d_um; +typedef tdual_ffloat_1d::t_host_const_um t_ffloat_1d_const_um; +typedef tdual_ffloat_1d::t_host_const_randomread t_ffloat_1d_randomread; + +//2d F_FLOAT array n*m +typedef Kokkos::DualView<F_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_ffloat_2d; +typedef tdual_ffloat_2d::t_host t_ffloat_2d; +typedef tdual_ffloat_2d::t_host_const t_ffloat_2d_const; +typedef tdual_ffloat_2d::t_host_um t_ffloat_2d_um; +typedef tdual_ffloat_2d::t_host_const_um t_ffloat_2d_const_um; +typedef tdual_ffloat_2d::t_host_const_randomread t_ffloat_2d_randomread; + +//2d F_FLOAT array n*3 +typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_f_array; +//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array; +typedef tdual_f_array::t_host t_f_array; +typedef tdual_f_array::t_host_const t_f_array_const; +typedef tdual_f_array::t_host_um t_f_array_um; +typedef tdual_f_array::t_host_const_um t_f_array_const_um; +typedef tdual_f_array::t_host_const_randomread t_f_array_randomread; + +//2d F_FLOAT array n*6 (for virial) +typedef Kokkos::DualView<F_FLOAT*[6], Kokkos::LayoutRight, LMPDeviceType> tdual_virial_array; +typedef tdual_virial_array::t_host t_virial_array; +typedef tdual_virial_array::t_host_const t_virial_array_const; +typedef tdual_virial_array::t_host_um t_virial_array_um; +typedef tdual_virial_array::t_host_const_um t_virial_array_const_um; +typedef tdual_virial_array::t_host_const_randomread t_virial_array_randomread; + + + +//Energy Types +//1d E_FLOAT array n +typedef Kokkos::DualView<E_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_efloat_1d; +typedef 
tdual_efloat_1d::t_host t_efloat_1d; +typedef tdual_efloat_1d::t_host_const t_efloat_1d_const; +typedef tdual_efloat_1d::t_host_um t_efloat_1d_um; +typedef tdual_efloat_1d::t_host_const_um t_efloat_1d_const_um; +typedef tdual_efloat_1d::t_host_const_randomread t_efloat_1d_randomread; + +//2d E_FLOAT array n*m +typedef Kokkos::DualView<E_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_efloat_2d; +typedef tdual_efloat_2d::t_host t_efloat_2d; +typedef tdual_efloat_2d::t_host_const t_efloat_2d_const; +typedef tdual_efloat_2d::t_host_um t_efloat_2d_um; +typedef tdual_efloat_2d::t_host_const_um t_efloat_2d_const_um; +typedef tdual_efloat_2d::t_host_const_randomread t_efloat_2d_randomread; + +//2d E_FLOAT array n*3 +typedef Kokkos::DualView<E_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_e_array; +typedef tdual_e_array::t_host t_e_array; +typedef tdual_e_array::t_host_const t_e_array_const; +typedef tdual_e_array::t_host_um t_e_array_um; +typedef tdual_e_array::t_host_const_um t_e_array_const_um; +typedef tdual_e_array::t_host_const_randomread t_e_array_randomread; + +//Neighbor Types +typedef Kokkos::DualView<int**, LMPDeviceType::array_layout, LMPDeviceType> tdual_neighbors_2d; +typedef tdual_neighbors_2d::t_host t_neighbors_2d; +typedef tdual_neighbors_2d::t_host_const t_neighbors_2d_const; +typedef tdual_neighbors_2d::t_host_um t_neighbors_2d_um; +typedef tdual_neighbors_2d::t_host_const_um t_neighbors_2d_const_um; +typedef tdual_neighbors_2d::t_host_const_randomread t_neighbors_2d_randomread; + +}; +#endif +//default LAMMPS Types +typedef struct ArrayTypes<LMPDeviceType> DAT; +typedef struct ArrayTypes<LMPHostType> HAT; + +template<class DeviceType, class BufferView, class DualView> +void buffer_view(BufferView &buf, DualView &view, + const size_t n0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0, + const size_t n7 = 0) { + + buf = BufferView( + view.template view<DeviceType>().ptr_on_device(), + n0,n1,n2,n3,n4,n5,n6,n7); + +} + +template<class DeviceType> +struct MemsetZeroFunctor { + typedef DeviceType device_type ; + void* ptr; + KOKKOS_INLINE_FUNCTION void operator()(const int i) const { + ((int*)ptr)[i] = 0; + } +}; + +template<class ViewType> +void memset_kokkos (ViewType &view) { + static MemsetZeroFunctor<typename ViewType::device_type> f; + f.ptr = view.ptr_on_device(); + Kokkos::parallel_for(view.capacity()*sizeof(typename ViewType::value_type)/4, f); + ViewType::device_type::fence(); +} + + +#endif diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h new file mode 100644 index 000000000..2651c5e5c --- /dev/null +++ b/src/KOKKOS/memory_kokkos.h @@ -0,0 +1,208 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Kokkos versions of create/grow/destroy multi-dimensional arrays +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + create a 1d array +------------------------------------------------------------------------- */ + +template <typename TYPE> +TYPE create_kokkos(TYPE &data, typename TYPE::value_type *&array, + int n1, const char *name) +{ + data = TYPE(name,n1); + array = data.h_view.ptr_on_device(); + return data; +} + +template <typename TYPE, typename HTYPE> + TYPE create_kokkos(TYPE &data, HTYPE &h_data, + typename TYPE::value_type *&array, int n1, + const char *name) +{ + data = TYPE(std::string(name),n1); +#ifndef KOKKOS_USE_UVM + h_data = Kokkos::create_mirror_view(data); +#else + h_data = data; +#endif + array = h_data.ptr_on_device(); + return data; +} + + +template <typename TYPE, typename HTYPE> + TYPE create_kokkos(TYPE &data, HTYPE &h_data, + int n1, const char *name) +{ + data = TYPE(std::string(name),n1); +#ifndef KOKKOS_USE_UVM + h_data = Kokkos::create_mirror_view(data); +#else + h_data = data; +#endif + return data; +} + +/* ---------------------------------------------------------------------- + grow or shrink 1st dim of a 1d array + last dim must stay the same +------------------------------------------------------------------------- */ + +template <typename TYPE> +TYPE grow_kokkos(TYPE &data, typename TYPE::value_type *&array, + int n1, const char *name) +{ + if (array == NULL) return create_kokkos(data,array,n1,name); + + data.resize(n1); + array = data.h_view.ptr_on_device(); + return data; +} + +template <typename TYPE> +void destroy_kokkos(TYPE data, typename TYPE::value_type* &array) +{ + if (array == NULL) return; + data = TYPE(); + array = NULL; +} + +/* ---------------------------------------------------------------------- + create a 2d array +------------------------------------------------------------------------- */ + +template <typename TYPE> +TYPE create_kokkos(TYPE &data, int n1, int n2, const char *name) +{ + data = TYPE(name,n1,n2); + return data; +} + +template <typename TYPE, typename HTYPE> + TYPE create_kokkos(TYPE &data, HTYPE &h_data, int n1, int n2, + const char *name) +{ + data = TYPE(std::string(name),n1,n2); +#ifndef KOKKOS_USE_UVM + h_data = Kokkos::create_mirror_view(data); +#else + h_data = data; +#endif + return data; +} + +template <typename TYPE> +TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, + int n1, int n2, const char *name) +{ + data = TYPE(std::string(name),n1,n2); + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; + array = (typename TYPE::value_type **) smalloc(nbytes,name); + + bigint n = 0; + for (int i = 0; i < n1; i++) { + array[i] = &data.h_view(i,0); + n += n2; + } + return data; +} + +template <typename TYPE, typename HTYPE> + TYPE create_kokkos(TYPE &data, HTYPE &h_data, + typename TYPE::value_type **&array, int n1, int n2, + const char *name) +{ + data = TYPE(std::string(name),n1,n2); +#ifndef KOKKOS_USE_UVM + h_data = Kokkos::create_mirror_view(data); +#else + h_data = data; +#endif + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; + array = (typename TYPE::value_type **) smalloc(nbytes,name); + + bigint n = 0; + for (int i = 0; i < n1; i++) { + array[i] = &h_data(i,0); + n += n2; + } + return data; +} + 
+/* ---------------------------------------------------------------------- + grow or shrink 1st dim of a 2d array + last dim must stay the same +------------------------------------------------------------------------- */ + +template <typename TYPE> +TYPE grow_kokkos(TYPE &data, typename TYPE::value_type **&array, + int n1, int n2, const char *name) +{ + if (array == NULL) return create_kokkos(data,array,n1,n2,name); + data.resize(n1,n2); + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; + array = (typename TYPE::value_type**) srealloc(array,nbytes,name); + + for (int i = 0; i < n1; i++) + array[i] = &data.h_view(i,0); + + return data; +} + +template <typename TYPE> +TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, + int n1, const char *name) +{ + data = TYPE(std::string(name),n1); + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; + array = (typename TYPE::value_type **) smalloc(nbytes,name); + + for (int i = 0; i < n1; i++) + array[i] = &data.h_view(i,0); + + return data; +} + +template <typename TYPE> +TYPE grow_kokkos(TYPE &data, typename TYPE::value_type **&array, + int n1, const char *name) +{ + if (array == NULL) return create_kokkos(data,array,n1,name); + + data.resize(n1); + + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; + array = (typename TYPE::value_type **) smalloc(nbytes,name); + + for (int i = 0; i < n1; i++) + array[i] = &data.h_view(i,0); + + return data; +} + +/* ---------------------------------------------------------------------- + destroy a 2d array +------------------------------------------------------------------------- */ + +template <typename TYPE> +void destroy_kokkos(TYPE data, typename TYPE::value_type** &array) +{ + if (array == NULL) return; + data = TYPE(); + sfree(array); + array = NULL; +} diff --git a/src/KOKKOS/modify_kokkos.cpp b/src/KOKKOS/modify_kokkos.cpp new file mode 100644 index 000000000..4fcd13615 --- /dev/null +++ b/src/KOKKOS/modify_kokkos.cpp @@ -0,0 +1,585 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "modify_kokkos.h" +#include "atom_kokkos.h" +#include "update.h" +#include "fix.h" +#include "compute.h" + +using namespace LAMMPS_NS; + +#define BIG 1.0e20 + +/* ---------------------------------------------------------------------- */ + +ModifyKokkos::ModifyKokkos(LAMMPS *lmp) : Modify(lmp) +{ + atomKK = (AtomKokkos *) atom; +} + +/* ---------------------------------------------------------------------- + setup for run, calls setup() of all fixes and computes + called from Verlet, RESPA, Min +------------------------------------------------------------------------- */ + +void ModifyKokkos::setup(int vflag) +{ + // compute setup needs to come before fix setup + // b/c NH fixes need use DOF of temperature computes + + for (int i = 0; i < ncompute; i++) compute[i]->setup(); + + if (update->whichflag == 1) + for (int i = 0; i < nfix; i++) { + atomKK->sync(fix[i]->execution_space,fix[i]->datamask_read); + atomKK->modified(fix[i]->execution_space,fix[i]->datamask_modify); + fix[i]->setup(vflag); + } + else if (update->whichflag == 2) + for (int i = 0; i < nfix; i++) { + atomKK->sync(fix[i]->execution_space,fix[i]->datamask_read); + atomKK->modified(fix[i]->execution_space,fix[i]->datamask_modify); + fix[i]->min_setup(vflag); + } +} + +/* ---------------------------------------------------------------------- + setup pre_exchange call, only for fixes that define pre_exchange + called from Verlet, RESPA, Min, and WriteRestart with whichflag = 0 +------------------------------------------------------------------------- */ + +void ModifyKokkos::setup_pre_exchange() +{ + if (update->whichflag <= 1) + for (int i = 0; i < n_pre_exchange; i++) { + atomKK->sync(fix[list_pre_exchange[i]]->execution_space, + fix[list_pre_exchange[i]]->datamask_read); + atomKK->modified(fix[list_pre_exchange[i]]->execution_space, + fix[list_pre_exchange[i]]->datamask_modify); + fix[list_pre_exchange[i]]->setup_pre_exchange(); + } + else if (update->whichflag == 2) + for (int i = 0; i < n_min_pre_exchange; i++) { + atomKK->sync(fix[list_min_pre_exchange[i]]->execution_space, + fix[list_min_pre_exchange[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_exchange[i]]->execution_space, + fix[list_min_pre_exchange[i]]->datamask_modify); + fix[list_min_pre_exchange[i]]->min_setup_pre_exchange(); + } +} + +/* ---------------------------------------------------------------------- + setup pre_neighbor call, only for fixes that define pre_neighbor + called from Verlet, RESPA +------------------------------------------------------------------------- */ + +void ModifyKokkos::setup_pre_neighbor() +{ + if (update->whichflag == 1) + for (int i = 0; i < n_pre_neighbor; i++) { + atomKK->sync(fix[list_pre_neighbor[i]]->execution_space, + fix[list_pre_neighbor[i]]->datamask_read); + atomKK->modified(fix[list_pre_neighbor[i]]->execution_space, + fix[list_pre_neighbor[i]]->datamask_modify); + fix[list_pre_neighbor[i]]->setup_pre_neighbor(); + } + else if (update->whichflag == 2) + for (int i = 0; i < n_min_pre_neighbor; i++) { + atomKK->sync(fix[list_min_pre_neighbor[i]]->execution_space, + fix[list_min_pre_neighbor[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_neighbor[i]]->execution_space, + fix[list_min_pre_neighbor[i]]->datamask_modify); + fix[list_min_pre_neighbor[i]]->min_setup_pre_neighbor(); + } +} + +/* ---------------------------------------------------------------------- + setup pre_force call, only for fixes that define 
pre_force + called from Verlet, RESPA, Min +------------------------------------------------------------------------- */ + +void ModifyKokkos::setup_pre_force(int vflag) +{ + if (update->whichflag == 1) + for (int i = 0; i < n_pre_force; i++) { + atomKK->sync(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_read); + atomKK->modified(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_modify); + fix[list_pre_force[i]]->setup_pre_force(vflag); + } + else if (update->whichflag == 2) + for (int i = 0; i < n_min_pre_force; i++) { + atomKK->sync(fix[list_min_pre_force[i]]->execution_space, + fix[list_min_pre_force[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_force[i]]->execution_space, + fix[list_min_pre_force[i]]->datamask_modify); + fix[list_min_pre_force[i]]->min_setup_pre_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + 1st half of integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::initial_integrate(int vflag) +{ + for (int i = 0; i < n_initial_integrate; i++) { + atomKK->sync(fix[list_initial_integrate[i]]->execution_space, + fix[list_initial_integrate[i]]->datamask_read); + atomKK->modified(fix[list_initial_integrate[i]]->execution_space, + fix[list_initial_integrate[i]]->datamask_modify); + fix[list_initial_integrate[i]]->initial_integrate(vflag); + } +} + +/* ---------------------------------------------------------------------- + post_integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_integrate() +{ + for (int i = 0; i < n_post_integrate; i++) { + atomKK->sync(fix[list_post_integrate[i]]->execution_space, + fix[list_post_integrate[i]]->datamask_read); + atomKK->modified(fix[list_post_integrate[i]]->execution_space, + fix[list_post_integrate[i]]->datamask_modify); + fix[list_post_integrate[i]]->post_integrate(); + } +} + +/* ---------------------------------------------------------------------- + pre_exchange call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_exchange() +{ + for (int i = 0; i < n_pre_exchange; i++) { + atomKK->sync(fix[list_pre_exchange[i]]->execution_space, + fix[list_pre_exchange[i]]->datamask_read); + atomKK->modified(fix[list_pre_exchange[i]]->execution_space, + fix[list_pre_exchange[i]]->datamask_modify); + fix[list_pre_exchange[i]]->pre_exchange(); + } +} + +/* ---------------------------------------------------------------------- + pre_neighbor call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_neighbor() +{ + for (int i = 0; i < n_pre_neighbor; i++) { + atomKK->sync(fix[list_pre_neighbor[i]]->execution_space, + fix[list_pre_neighbor[i]]->datamask_read); + atomKK->modified(fix[list_pre_neighbor[i]]->execution_space, + fix[list_pre_neighbor[i]]->datamask_modify); + fix[list_pre_neighbor[i]]->pre_neighbor(); + } +} + +/* ---------------------------------------------------------------------- + pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_force(int vflag) +{ + for (int i = 0; i < n_pre_force; i++) { + atomKK->sync(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_read); + 
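// sync() has copied the data this fix reads to its execution space; modified() marks the data it will write as changed there before the fix runs +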
atomKK->modified(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_modify); + fix[list_pre_force[i]]->pre_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + post_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_force(int vflag) +{ + for (int i = 0; i < n_post_force; i++) { + atomKK->sync(fix[list_post_force[i]]->execution_space, + fix[list_post_force[i]]->datamask_read); + atomKK->modified(fix[list_post_force[i]]->execution_space, + fix[list_post_force[i]]->datamask_modify); + fix[list_post_force[i]]->post_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + 2nd half of integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::final_integrate() +{ + for (int i = 0; i < n_final_integrate; i++) { + atomKK->sync(fix[list_final_integrate[i]]->execution_space, + fix[list_final_integrate[i]]->datamask_read); + atomKK->modified(fix[list_final_integrate[i]]->execution_space, + fix[list_final_integrate[i]]->datamask_modify); + fix[list_final_integrate[i]]->final_integrate(); + } +} + +/* ---------------------------------------------------------------------- + end-of-timestep call, only for relevant fixes + only call fix->end_of_step() on timesteps that are multiples of nevery +------------------------------------------------------------------------- */ + +void ModifyKokkos::end_of_step() +{ + for (int i = 0; i < n_end_of_step; i++) + if (update->ntimestep % end_of_step_every[i] == 0) { + atomKK->sync(fix[list_end_of_step[i]]->execution_space, + fix[list_end_of_step[i]]->datamask_read); + atomKK->modified(fix[list_end_of_step[i]]->execution_space, + fix[list_end_of_step[i]]->datamask_modify); + fix[list_end_of_step[i]]->end_of_step(); + } +} + +/* ---------------------------------------------------------------------- + thermo energy call, only for relevant fixes + called by Thermo class + compute_scalar() is fix call to return energy +------------------------------------------------------------------------- */ + +double ModifyKokkos::thermo_energy() +{ + double energy = 0.0; + for (int i = 0; i < n_thermo_energy; i++) { + atomKK->sync(fix[list_thermo_energy[i]]->execution_space, + fix[list_thermo_energy[i]]->datamask_read); + atomKK->modified(fix[list_thermo_energy[i]]->execution_space, + fix[list_thermo_energy[i]]->datamask_modify); + energy += fix[list_thermo_energy[i]]->compute_scalar(); + } + return energy; +} + +/* ---------------------------------------------------------------------- + post_run call +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_run() +{ + for (int i = 0; i < nfix; i++) { + atomKK->sync(fix[i]->execution_space, + fix[i]->datamask_read); + atomKK->modified(fix[i]->execution_space, + fix[i]->datamask_modify); + fix[i]->post_run(); + } +} + +/* ---------------------------------------------------------------------- + setup rRESPA pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::setup_pre_force_respa(int vflag, int ilevel) +{ + for (int i = 0; i < n_pre_force; i++) { + atomKK->sync(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_read); + atomKK->modified(fix[list_pre_force[i]]->execution_space, + 
fix[list_pre_force[i]]->datamask_modify); + fix[list_pre_force[i]]->setup_pre_force_respa(vflag,ilevel); + } +} + +/* ---------------------------------------------------------------------- + 1st half of rRESPA integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::initial_integrate_respa(int vflag, int ilevel, int iloop) +{ + for (int i = 0; i < n_initial_integrate_respa; i++) { + atomKK->sync(fix[list_initial_integrate_respa[i]]->execution_space, + fix[list_initial_integrate_respa[i]]->datamask_read); + atomKK->modified(fix[list_initial_integrate_respa[i]]->execution_space, + fix[list_initial_integrate_respa[i]]->datamask_modify); + fix[list_initial_integrate_respa[i]]-> + initial_integrate_respa(vflag,ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + rRESPA post_integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_integrate_respa(int ilevel, int iloop) +{ + for (int i = 0; i < n_post_integrate_respa; i++) { + atomKK->sync(fix[list_post_integrate_respa[i]]->execution_space, + fix[list_post_integrate_respa[i]]->datamask_read); + atomKK->modified(fix[list_post_integrate_respa[i]]->execution_space, + fix[list_post_integrate_respa[i]]->datamask_modify); + fix[list_post_integrate_respa[i]]->post_integrate_respa(ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + rRESPA pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_force_respa(int vflag, int ilevel, int iloop) +{ + for (int i = 0; i < n_pre_force_respa; i++) { + atomKK->sync(fix[list_pre_force_respa[i]]->execution_space, + fix[list_pre_force_respa[i]]->datamask_read); + atomKK->modified(fix[list_pre_force_respa[i]]->execution_space, + fix[list_pre_force_respa[i]]->datamask_modify); + fix[list_pre_force_respa[i]]->pre_force_respa(vflag,ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + rRESPA post_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_force_respa(int vflag, int ilevel, int iloop) +{ + for (int i = 0; i < n_post_force_respa; i++) { + atomKK->sync(fix[list_post_force_respa[i]]->execution_space, + fix[list_post_force_respa[i]]->datamask_read); + atomKK->modified(fix[list_post_force_respa[i]]->execution_space, + fix[list_post_force_respa[i]]->datamask_modify); + fix[list_post_force_respa[i]]->post_force_respa(vflag,ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + 2nd half of rRESPA integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::final_integrate_respa(int ilevel, int iloop) +{ + for (int i = 0; i < n_final_integrate_respa; i++) { + atomKK->sync(fix[list_final_integrate_respa[i]]->execution_space, + fix[list_final_integrate_respa[i]]->datamask_read); + atomKK->modified(fix[list_final_integrate_respa[i]]->execution_space, + fix[list_final_integrate_respa[i]]->datamask_modify); + fix[list_final_integrate_respa[i]]->final_integrate_respa(ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + minimizer pre-exchange call, only for relevant fixes 
+------------------------------------------------------------------------- */ + +void ModifyKokkos::min_pre_exchange() +{ + for (int i = 0; i < n_min_pre_exchange; i++) { + atomKK->sync(fix[list_min_pre_exchange[i]]->execution_space, + fix[list_min_pre_exchange[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_exchange[i]]->execution_space, + fix[list_min_pre_exchange[i]]->datamask_modify); + fix[list_min_pre_exchange[i]]->min_pre_exchange(); + } +} + +/* ---------------------------------------------------------------------- + minimizer pre-neighbor call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_pre_neighbor() +{ + for (int i = 0; i < n_min_pre_neighbor; i++) { + atomKK->sync(fix[list_min_pre_neighbor[i]]->execution_space, + fix[list_min_pre_neighbor[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_neighbor[i]]->execution_space, + fix[list_min_pre_neighbor[i]]->datamask_modify); + fix[list_min_pre_neighbor[i]]->min_pre_neighbor(); + } +} + +/* ---------------------------------------------------------------------- + minimizer pre-force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_pre_force(int vflag) +{ + for (int i = 0; i < n_min_pre_force; i++) { + atomKK->sync(fix[list_min_pre_force[i]]->execution_space, + fix[list_min_pre_force[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_force[i]]->execution_space, + fix[list_min_pre_force[i]]->datamask_modify); + fix[list_min_pre_force[i]]->min_pre_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + minimizer force adjustment call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_post_force(int vflag) +{ + for (int i = 0; i < n_min_post_force; i++) { + atomKK->sync(fix[list_min_post_force[i]]->execution_space, + fix[list_min_post_force[i]]->datamask_read); + atomKK->modified(fix[list_min_post_force[i]]->execution_space, + fix[list_min_post_force[i]]->datamask_modify); + fix[list_min_post_force[i]]->min_post_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + minimizer energy/force evaluation, only for relevant fixes + return energy and forces on extra degrees of freedom +------------------------------------------------------------------------- */ + +double ModifyKokkos::min_energy(double *fextra) +{ + int ifix,index; + + index = 0; + double eng = 0.0; + for (int i = 0; i < n_min_energy; i++) { + ifix = list_min_energy[i]; + atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read); + atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify); + eng += fix[ifix]->min_energy(&fextra[index]); + index += fix[ifix]->min_dof(); + } + return eng; +} + +/* ---------------------------------------------------------------------- + store current state of extra dof, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_store() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_store(); + } +} + +/* ---------------------------------------------------------------------- + 
manage state of extra dof on a stack, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_clearstore() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_clearstore(); + } +} + +void ModifyKokkos::min_pushstore() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_pushstore(); + } +} + +void ModifyKokkos::min_popstore() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_popstore(); + } +} + +/* ---------------------------------------------------------------------- + displace extra dof along vector hextra, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_step(double alpha, double *hextra) +{ + int ifix,index; + + index = 0; + for (int i = 0; i < n_min_energy; i++) { + ifix = list_min_energy[i]; + atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read); + atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify); + fix[ifix]->min_step(alpha,&hextra[index]); + index += fix[ifix]->min_dof(); + } +} + +/* ---------------------------------------------------------------------- + compute max allowed step size along vector hextra, only for relevant fixes +------------------------------------------------------------------------- */ + +double ModifyKokkos::max_alpha(double *hextra) +{ + int ifix,index; + + double alpha = BIG; + index = 0; + for (int i = 0; i < n_min_energy; i++) { + ifix = list_min_energy[i]; + atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read); + atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify); + double alpha_one = fix[ifix]->max_alpha(&hextra[index]); + alpha = MIN(alpha,alpha_one); + index += fix[ifix]->min_dof(); + } + return alpha; +} + +/* ---------------------------------------------------------------------- + extract extra dof for minimization, only for relevant fixes +------------------------------------------------------------------------- */ + +int ModifyKokkos::min_dof() +{ + int ndof = 0; + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + ndof += fix[list_min_energy[i]]->min_dof(); + } + return ndof; +} + +/* ---------------------------------------------------------------------- + reset reference state of fix, only for relevant fixes +------------------------------------------------------------------------- */ + +int ModifyKokkos::min_reset_ref() +{ + int itmp,itmpall; + itmpall = 0; + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); +
atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + itmp = fix[list_min_energy[i]]->min_reset_ref(); + if (itmp) itmpall = 1; + } + return itmpall; +} diff --git a/src/KOKKOS/modify_kokkos.h b/src/KOKKOS/modify_kokkos.h new file mode 100644 index 000000000..c0c3a8d68 --- /dev/null +++ b/src/KOKKOS/modify_kokkos.h @@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_MODIFY_KOKKOS_H +#define LMP_MODIFY_KOKKOS_H + +#include "modify.h" + +namespace LAMMPS_NS { + +class ModifyKokkos : public Modify { + public: + ModifyKokkos(class LAMMPS *); + ~ModifyKokkos() {} + void setup(int); + void setup_pre_exchange(); + void setup_pre_neighbor(); + void setup_pre_force(int); + void initial_integrate(int); + void post_integrate(); + void pre_decide(); + void pre_exchange(); + void pre_neighbor(); + void pre_force(int); + void post_force(int); + void final_integrate(); + void end_of_step(); + double thermo_energy(); + void post_run(); + + void setup_pre_force_respa(int, int); + void initial_integrate_respa(int, int, int); + void post_integrate_respa(int, int); + void pre_force_respa(int, int, int); + void post_force_respa(int, int, int); + void final_integrate_respa(int, int); + + void min_pre_exchange(); + void min_pre_neighbor(); + void min_pre_force(int); + void min_post_force(int); + + double min_energy(double *); + void min_store(); + void min_step(double, double *); + void min_clearstore(); + void min_pushstore(); + void min_popstore(); + double max_alpha(double *); + int min_dof(); + int min_reset_ref(); + + protected: + class AtomKokkos *atomKK; +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/neigh_full_kokkos.h b/src/KOKKOS/neigh_full_kokkos.h new file mode 100644 index 000000000..9112e5049 --- /dev/null +++ b/src/KOKKOS/neigh_full_kokkos.h @@ -0,0 +1,507 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "atom_kokkos.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType, int HALF_NEIGH> +void NeighborKokkos::full_bin_kokkos(NeighListKokkos<DeviceType> *list) +{ + const int nall = includegroup?atom->nfirst:atom->nlocal; + list->grow(nall); + + NeighborKokkosExecute<DeviceType> + data(*list, + k_cutneighsq.view<DeviceType>(), + k_bincount.view<DeviceType>(), + k_bins.view<DeviceType>(),nall, + atomKK->k_x.view<DeviceType>(), + atomKK->k_type.view<DeviceType>(), + atomKK->k_mask.view<DeviceType>(), + atomKK->k_molecule.view<DeviceType>(), + nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo, + bininvx,bininvy,bininvz, + bboxhi,bboxlo); + + k_cutneighsq.sync<DeviceType>(); + atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK); + Kokkos::deep_copy(list->d_stencil,list->h_stencil); + + while(data.h_resize() > 0) { + data.h_resize() = 0; + deep_copy(data.resize, data.h_resize); + + MemsetZeroFunctor<DeviceType> f_zero; + f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device(); + Kokkos::parallel_for(mbins, f_zero); + DeviceType::fence(); + + NeighborKokkosBinAtomsFunctor<DeviceType> f(data); + + Kokkos::parallel_for(atom->nlocal+atom->nghost, f); + DeviceType::fence(); + + deep_copy(data.h_resize, data.resize); + if(data.h_resize()) { + + atoms_per_bin += 16; + k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin); + data.bins = k_bins.view<DeviceType>(); + data.c_bins = data.bins; + } + } + + if(list->d_neighbors.dimension_0()<nall) { + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs); + list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.d_numneigh = list->d_numneigh; + } + data.h_resize()=1; + while(data.h_resize()) { + data.h_new_maxneighs() = list->maxneighs; + data.h_resize() = 0; + + Kokkos::deep_copy(data.resize, data.h_resize); + Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs); +#if DEVICE==2 + #define BINS_PER_BLOCK 2 + const int factor = atoms_per_bin<64?2:1; + Kokkos::ParallelWorkRequest config((mbins+factor-1)/factor,atoms_per_bin*factor); +#else + const int factor = 1; +#endif + +if(newton_pair) { + NeighborKokkosBuildFunctor<DeviceType,HALF_NEIGH,1> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +#if DEVICE==2 + Kokkos::parallel_for(config, f); +#else + Kokkos::parallel_for(nall, f); +#endif +} else { + NeighborKokkosBuildFunctor<DeviceType,HALF_NEIGH,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +#if DEVICE==2 + Kokkos::parallel_for(config, f); +#else + Kokkos::parallel_for(nall, f); +#endif +} + DeviceType::fence(); + deep_copy(data.h_resize, data.resize); + + if(data.h_resize()) { + deep_copy(data.h_new_maxneighs, data.new_maxneighs); + list->maxneighs = data.h_new_maxneighs() * 1.2; + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.maxneighs = list->maxneighs; + } + } + + list->inum = nall; + list->gnum = 0; + +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> +KOKKOS_INLINE_FUNCTION +void NeighborKokkosExecute<Device>::binatomsItem(const int &i) const +{ + const int 
ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2)); + + const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1); + if(ac < bins.dimension_1()) { + bins(ibin, ac) = i; + } else { + resize() = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> template<int HalfNeigh,int GhostNewton> +void NeighborKokkosExecute<Device>:: + build_Item(const int &i) const +{ + /* if necessary, goto next page and add pages */ + int n = 0; + + // get subview of neighbors of i + + const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const X_FLOAT xtmp = x(i, 0); + const X_FLOAT ytmp = x(i, 1); + const X_FLOAT ztmp = x(i, 2); + const int itype = type(i); + + const int ibin = coord2bin(xtmp, ytmp, ztmp); + + const int nstencil = neigh_list.nstencil; + const typename ArrayTypes<Device>::t_int_1d_const_um stencil + = neigh_list.d_stencil; + + // loop over all bins in neighborhood (includes ibin) + if(HalfNeigh) + for(int m = 0; m < c_bincount(ibin); m++) { + const int j = c_bins(ibin,m); + // printf("%i %i %i\n",i,ibin,m,c_bincount(ibin),j); + const int jtype = type(j); + //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using HalfNeighborlists + if((j == i) || (HalfNeigh && !GhostNewton && (j < i)) || + (HalfNeigh && GhostNewton && ((j < i) || ((j >= nlocal) && + ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) || + (x(j, 2) == ztmp && x(j, 1) == ytmp && x(j, 0) < xtmp))))) + ) continue; + //if(Exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + + + const X_FLOAT delx = xtmp - x(j, 0); + const X_FLOAT dely = ytmp - x(j, 1); + const X_FLOAT delz = ztmp - x(j, 2); + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if(rsq <= cutneighsq(itype,jtype)) { + if(n<neigh_list.maxneighs) neighbors_i(n) = j; + n++; + } + } + + for(int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + // get subview of jbin + if(!GhostNewton&&HalfNeigh&&(ibin==jbin)) continue; + //const ArrayTypes<Device>::t_int_1d_const_um =Kokkos::subview<t_int_1d_const_um>(bins,jbin,ALL); + for(int m = 0; m < c_bincount(jbin); m++) { + const int j = c_bins(jbin,m); + //if(i==0) + //printf("%i %i %i %i %i %i %i\n",i,jbin,m,c_bincount(jbin),j,k,stencil[k]); + const int jtype = type(j); + + if(HalfNeigh && !GhostNewton && (j < i)) continue; + if(!HalfNeigh && j==i) continue; + //if(Exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + + const X_FLOAT delx = xtmp - x(j, 0); + const X_FLOAT dely = ytmp - x(j, 1); + const X_FLOAT delz = ztmp - x(j, 2); + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + //if(i==0) + //printf("%i %i %lf %lf NEIGHS\n",i,j,rsq,cutneighsq(itype,jtype)); + + if(rsq <= cutneighsq(itype,jtype)) { + if(n<neigh_list.maxneighs) neighbors_i(n) = j; + n++; + } + + } + } + + neigh_list.d_numneigh(i) = n; + + if(n >= neigh_list.maxneighs) { + resize() = 1; + + if(n >= new_maxneighs()) new_maxneighs() = n; + } + neigh_list.d_ilist(i) = i; +} + +#if DEVICE==2 +extern __shared__ X_FLOAT sharedmem[]; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> template<int HalfNeigh> +__device__ inline +void NeighborKokkosExecute<DeviceType>::build_ItemCuda(DeviceType dev) const +{ + /* loop over atoms in i's bin, + */ + const int atoms_per_bin = c_bins.dimension_1(); + const int BINS_PER_TEAM = blockDim.x/atoms_per_bin; + const int MY_BIN = threadIdx.x/atoms_per_bin; + const int MY_II = threadIdx.x%atoms_per_bin; + + 
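// each block covers BINS_PER_TEAM bins: thread MY_II owns one atom slot of bin MY_BIN and stages its coordinates and type in shared memory +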
const int ibin = (blockIdx.x)*BINS_PER_TEAM+MY_BIN; + + if(ibin >=c_bincount.dimension_0()) return; + X_FLOAT* other_x = sharedmem; + other_x = other_x + 5*atoms_per_bin*MY_BIN; + + int* other_id = (int*) &other_x[4 * atoms_per_bin]; + + int bincount_current = c_bincount[ibin]; + + const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1; + /* if necessary, goto next page and add pages */ + + int n = 0; + + X_FLOAT xtmp; + X_FLOAT ytmp; + X_FLOAT ztmp; + int itype; + const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i<nlocal)?i:0); + + if(i >= 0) { + xtmp = x(i, 0); + ytmp = x(i, 1); + ztmp = x(i, 2); + itype = type(i); + other_x[MY_II] = xtmp; + other_x[MY_II + atoms_per_bin] = ytmp; + other_x[MY_II + 2 * atoms_per_bin] = ztmp; + other_x[MY_II + 3 * atoms_per_bin] = itype; + } + other_id[MY_II] = i; + int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0); + + if(test) return; + + if(i >= 0 && i < nlocal) { + #pragma unroll 4 + for(int m = 0; m < bincount_current; m++) { + int j = other_id[m]; + + //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists + //if(j==i) continue; + if((j == i) || (HalfNeigh && (j < i))) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const int jtype = other_x[m + 3 * atoms_per_bin]; + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if((rsq <= cutneighsq(itype,jtype)) && (n < neigh_list.maxneighs)) neighbors_i(n++) = j; + } + } + __syncthreads(); + + const int nstencil = neigh_list.nstencil; + const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil + = neigh_list.d_stencil; + for(int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + + if(ibin == jbin) continue; + + bincount_current = c_bincount[jbin]; + int j = MY_II < bincount_current ? 
c_bins(jbin, MY_II) : -1; + + if(j >= 0) { + other_x[MY_II] = x(j, 0); + other_x[MY_II + atoms_per_bin] = x(j, 1); + other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); + other_x[MY_II + 3 * atoms_per_bin] = type(j); + } + + other_id[MY_II] = j; + + __syncthreads(); + + if(i >= 0 && i < nlocal) { + #pragma unroll 8 + for(int m = 0; m < bincount_current; m++) { + const int j = other_id[m]; + + if(HalfNeigh && (j < i)) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const int jtype = other_x[m + 3 * atoms_per_bin]; + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if((rsq <= cutneighsq(itype,jtype)) && (n < neigh_list.maxneighs)) neighbors_i(n++) = j; + } + } + __syncthreads(); + } + + if(i >= 0 && i < nlocal) { + neigh_list.d_numneigh(i) = n; + neigh_list.d_ilist(i) = i; + } + + if(n >= neigh_list.maxneighs) { + resize() = 1; + + if(n >= new_maxneighs()) new_maxneighs() = n; + } +} +#endif + +template<class DeviceType> +void NeighborKokkos::full_bin_cluster_kokkos(NeighListKokkos<DeviceType> *list) +{ + const int nall = includegroup?atom->nfirst:atom->nlocal; + list->grow(nall); + + NeighborKokkosExecute<DeviceType> + data(*list, + k_cutneighsq.view<DeviceType>(), + k_bincount.view<DeviceType>(), + k_bins.view<DeviceType>(),nall, + atomKK->k_x.view<DeviceType>(), + atomKK->k_type.view<DeviceType>(), + atomKK->k_mask.view<DeviceType>(), + atomKK->k_molecule.view<DeviceType>(), + nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo, + bininvx,bininvy,bininvz, + bboxhi,bboxlo); + + k_cutneighsq.sync<DeviceType>(); + atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK); + Kokkos::deep_copy(list->d_stencil,list->h_stencil); + DeviceType::fence(); + + while(data.h_resize() > 0) { + data.h_resize() = 0; + deep_copy(data.resize, data.h_resize); + + MemsetZeroFunctor<DeviceType> f_zero; + f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device(); + Kokkos::parallel_for(mbins, f_zero); + DeviceType::fence(); + + NeighborKokkosBinAtomsFunctor<DeviceType> f(data); + + Kokkos::parallel_for(atom->nlocal+atom->nghost, f); + DeviceType::fence(); + + deep_copy(data.h_resize, data.resize); + if(data.h_resize()) { + + atoms_per_bin += 16; + k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin); + data.bins = k_bins.view<DeviceType>(); + data.c_bins = data.bins; + } + } + + if(list->d_neighbors.dimension_0()<nall) { + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs); + list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.d_numneigh = list->d_numneigh; + } + data.h_resize()=1; + while(data.h_resize()) { + data.h_new_maxneighs() = list->maxneighs; + data.h_resize() = 0; + + Kokkos::deep_copy(data.resize, data.h_resize); + Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs); +#if DEVICE==2 + #define BINS_PER_BLOCK 2 + const int factor = atoms_per_bin<64?2:1; + Kokkos::ParallelWorkRequest config((mbins+factor-1)/factor,atoms_per_bin*factor); +#else + const int factor = 1; +#endif + +if(newton_pair) { + NeighborClusterKokkosBuildFunctor<DeviceType,NeighClusterSize> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +//#if DEVICE==2 +// Kokkos::parallel_for(config, f); +//#else + Kokkos::parallel_for(nall, f); +//#endif +} else { + NeighborClusterKokkosBuildFunctor<DeviceType,NeighClusterSize> 
f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +//#if DEVICE==2 +// Kokkos::parallel_for(config, f); +//#else + Kokkos::parallel_for(nall, f); +//#endif +} + DeviceType::fence(); + deep_copy(data.h_resize, data.resize); + + if(data.h_resize()) { + deep_copy(data.h_new_maxneighs, data.new_maxneighs); + list->maxneighs = data.h_new_maxneighs() * 1.2; + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.maxneighs = list->maxneighs; + } + } + + list->inum = nall; + list->gnum = 0; + +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> template<int ClusterSize> +void NeighborKokkosExecute<Device>:: + build_cluster_Item(const int &i) const +{ + /* if necessary, goto next page and add pages */ + int n = 0; + + // get subview of neighbors of i + + const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const X_FLOAT xtmp = x(i, 0); + const X_FLOAT ytmp = x(i, 1); + const X_FLOAT ztmp = x(i, 2); + const int itype = type(i); + + const int ibin = coord2bin(xtmp, ytmp, ztmp); + + const int nstencil = neigh_list.nstencil; + const typename ArrayTypes<Device>::t_int_1d_const_um stencil + = neigh_list.d_stencil; + + for(int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + for(int m = 0; m < c_bincount(jbin); m++) { + const int j = c_bins(jbin,m); + bool skip = i == j; + for(int k = 0; k< (n<neigh_list.maxneighs?n:neigh_list.maxneighs); k++) + if((j-(j%ClusterSize)) == neighbors_i(k)) {skip=true;};//{m += ClusterSize - j&(ClusterSize-1)-1; skip=true;} + + if(!skip) { + const int jtype = type(j); + + const X_FLOAT delx = xtmp - x(j, 0); + const X_FLOAT dely = ytmp - x(j, 1); + const X_FLOAT delz = ztmp - x(j, 2); + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq <= cutneighsq(itype,jtype)) { + if(n<neigh_list.maxneighs) neighbors_i(n) = (j-(j%ClusterSize)); + n++; + //m += ClusterSize - j&(ClusterSize-1)-1; + } + } + + } + } + + neigh_list.d_numneigh(i) = n; + + if(n >= neigh_list.maxneighs) { + resize() = 1; + + if(n >= new_maxneighs()) new_maxneighs() = n; + } + neigh_list.d_ilist(i) = i; +} diff --git a/src/KOKKOS/neigh_list_kokkos.cpp b/src/KOKKOS/neigh_list_kokkos.cpp new file mode 100644 index 000000000..dbb0aa572 --- /dev/null +++ b/src/KOKKOS/neigh_list_kokkos.cpp @@ -0,0 +1,118 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "neigh_list_kokkos.h" +#include "atom.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{NSQ,BIN,MULTI}; + +/* ---------------------------------------------------------------------- */ + +template<class Device> +void NeighListKokkos<Device>::clean_copy() +{ + ilist = NULL; + numneigh = NULL; + firstneigh = NULL; + firstdouble = NULL; + dnum = 0; + iskip = NULL; + ijskip = NULL; + + ipage = NULL; + dpage = NULL; + maxstencil = 0; + ghostflag = 0; + maxstencil_multi = 0; +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> +void NeighListKokkos<Device>::grow(int nmax) +{ + // skip if this list is already long enough to store nmax atoms + + if (nmax <= maxatoms) return; + maxatoms = nmax; + + d_ilist = + typename ArrayTypes<Device>::t_int_1d("neighlist:ilist",maxatoms); + d_numneigh = + typename ArrayTypes<Device>::t_int_1d("neighlist:numneigh",maxatoms); + d_neighbors = + typename ArrayTypes<Device>::t_neighbors_2d("neighlist:neighbors", + maxatoms,maxneighs); + + memory->sfree(firstneigh); + memory->sfree(firstdouble); + + firstneigh = (int **) memory->smalloc(maxatoms*sizeof(int *), + "neighlist:firstneigh"); + if (dnum) + firstdouble = (double **) memory->smalloc(maxatoms*sizeof(double *), + "neighlist:firstdouble"); +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> +void NeighListKokkos<Device>::stencil_allocate(int smax, int style) +{ + int i; + + if (style == BIN) { + if (smax > maxstencil) { + maxstencil = smax; + d_stencil = + memory->create_kokkos(d_stencil,h_stencil,stencil,maxstencil, + "neighlist:stencil"); + if (ghostflag) { + memory->destroy(stencilxyz); + memory->create(stencilxyz,maxstencil,3,"neighlist:stencilxyz"); + } + } + + } else { + int n = atom->ntypes; + if (maxstencil_multi == 0) { + nstencil_multi = new int[n+1]; + stencil_multi = new int*[n+1]; + distsq_multi = new double*[n+1]; + for (i = 1; i <= n; i++) { + nstencil_multi[i] = 0; + stencil_multi[i] = NULL; + distsq_multi[i] = NULL; + } + } + if (smax > maxstencil_multi) { + maxstencil_multi = smax; + for (i = 1; i <= n; i++) { + memory->destroy(stencil_multi[i]); + memory->destroy(distsq_multi[i]); + memory->create(stencil_multi[i],maxstencil_multi, + "neighlist:stencil_multi"); + memory->create(distsq_multi[i],maxstencil_multi, + "neighlist:distsq_multi"); + } + } + } +} + +template class NeighListKokkos<LMPDeviceType>; +#if DEVICE==2 +template class NeighListKokkos<LMPHostType>; +#endif diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h new file mode 100644 index 000000000..fd4ac3acc --- /dev/null +++ b/src/KOKKOS/neigh_list_kokkos.h @@ -0,0 +1,104 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_NEIGH_LIST_KOKKOS_H +#define LMP_NEIGH_LIST_KOKKOS_H + +#include "pointers.h" +#include "neigh_list.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +enum{FULL,HALFTHREAD,HALF,N2,FULLCLUSTER}; + +class AtomNeighbors +{ + public: + const int num_neighs; + + KOKKOS_INLINE_FUNCTION + AtomNeighbors(int* const & firstneigh, const int & _num_neighs, + const int & stride): + _firstneigh(firstneigh), _stride(stride), num_neighs(_num_neighs) {}; + KOKKOS_INLINE_FUNCTION + int& operator()(const int &i) const { + return _firstneigh[i*_stride]; + } + + private: + int* const _firstneigh; + const int _stride; +}; + +class AtomNeighborsConst +{ + public: + const int* const _firstneigh; + const int numneigh; + + KOKKOS_INLINE_FUNCTION + AtomNeighborsConst(int* const & firstneigh, const int & _numneigh, + const int & stride): + _firstneigh(firstneigh), _stride(stride), numneigh(_numneigh) {}; + KOKKOS_INLINE_FUNCTION + const int& operator()(const int &i) const { + return _firstneigh[i*_stride]; + } + + private: + //const int* const _firstneigh; + const int _stride; +}; + +template<class Device> +class NeighListKokkos: public NeighList { + int _stride; + +public: + int maxneighs; + + void clean_copy(); + void grow(int nmax); + typename ArrayTypes<Device>::t_neighbors_2d d_neighbors; + typename ArrayTypes<Device>::t_int_1d d_ilist; // local indices of I atoms + typename ArrayTypes<Device>::t_int_1d d_numneigh; // # of J neighs for each I + typename ArrayTypes<Device>::t_int_1d d_stencil; // bin offsets of the neighbor stencil (device copy) + typename ArrayTypes<LMPHostType>::t_int_1d h_stencil; // host mirror of the stencil + + NeighListKokkos(class LAMMPS *lmp): + NeighList(lmp) {_stride = 1; maxneighs = 16;}; + ~NeighListKokkos() {stencil = NULL; numneigh = NULL; ilist = NULL;}; + + KOKKOS_INLINE_FUNCTION + AtomNeighbors get_neighbors(const int &i) const { + return AtomNeighbors(&d_neighbors(i,0),d_numneigh(i), + &d_neighbors(i,1)-&d_neighbors(i,0)); + } + + KOKKOS_INLINE_FUNCTION + AtomNeighborsConst get_neighbors_const(const int &i) const { + return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i), + &d_neighbors(i,1)-&d_neighbors(i,0)); + } + + KOKKOS_INLINE_FUNCTION + int& num_neighs(const int & i) const { + return d_numneigh(i); + } + void stencil_allocate(int smax, int style); +}; + +} + +#endif diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp new file mode 100644 index 000000000..adea82397 --- /dev/null +++ b/src/KOKKOS/neighbor_kokkos.cpp @@ -0,0 +1,269 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */ + +#include "neighbor_kokkos.h" +#include "atom.h" +#include "pair.h" +#include "neigh_request.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp + +/* ---------------------------------------------------------------------- */ + +NeighborKokkos::NeighborKokkos(LAMMPS *lmp) : Neighbor(lmp) +{ + atoms_per_bin = 16; + + nlist_host = 0; + lists_host = NULL; + pair_build_host = NULL; + stencil_create_host = NULL; + nlist_device = 0; + lists_device = NULL; + pair_build_device = NULL; + stencil_create_device = NULL; +} + +/* ---------------------------------------------------------------------- */ + +NeighborKokkos::~NeighborKokkos() +{ + memory->destroy_kokkos(k_cutneighsq,cutneighsq); + cutneighsq = NULL; + + for (int i = 0; i < nlist_host; i++) delete lists_host[i]; + delete [] lists_host; + for (int i = 0; i < nlist_device; i++) delete lists_device[i]; + delete [] lists_device; + + delete [] pair_build_device; + delete [] pair_build_host; +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init() +{ + atomKK = (AtomKokkos *) atom; + Neighbor::init(); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_cutneighsq_kokkos(int n) +{ + memory->create_kokkos(k_cutneighsq,cutneighsq,n+1,n+1,"neigh:cutneighsq"); + k_cutneighsq.modify<LMPHostType>(); +} + +/* ---------------------------------------------------------------------- */ + +int NeighborKokkos::init_lists_kokkos() +{ + int i; + + for (i = 0; i < nlist_host; i++) delete lists_host[i]; + delete [] lists_host; + delete [] pair_build_host; + delete [] stencil_create_host; + nlist_host = 0; + + for (i = 0; i < nlist_device; i++) delete lists_device[i]; + delete [] lists_device; + delete [] pair_build_device; + delete [] stencil_create_device; + nlist_device = 0; + + nlist = 0; + for (i = 0; i < nrequest; i++) { + if (requests[i]->kokkos_device) nlist_device++; + else if (requests[i]->kokkos_host) nlist_host++; + else nlist++; + } + + lists_host = new NeighListKokkos<LMPHostType>*[nrequest]; + pair_build_host = new PairPtrHost[nrequest]; + stencil_create_host = new StencilPtrHost[nrequest]; + for (i = 0; i < nrequest; i++) { + lists_host[i] = NULL; + pair_build_host[i] = NULL; + stencil_create_host[i] = NULL; + } + + for (i = 0; i < nrequest; i++) { + if (!requests[i]->kokkos_host) continue; + lists_host[i] = new NeighListKokkos<LMPHostType>(lmp); + lists_host[i]->index = i; + lists_host[i]->dnum = requests[i]->dnum; + if (requests[i]->pair) { + Pair *pair = (Pair *) requests[i]->requestor; + pair->init_list(requests[i]->id,lists_host[i]); + } + } + + lists_device = new NeighListKokkos<LMPDeviceType>*[nrequest]; + pair_build_device = new PairPtrDevice[nrequest]; + stencil_create_device = new StencilPtrDevice[nrequest]; + for (i = 0; i < nrequest; i++) { + lists_device[i] = NULL; + pair_build_device[i] = NULL; + stencil_create_device[i] = NULL; + } + + for (i = 0; i < nrequest; i++) { + if (!requests[i]->kokkos_device) continue; + lists_device[i] = new NeighListKokkos<LMPDeviceType>(lmp); + lists_device[i]->index = i; + lists_device[i]->dnum = requests[i]->dnum; + if (requests[i]->pair) { + Pair *pair = (Pair *) requests[i]->requestor; + pair->init_list(requests[i]->id,lists_device[i]); + } + } + + // return # of non-Kokkos lists + + return nlist; +} + +/* 
---------------------------------------------------------------------- */ + +void NeighborKokkos::init_list_flags1_kokkos(int i) +{ + if (lists_host[i]) { + lists_host[i]->buildflag = 1; + if (pair_build_host[i] == NULL) lists_host[i]->buildflag = 0; + if (requests[i]->occasional) lists_host[i]->buildflag = 0; + + lists_host[i]->growflag = 1; + if (requests[i]->copy) lists_host[i]->growflag = 0; + + lists_host[i]->stencilflag = 1; + if (style == NSQ) lists_host[i]->stencilflag = 0; + if (stencil_create[i] == NULL) lists_host[i]->stencilflag = 0; + + lists_host[i]->ghostflag = 0; + if (requests[i]->ghost) lists_host[i]->ghostflag = 1; + if (requests[i]->ghost && !requests[i]->occasional) anyghostlist = 1; + } + + if (lists_device[i]) { + lists_device[i]->buildflag = 1; + if (pair_build_device[i] == NULL) lists_device[i]->buildflag = 0; + if (requests[i]->occasional) lists_device[i]->buildflag = 0; + + lists_device[i]->growflag = 1; + if (requests[i]->copy) lists_device[i]->growflag = 0; + + lists_device[i]->stencilflag = 1; + if (style == NSQ) lists_device[i]->stencilflag = 0; + if (stencil_create[i] == NULL) lists_device[i]->stencilflag = 0; + + lists_device[i]->ghostflag = 0; + if (requests[i]->ghost) lists_device[i]->ghostflag = 1; + if (requests[i]->ghost && !requests[i]->occasional) anyghostlist = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_list_flags2_kokkos(int i) +{ + if (lists_host[i]) { + if (lists_host[i]->buildflag) blist[nblist++] = i; + if (lists_host[i]->growflag && requests[i]->occasional == 0) + glist[nglist++] = i; + if (lists_host[i]->stencilflag && requests[i]->occasional == 0) + slist[nslist++] = i; + } + + if (lists_device[i]) { + if (lists_device[i]->buildflag) blist[nblist++] = i; + if (lists_device[i]->growflag && requests[i]->occasional == 0) + glist[nglist++] = i; + if (lists_device[i]->stencilflag && requests[i]->occasional == 0) + slist[nslist++] = i; + } +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_list_grow_kokkos(int i) +{ + if (lists_host[i]!=NULL && lists_host[i]->growflag) + lists_host[i]->grow(maxatom); + if (lists_device[i]!=NULL && lists_device[i]->growflag) + lists_device[i]->grow(maxatom); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::choose_build(int index, NeighRequest *rq) +{ + if (rq->kokkos_host != 0) { + PairPtrHost pb = NULL; + if (rq->full) pb = &NeighborKokkos::full_bin_kokkos<LMPHostType,0>; + else if (rq->half) pb = &NeighborKokkos::full_bin_kokkos<LMPHostType,1>; + pair_build_host[index] = pb; + return; + } + if (rq->kokkos_device != 0) { + PairPtrDevice pb = NULL; + if (rq->full) { + if (rq->full_cluster) pb = &NeighborKokkos::full_bin_cluster_kokkos<LMPDeviceType>; + else pb = &NeighborKokkos::full_bin_kokkos<LMPDeviceType,0>; + } + else if (rq->half) pb = &NeighborKokkos::full_bin_kokkos<LMPDeviceType,1>; + pair_build_device[index] = pb; + return; + } + + Neighbor::choose_build(index,rq); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::build_kokkos(int i) +{ + if (lists_host[blist[i]]) + (this->*pair_build_host[blist[i]])(lists_host[blist[i]]); + else if (lists_device[blist[i]]) + (this->*pair_build_device[blist[i]])(lists_device[blist[i]]); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::setup_bins_kokkos(int i) +{ + if 
(lists_host[slist[i]]) { + lists_host[slist[i]]->stencil_allocate(smax,style); + (this->*stencil_create[slist[i]])(lists_host[slist[i]],sx,sy,sz); + } else if (lists_device[slist[i]]) { + lists_device[slist[i]]->stencil_allocate(smax,style); + (this->*stencil_create[slist[i]])(lists_device[slist[i]],sx,sy,sz); + } + + if (i < nslist-1) return; + + if (maxhead > k_bins.d_view.dimension_0()) { + k_bins = DAT::tdual_int_2d("Neighbor::d_bins",maxhead,atoms_per_bin); + k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",maxhead); + } +} + +// include to trigger instantiation of templated functions + +#include "neigh_full_kokkos.h" diff --git a/src/KOKKOS/neighbor_kokkos.h b/src/KOKKOS/neighbor_kokkos.h new file mode 100644 index 000000000..30e73792e --- /dev/null +++ b/src/KOKKOS/neighbor_kokkos.h @@ -0,0 +1,257 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_NEIGHBOR_KOKKOS_H +#define LMP_NEIGHBOR_KOKKOS_H + +#include "neighbor.h" +#include "neigh_list_kokkos.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +template<class Device> +class NeighborKokkosExecute +{ + typedef ArrayTypes<Device> AT; + + public: + NeighListKokkos<Device> neigh_list; + const typename AT::t_xfloat_2d_randomread cutneighsq; + const typename AT::t_int_1d bincount; + const typename AT::t_int_1d_const c_bincount; + typename AT::t_int_2d bins; + typename AT::t_int_2d_const c_bins; + const typename AT::t_x_array_randomread x; + const typename AT::t_int_1d_const type,mask,molecule; + + const int nbinx,nbiny,nbinz; + const int mbinx,mbiny,mbinz; + const int mbinxlo,mbinylo,mbinzlo; + const X_FLOAT bininvx,bininvy,bininvz; + X_FLOAT bboxhi[3],bboxlo[3]; + + const int nlocal; + + typename AT::t_int_scalar resize; + typename AT::t_int_scalar new_maxneighs; + typename ArrayTypes<LMPHostType>::t_int_scalar h_resize; + typename ArrayTypes<LMPHostType>::t_int_scalar h_new_maxneighs; + + NeighborKokkosExecute( + const NeighListKokkos<Device> &_neigh_list, + const typename AT::t_xfloat_2d_randomread &_cutneighsq, + const typename AT::t_int_1d &_bincount, + const typename AT::t_int_2d &_bins, + const int _nlocal, + const typename AT::t_x_array_randomread &_x, + const typename AT::t_int_1d_const &_type, + const typename AT::t_int_1d_const &_mask, + const typename AT::t_int_1d_const &_molecule, + const int & _nbinx,const int & _nbiny,const int & _nbinz, + const int & _mbinx,const int & _mbiny,const int & _mbinz, + const int & _mbinxlo,const int & _mbinylo,const int & _mbinzlo, + const X_FLOAT &_bininvx,const X_FLOAT &_bininvy,const X_FLOAT &_bininvz, + const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo): + neigh_list(_neigh_list), cutneighsq(_cutneighsq), + bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins), + nlocal(_nlocal), + x(_x),type(_type),mask(_mask),molecule(_molecule), + nbinx(_nbinx),nbiny(_nbiny),nbinz(_nbinz), + mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz), + mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo), + 
bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz) { + + bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2]; + bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2]; + + resize = typename AT::t_int_scalar("NeighborKokkosFunctor::resize"); +#ifndef KOKKOS_USE_UVM + h_resize = Kokkos::create_mirror_view(resize); +#else + h_resize = resize; +#endif + h_resize() = 1; + new_maxneighs = typename AT:: + t_int_scalar("NeighborKokkosFunctor::new_maxneighs"); +#ifndef KOKKOS_USE_UVM + h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs); +#else + h_new_maxneighs = new_maxneighs; +#endif + h_new_maxneighs() = neigh_list.maxneighs; + }; + + ~NeighborKokkosExecute() {neigh_list.clean_copy();}; + + template<int HalfNeigh, int GhostNewton> + KOKKOS_FUNCTION + void build_Item(const int &i) const; + + template<int ClusterSize> + KOKKOS_FUNCTION + void build_cluster_Item(const int &i) const; + +#if DEVICE==2 + template<int HalfNeigh> + __device__ inline + void build_ItemCuda(Device dev) const; +#endif + + KOKKOS_INLINE_FUNCTION + void binatomsItem(const int &i) const; + + KOKKOS_INLINE_FUNCTION + int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const + { + int ix,iy,iz; + + if (x >= bboxhi[0]) + ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx; + else if (x >= bboxlo[0]) { + ix = static_cast<int> ((x-bboxlo[0])*bininvx); + ix = MIN(ix,nbinx-1); + } else + ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1; + + if (y >= bboxhi[1]) + iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny; + else if (y >= bboxlo[1]) { + iy = static_cast<int> ((y-bboxlo[1])*bininvy); + iy = MIN(iy,nbiny-1); + } else + iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1; + + if (z >= bboxhi[2]) + iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz; + else if (z >= bboxlo[2]) { + iz = static_cast<int> ((z-bboxlo[2])*bininvz); + iz = MIN(iz,nbinz-1); + } else + iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1; + + return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo); + } +}; + +template<class Device> +struct NeighborKokkosBinAtomsFunctor { + typedef Device device_type; + + const NeighborKokkosExecute<Device> c; + + NeighborKokkosBinAtomsFunctor(const NeighborKokkosExecute<Device> &_c): + c(_c) {}; + ~NeighborKokkosBinAtomsFunctor() {} + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.binatomsItem(i); + } +}; + +template<class Device,int HALF_NEIGH,int GHOST_NEWTON> +struct NeighborKokkosBuildFunctor { + typedef Device device_type; + + const NeighborKokkosExecute<Device> c; + const size_t sharedsize; + + NeighborKokkosBuildFunctor(const NeighborKokkosExecute<Device> &_c, + const size_t _sharedsize):c(_c), + sharedsize(_sharedsize) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.template build_Item<HALF_NEIGH,GHOST_NEWTON>(i); + } +#if DEVICE==2 + KOKKOS_INLINE_FUNCTION + void operator() (Device dev) const { + c.template build_ItemCuda<HALF_NEIGH>(dev); + } + size_t shmem_size() const { return sharedsize; } +#endif +}; + +template<class Device,int ClusterSize> +struct NeighborClusterKokkosBuildFunctor { + typedef Device device_type; + + const NeighborKokkosExecute<Device> c; + const size_t sharedsize; + + NeighborClusterKokkosBuildFunctor(const NeighborKokkosExecute<Device> &_c, + const size_t _sharedsize):c(_c), + sharedsize(_sharedsize) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.template build_cluster_Item<ClusterSize>(i); + } +}; + +class NeighborKokkos : public 
Neighbor { + public: + class AtomKokkos *atomKK; + + int nlist_host; // pairwise neighbor lists on Host + NeighListKokkos<LMPHostType> **lists_host; + int nlist_device; // pairwise neighbor lists on Device + NeighListKokkos<LMPDeviceType> **lists_device; + + NeighborKokkos(class LAMMPS *); + ~NeighborKokkos(); + void init(); + + private: + int atoms_per_bin; + DAT::tdual_xfloat_2d k_cutneighsq; + DAT::tdual_int_1d k_bincount; + DAT::tdual_int_2d k_bins; + + void init_cutneighsq_kokkos(int); + int init_lists_kokkos(); + void init_list_flags1_kokkos(int); + void init_list_flags2_kokkos(int); + void init_list_grow_kokkos(int); + void choose_build(int, NeighRequest *); + void build_kokkos(int); + void setup_bins_kokkos(int); + + typedef void (NeighborKokkos::*PairPtrHost) + (class NeighListKokkos<LMPHostType> *); + PairPtrHost *pair_build_host; + typedef void (NeighborKokkos::*PairPtrDevice) + (class NeighListKokkos<LMPDeviceType> *); + PairPtrDevice *pair_build_device; + + template<class DeviceType,int HALF_NEIGH> + void full_bin_kokkos(NeighListKokkos<DeviceType> *list); + template<class DeviceType> + void full_bin_cluster_kokkos(NeighListKokkos<DeviceType> *list); + + typedef void (NeighborKokkos::*StencilPtrHost) + (class NeighListKokkos<LMPHostType> *, int, int, int); + StencilPtrHost *stencil_create_host; + typedef void (NeighborKokkos::*StencilPtrDevice) + (class NeighListKokkos<LMPDeviceType> *, int, int, int); + StencilPtrDevice *stencil_create_device; +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h new file mode 100644 index 000000000..de67e7df0 --- /dev/null +++ b/src/KOKKOS/pair_kokkos.h @@ -0,0 +1,655 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +#else + +#ifndef LMP_PAIR_KOKKOS_H +#define LMP_PAIR_KOKKOS_H + +#include "Kokkos_Macros.hpp" +#include "pair.h" +#include "neigh_list_kokkos.h" +#include "Kokkos_Vectorization.hpp" + +namespace LAMMPS_NS { + +template <class PairStyle, int NEIGHFLAG, bool STACKPARAMS, class Specialisation = void> +struct PairComputeFunctor { + typedef typename PairStyle::device_type device_type ; + typedef EV_FLOAT value_type; + + PairStyle c; + NeighListKokkos<device_type> list; + + PairComputeFunctor(PairStyle* c_ptr, + NeighListKokkos<device_type>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const { + return j >> SBBITS & 3; + } + + template<int EVFLAG, int NEWTON_PAIR> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const int& ii, + const NeighListKokkos<device_type> &list) const { + EV_FLOAT ev; + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + int j = neighbors_i(jj); + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + if ((NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { + Kokkos::atomic_fetch_add(&c.f(j,0),-delx*fpair); + Kokkos::atomic_fetch_add(&c.f(j,1),-dely*fpair); + Kokkos::atomic_fetch_add(&c.f(j,2),-delz*fpair); + } + + if ((NEIGHFLAG==HALF) && (NEWTON_PAIR || j < c.nlocal)) { + c.f(j,0) -= delx*fpair; + c.f(j,1) -= dely*fpair; + c.f(j,2) -= delz*fpair; + } + + if (EVFLAG) { + if (c.eflag) { + ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)* + factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + if (c.COUL_FLAG) + ev.ecoul += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)* + factor_lj * c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + } + + if (c.vflag_either) ev_tally(ev,i,j,fpair,delx,dely,delz); + } + } + + } + if (NEIGHFLAG == HALFTHREAD) { + Kokkos::atomic_fetch_add(&c.f(i,0),fxtmp); + Kokkos::atomic_fetch_add(&c.f(i,1),fytmp); + Kokkos::atomic_fetch_add(&c.f(i,2),fztmp); + } else { + c.f(i,0) += fxtmp; + c.f(i,1) += fytmp; + c.f(i,2) += fztmp; + } + + return ev; + } + + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const + { + const int EFLAG = c.eflag; + const int NEWTON_PAIR = c.newton_pair; + const int VFLAG = c.vflag_either; + + if (EFLAG) { + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (NEWTON_PAIR || i < c.nlocal) c.eatom[i] += epairhalf; + if 
(NEWTON_PAIR || j < c.nlocal) c.eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (c.vflag_global) { + if (NEIGHFLAG) { + if (NEWTON_PAIR) { + ev.v[0] += v0; + ev.v[1] += v1; + ev.v[2] += v2; + ev.v[3] += v3; + ev.v[4] += v4; + ev.v[5] += v5; + } else { + if (i < c.nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + if (j < c.nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + } else { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + + if (c.vflag_atom) { + if (NEWTON_PAIR || i < c.nlocal) { + c.d_vatom(i,0) += 0.5*v0; + c.d_vatom(i,1) += 0.5*v1; + c.d_vatom(i,2) += 0.5*v2; + c.d_vatom(i,3) += 0.5*v3; + c.d_vatom(i,4) += 0.5*v4; + c.d_vatom(i,5) += 0.5*v5; + } + if (NEWTON_PAIR || (NEIGHFLAG && j < c.nlocal)) { + c.d_vatom(j,0) += 0.5*v0; + c.d_vatom(j,1) += 0.5*v1; + c.d_vatom(j,2) += 0.5*v2; + c.d_vatom(j,3) += 0.5*v3; + c.d_vatom(j,4) += 0.5*v4; + c.d_vatom(j,5) += 0.5*v5; + } + } + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (c.newton_pair) compute_item<0,1>(i,list); + else compute_item<0,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &energy_virial) const { + if (c.newton_pair) + energy_virial += compute_item<1,1>(i,list); + else + energy_virial += compute_item<1,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } + + +}; + +template <class PairStyle, bool STACKPARAMS, class Specialisation> +struct PairComputeFunctor<PairStyle,FULLCLUSTER,STACKPARAMS,Specialisation> { + typedef typename PairStyle::device_type device_type ; + typedef Kokkos::Vectorization<device_type,NeighClusterSize> vectorization; + typedef EV_FLOAT value_type; + + PairStyle c; + NeighListKokkos<device_type> list; + + PairComputeFunctor(PairStyle* c_ptr, + NeighListKokkos<device_type>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const { + return j >> SBBITS & 3; + } + + template<int EVFLAG, int NEWTON_PAIR> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const device_type& dev, + const NeighListKokkos<device_type> &list) const { + EV_FLOAT ev; + const int i = vectorization::global_thread_rank(dev); + + const X_FLOAT xtmp = c.c_x(i,0); + const X_FLOAT ytmp = c.c_x(i,1); + const X_FLOAT ztmp = c.c_x(i,2); + const int itype = c.type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + 
F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + const int jjj = neighbors_i(jj); + + for (int k = vectorization::begin(); k<NeighClusterSize; k+=vectorization::increment) { + const F_FLOAT factor_lj = c.special_lj[sbmask(jjj+k)]; + const int j = (jjj + k)&NEIGHMASK; + if((j==i)||(j>=c.nall)) continue; + const X_FLOAT delx = xtmp - c.c_x(j,0); + const X_FLOAT dely = ytmp - c.c_x(j,1); + const X_FLOAT delz = ztmp - c.c_x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = (delx*delx + dely*dely + delz*delz); + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + + if (EVFLAG) { + if (c.eflag) { + ev.evdwl += 0.5* + factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + if (c.COUL_FLAG) + ev.ecoul += 0.5* + factor_lj * c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + } + + if (c.vflag_either) ev_tally(ev,i,j,fpair,delx,dely,delz); + } + } + } + } + + const F_FLOAT fx = vectorization::reduce(fxtmp); + const F_FLOAT fy = vectorization::reduce(fytmp); + const F_FLOAT fz = vectorization::reduce(fztmp); + if(vectorization::is_lane_0(dev)) { + c.f(i,0) += fx; + c.f(i,1) += fy; + c.f(i,2) += fz; + } + + return ev; + } + + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const + { + const int EFLAG = c.eflag; + const int NEWTON_PAIR = c.newton_pair; + const int VFLAG = c.vflag_either; + + if (EFLAG) { + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (NEWTON_PAIR || i < c.nlocal) c.eatom[i] += epairhalf; + if (NEWTON_PAIR || j < c.nlocal) c.eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (c.vflag_global) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + + if (c.vflag_atom) { + if (i < c.nlocal) { + c.d_vatom(i,0) += 0.5*v0; + c.d_vatom(i,1) += 0.5*v1; + c.d_vatom(i,2) += 0.5*v2; + c.d_vatom(i,3) += 0.5*v3; + c.d_vatom(i,4) += 0.5*v4; + c.d_vatom(i,5) += 0.5*v5; + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const device_type& dev) const { + if (c.newton_pair) compute_item<0,1>(dev,list); + else compute_item<0,0>(dev,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const device_type& dev, value_type &energy_virial) const { + if (c.newton_pair) + energy_virial += compute_item<1,1>(dev,list); + else + energy_virial += compute_item<1,0>(dev,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } + 
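+  // In this FULLCLUSTER specialization each stored neighbor entry jjj is, in
+  // effect, expanded by the vector lanes into NeighClusterSize consecutive
+  // indices j = (jjj + k) & NEIGHMASK, skipping j == i and j >= nall.  The
+  // per-lane force sums are combined with vectorization::reduce() and only
+  // lane 0 (vectorization::is_lane_0) writes into f(i,*).  Because the
+  // underlying list is a full list, energy and virial contributions carry a
+  // 0.5 factor so each pair is counted once in the global tallies.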
+ +}; + +template <class PairStyle, bool STACKPARAMS, class Specialisation> +struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation> { + typedef typename PairStyle::device_type device_type ; + typedef EV_FLOAT value_type; + + PairStyle c; + NeighListKokkos<device_type> list; + + PairComputeFunctor(PairStyle* c_ptr, + NeighListKokkos<device_type>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const { + return j >> SBBITS & 3; + } + + template<int EVFLAG, int NEWTON_PAIR> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const int& ii, + const NeighListKokkos<device_type> &list) const { + EV_FLOAT ev; + const int i = ii;//list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + + //const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = c.nall; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + int j = jj;//neighbors_i(jj); + if(i==j) continue; + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + + if (EVFLAG) { + if (c.eflag) { + ev.evdwl += 0.5* + factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + if (c.COUL_FLAG) + ev.ecoul += 0.5* + factor_lj * c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + } + + if (c.vflag_either) ev_tally(ev,i,j,fpair,delx,dely,delz); + } + } + } + + c.f(i,0) += fxtmp; + c.f(i,1) += fytmp; + c.f(i,2) += fztmp; + + return ev; + } + + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const + { + const int EFLAG = c.eflag; + const int VFLAG = c.vflag_either; + + if (EFLAG) { + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (i < c.nlocal) c.eatom[i] += epairhalf; + if (j < c.nlocal) c.eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (c.vflag_global) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + + if (c.vflag_atom) { + if (i < c.nlocal) { + c.d_vatom(i,0) += 0.5*v0; + c.d_vatom(i,1) += 0.5*v1; + c.d_vatom(i,2) += 0.5*v2; + c.d_vatom(i,3) += 0.5*v3; + c.d_vatom(i,4) += 0.5*v4; + c.d_vatom(i,5) += 0.5*v5; + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + compute_item<0,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &energy_virial) const { + energy_virial += compute_item<1,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 
0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } + + +}; + +template<class PairStyle, class Specialisation> +EV_FLOAT pair_compute (PairStyle* fpair, NeighListKokkos<typename PairStyle::device_type>* list) { + EV_FLOAT ev; + if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { + if (fpair->neighflag == FULL) { + PairComputeFunctor<PairStyle,FULL,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALFTHREAD) { + PairComputeFunctor<PairStyle,HALFTHREAD,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALF) { + PairComputeFunctor<PairStyle,HALF,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == N2) { + PairComputeFunctor<PairStyle,N2,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(fpair->nlocal,ff,ev); + else Kokkos::parallel_for(fpair->nlocal,ff); + } else if (fpair->neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairStyle,FULLCLUSTER,false,Specialisation > + f_type; + f_type ff(fpair, list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 
256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),ff,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),ff); + } + } else { + if (fpair->neighflag == FULL) { + PairComputeFunctor<PairStyle,FULL,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALFTHREAD) { + PairComputeFunctor<PairStyle,HALFTHREAD,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALF) { + PairComputeFunctor<PairStyle,HALF,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == N2) { + PairComputeFunctor<PairStyle,N2,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(fpair->nlocal,ff,ev); + else Kokkos::parallel_for(fpair->nlocal,ff); + } else if (fpair->neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairStyle,FULLCLUSTER,true,Specialisation > + f_type; + f_type ff(fpair, list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),ff,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),ff); + } + } + return ev; +} + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/pair_lj_cut_kokkos.cpp b/src/KOKKOS/pair_lj_cut_kokkos.cpp new file mode 100644 index 000000000..94576a36c --- /dev/null +++ b/src/KOKKOS/pair_lj_cut_kokkos.cpp @@ -0,0 +1,267 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "string.h" +#include "pair_lj_cut_kokkos.h" +#include "kokkos.h" +#include "atom_kokkos.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "math_const.h" +#include "memory.h" +#include "error.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +#define KOKKOS_CUDA_MAX_THREADS 256 +#define KOKKOS_CUDA_MIN_BLOCKS 8 + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +PairLJCutKokkos<DeviceType>::PairLJCutKokkos(LAMMPS *lmp) : PairLJCut(lmp) +{ + respa_enable = 0; + + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK; + datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK; + cutsq = NULL; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +PairLJCutKokkos<DeviceType>::~PairLJCutKokkos() +{ + if (allocated) { + k_cutsq = DAT::tdual_ffloat_2d(); + memory->sfree(cutsq); + cutsq = NULL; + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::cleanup_copy() { + // WHY needed: this prevents parent copy from deallocating any arrays + allocated = 0; + cutsq = NULL; + eatom = NULL; + vatom = NULL; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::compute(int eflag_in, int vflag_in) +{ + eflag = eflag_in; + vflag = vflag_in; + + + if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1; + + double evdwl = 0.0; + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + atomKK->sync(execution_space,datamask_read); + k_cutsq.template sync<DeviceType>(); + k_params.template sync<DeviceType>(); + if (eflag || vflag) atomKK->modified(execution_space,datamask_modify); + else atomKK->modified(execution_space,F_MASK); + + x = atomKK->k_x.view<DeviceType>(); + c_x = atomKK->k_x.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + nlocal = atom->nlocal; + nall = atom->nlocal + atom->nghost; + special_lj[0] = force->special_lj[0]; + special_lj[1] = force->special_lj[1]; + special_lj[2] = force->special_lj[2]; + special_lj[3] = force->special_lj[3]; + newton_pair = force->newton_pair; + + // loop over neighbors of my atoms + + EV_FLOAT ev = pair_compute<PairLJCutKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list); + + DeviceType::fence(); + + if (eflag) eng_vdwl += ev.evdwl; + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairLJCutKokkos<DeviceType>:: +compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + const F_FLOAT r2inv = 1.0/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + + const F_FLOAT forcelj = r6inv * + 
((STACKPARAMS?m_params[itype][jtype].lj1:params(itype,jtype).lj1)*r6inv - + (STACKPARAMS?m_params[itype][jtype].lj2:params(itype,jtype).lj2)); + return forcelj*r2inv; +} + +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairLJCutKokkos<DeviceType>:: +compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + const F_FLOAT r2inv = 1.0/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + return r6inv*((STACKPARAMS?m_params[itype][jtype].lj3:params(itype,jtype).lj3)*r6inv - + (STACKPARAMS?m_params[itype][jtype].lj4:params(itype,jtype).lj4)) - + (STACKPARAMS?m_params[itype][jtype].offset:params(itype,jtype).offset); +} + +/* ---------------------------------------------------------------------- + allocate all arrays +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::allocate() +{ + PairLJCut::allocate(); + + int n = atom->ntypes; + memory->destroy(cutsq); + memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq"); + d_cutsq = k_cutsq.template view<DeviceType>(); + k_params = Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>("PairLJCut::params",n+1,n+1); + params = k_params.d_view; +} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::settings(int narg, char **arg) +{ + if (narg > 2) error->all(FLERR,"Illegal pair_style command"); + + PairLJCut::settings(1,arg); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::init_style() +{ + PairLJCut::init_style(); + + // error if rRESPA with inner levels + + if (update->whichflag == 1 && strstr(update->integrate_style,"respa")) { + int respa = 0; + if (((Respa *) update->integrate)->level_inner >= 0) respa = 1; + if (((Respa *) update->integrate)->level_middle >= 0) respa = 2; + if (respa) + error->all(FLERR,"Cannot use Kokkos pair style with rRESPA inner/middle"); + } + + // irequest = neigh request made by parent class + + neighflag = lmp->kokkos->neighflag; + int irequest = neighbor->nrequest - 1; + + neighbor->requests[irequest]-> + kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value && + !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + neighbor->requests[irequest]-> + kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + + if (neighflag == FULL) { + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == HALF || neighflag == HALFTHREAD) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 1; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == N2) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == FULLCLUSTER) { + neighbor->requests[irequest]->full_cluster = 1; + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + } else { + error->all(FLERR,"Cannot use chosen neighbor list style with lj/cut/kk"); + } +} + +/* 
---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PairLJCutKokkos<DeviceType>::init_one(int i, int j) +{ + double cutone = PairLJCut::init_one(i,j); + + k_params.h_view(i,j).lj1 = lj1[i][j]; + k_params.h_view(i,j).lj2 = lj2[i][j]; + k_params.h_view(i,j).lj3 = lj3[i][j]; + k_params.h_view(i,j).lj4 = lj4[i][j]; + k_params.h_view(i,j).offset = offset[i][j]; + k_params.h_view(i,j).cutsq = cutone*cutone; + k_params.h_view(j,i) = k_params.h_view(i,j); + if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) { + m_params[i][j] = m_params[j][i] = k_params.h_view(i,j); + m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone; + } + k_cutsq.h_view(i,j) = cutone*cutone; + k_cutsq.template modify<LMPHostType>(); + k_params.template modify<LMPHostType>(); + + return cutone; +} + + + +template class PairLJCutKokkos<LMPDeviceType>; +#if DEVICE==2 +template class PairLJCutKokkos<LMPHostType>; +#endif diff --git a/src/KOKKOS/pair_lj_cut_kokkos.h b/src/KOKKOS/pair_lj_cut_kokkos.h new file mode 100644 index 000000000..5c3c002af --- /dev/null +++ b/src/KOKKOS/pair_lj_cut_kokkos.h @@ -0,0 +1,112 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/cut/kk,PairLJCutKokkos<LMPDeviceType>) +PairStyle(lj/cut/kk/device,PairLJCutKokkos<LMPDeviceType>) +PairStyle(lj/cut/kk/host,PairLJCutKokkos<LMPHostType>) + +#else + +#ifndef LMP_PAIR_LJ_CUT_KOKKOS_H +#define LMP_PAIR_LJ_CUT_KOKKOS_H + +#include "pair_kokkos.h" +#include "pair_lj_cut.h" +#include "neigh_list_kokkos.h" + +namespace LAMMPS_NS { + +template<class DeviceType> +class PairLJCutKokkos : public PairLJCut { + public: + enum {COUL_FLAG=0}; + typedef DeviceType device_type; + PairLJCutKokkos(class LAMMPS *); + ~PairLJCutKokkos(); + + void compute(int, int); + + void settings(int, char **); + void init_style(); + double init_one(int, int); + + struct params_lj{ + params_lj(){cutsq=0,lj1=0;lj2=0;lj3=0;lj4=0;offset=0;}; + params_lj(int i){cutsq=0,lj1=0;lj2=0;lj3=0;lj4=0;offset=0;}; + F_FLOAT cutsq,lj1,lj2,lj3,lj4,offset; + }; + + protected: + void cleanup_copy(); + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + return 0; + } + + + Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType> k_params; + typename Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>::t_dev_const params; + params_lj m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1]; // hardwired to space for 15 atom types + F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1]; + typename ArrayTypes<DeviceType>::t_x_array_randomread x; + typename ArrayTypes<DeviceType>::t_x_array c_x; + typename ArrayTypes<DeviceType>::t_f_array f; + typename ArrayTypes<DeviceType>::t_int_1d_randomread type; + typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; + typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + //typename ArrayTypes<DeviceType>::t_ffloat_1d special_lj; + + int newton_pair; + + typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq; + typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq; + + class AtomKokkos *atomKK; + int neighflag; + int nlocal,nall,eflag,vflag; + + void allocate(); + friend class PairComputeFunctor<PairLJCutKokkos,FULL,true>; + friend class PairComputeFunctor<PairLJCutKokkos,HALF,true>; + friend class PairComputeFunctor<PairLJCutKokkos,HALFTHREAD,true>; + friend class PairComputeFunctor<PairLJCutKokkos,N2,true>; + friend class PairComputeFunctor<PairLJCutKokkos,FULLCLUSTER,true >; + friend class PairComputeFunctor<PairLJCutKokkos,FULL,false>; + friend class PairComputeFunctor<PairLJCutKokkos,HALF,false>; + friend class PairComputeFunctor<PairLJCutKokkos,HALFTHREAD,false>; + friend class PairComputeFunctor<PairLJCutKokkos,N2,false>; + friend class PairComputeFunctor<PairLJCutKokkos,FULLCLUSTER,false >; + friend EV_FLOAT pair_compute<PairLJCutKokkos,void>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*); + +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/pair_table_kokkos.cpp b/src/KOKKOS/pair_table_kokkos.cpp new file mode 100644 index 000000000..cc8072991 --- /dev/null +++ 
b/src/KOKKOS/pair_table_kokkos.cpp @@ -0,0 +1,1500 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "math.h" +#include "stdlib.h" +#include "string.h" +#include "pair_table_kokkos.h" +#include "kokkos.h" +#include "atom.h" +#include "force.h" +#include "comm.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "error.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; + +enum{NONE,RLINEAR,RSQ,BMP}; +enum{FULL,HALFTHREAD,HALF}; + +#define MAXLINE 1024 + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +PairTableKokkos<DeviceType>::PairTableKokkos(LAMMPS *lmp) : Pair(lmp) +{ + update_table = 0; + atomKK = (AtomKokkos *) atom; + ntables = 0; + tables = NULL; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK; + datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK; + h_table = new TableHost(); + d_table = new TableDevice(); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +PairTableKokkos<DeviceType>::~PairTableKokkos() +{ +/* for (int m = 0; m < ntables; m++) free_table(&tables[m]); + memory->sfree(tables); + + if (allocated) { + memory->destroy(setflag); + memory->destroy(cutsq); + memory->destroy(tabindex); + }*/ + delete h_table; + delete d_table; + +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::compute(int eflag_in, int vflag_in) +{ + if(update_table) + create_kokkos_tables(); + if(tabstyle == LOOKUP) + compute_style<LOOKUP>(eflag_in,vflag_in); + if(tabstyle == LINEAR) + compute_style<LINEAR>(eflag_in,vflag_in); + if(tabstyle == SPLINE) + compute_style<SPLINE>(eflag_in,vflag_in); + if(tabstyle == BITMAP) + compute_style<BITMAP>(eflag_in,vflag_in); +} + +template<class DeviceType> +template<int TABSTYLE> +void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in) +{ + eflag = eflag_in; + vflag = vflag_in; + + if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1; + + double evdwl = 0.0; + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + atomKK->sync(execution_space,datamask_read); + //k_cutsq.template sync<DeviceType>(); + //k_params.template sync<DeviceType>(); + if (eflag || vflag) atomKK->modified(execution_space,datamask_modify); + else atomKK->modified(execution_space,F_MASK); + + x = c_x = atomKK->k_x.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + nlocal = atom->nlocal; + nall = atom->nlocal + 
atom->nghost; + special_lj[0] = force->special_lj[0]; + special_lj[1] = force->special_lj[1]; + special_lj[2] = force->special_lj[2]; + special_lj[3] = force->special_lj[3]; + newton_pair = force->newton_pair; + d_cutsq = d_table->cutsq; + // loop over neighbors of my atoms + + EV_FLOAT ev; + if(atom->ntypes > MAX_TYPES_STACKPARAMS) { + if (neighflag == FULL) { + PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,false,S_TableCompute<DeviceType,TABSTYLE> > + ff(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (neighflag == HALFTHREAD) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,false,S_TableCompute<DeviceType,TABSTYLE> > + ff(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (neighflag == HALF) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,false,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == N2) { + PairComputeFunctor<PairTableKokkos<DeviceType>,N2,false,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev); + else Kokkos::parallel_for(nlocal,f); + } else if (neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairTableKokkos<DeviceType>,FULLCLUSTER,false,S_TableCompute<DeviceType,TABSTYLE> > + f_type; + f_type f(this,(NeighListKokkos<DeviceType>*) list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 
256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (eflag || vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),f,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),f); + } + } else { + if (neighflag == FULL) { + PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == HALFTHREAD) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == HALF) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == N2) { + PairComputeFunctor<PairTableKokkos<DeviceType>,N2,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev); + else Kokkos::parallel_for(nlocal,f); + } else if (neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairTableKokkos<DeviceType>,FULLCLUSTER,true,S_TableCompute<DeviceType,TABSTYLE> > + f_type; + f_type f(this,(NeighListKokkos<DeviceType>*) list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 
256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (eflag || vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),f,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),f); + } + } + DeviceType::fence(); + + if (eflag) eng_vdwl += ev.evdwl; + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairTableKokkos<DeviceType>:: +compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + union_int_float_t rsq_lookup; + double fpair; + const int tidx = d_table_const.tabindex(itype,jtype); + //const Table* const tb = &tables[tabindex[itype][jtype]]; + + //if (rsq < d_table_const.innersq(tidx)) + // error->one(FLERR,"Pair distance < table inner cutoff"); + + if (Specialisation::TabStyle == LOOKUP) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + fpair = d_table_const.f(tidx,itable); + } else if (Specialisation::TabStyle == LINEAR) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + } else if (Specialisation::TabStyle == SPLINE) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double a = 1.0 - b; + fpair = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) + + ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) * + d_table_const.deltasq6(tidx); + } else { + rsq_lookup.f = rsq; + int itable = rsq_lookup.i & d_table_const.nmask(tidx); + itable >>= d_table_const.nshiftbits(tidx); + const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable); + fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + } + return fpair; +} + +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairTableKokkos<DeviceType>:: +compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + double evdwl; + union_int_float_t rsq_lookup; + const int tidx = d_table_const.tabindex(itype,jtype); + //const Table* const tb = &tables[tabindex[itype][jtype]]; + + //if (rsq < d_table_const.innersq(tidx)) + // error->one(FLERR,"Pair distance < table inner cutoff"); + + if (Specialisation::TabStyle == LOOKUP) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + evdwl = d_table_const.e(tidx,itable); + } else if 
(Specialisation::TabStyle == LINEAR) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } else if (Specialisation::TabStyle == SPLINE) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double a = 1.0 - b; + evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) + + ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) * + d_table_const.deltasq6(tidx); + } else { + rsq_lookup.f = rsq; + int itable = rsq_lookup.i & d_table_const.nmask(tidx); + itable >>= d_table_const.nshiftbits(tidx); + const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable); + evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } + return evdwl; +} + +/* +template<class DeviceType> +template<int EVFLAG, int NEIGHFLAG, int NEWTON_PAIR,int TABSTYLE> +KOKKOS_FUNCTION +EV_FLOAT PairTableKokkos<DeviceType>:: +compute_item(const int &ii, const NeighListKokkos<DeviceType> &list) const +{ + EV_FLOAT ev; + const int tlm1 = tablength - 1; + union_int_float_t rsq_lookup; + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = x(i,0); + const X_FLOAT ytmp = x(i,1); + const X_FLOAT ztmp = x(i,2); + const int itype = type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + int j = neighbors_i(jj); + const F_FLOAT factor_lj = 1.0; //special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - x(j,0); + const X_FLOAT dely = ytmp - x(j,1); + const X_FLOAT delz = ztmp - x(j,2); + const int jtype = type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq < d_table_const.cutsq(itype,jtype)) { + double fpair; + const int tidx = d_table_const.tabindex(itype,jtype); + //const Table* const tb = &tables[tabindex[itype][jtype]]; + + //if (rsq < d_table_const.innersq(tidx)) + // error->one(FLERR,"Pair distance < table inner cutoff"); + + if (TABSTYLE == LOOKUP) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + fpair = factor_lj * d_table_const.f(tidx,itable); + if (EVFLAG) + ev.evdwl = d_table_const.e(tidx,itable); + } else if (TABSTYLE == LINEAR) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double value = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + fpair = factor_lj * value; + if (EVFLAG) + ev.evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } else if (TABSTYLE == SPLINE) { + const int itable = 
static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double a = 1.0 - b; + const double value = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) + + ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) * + d_table_const.deltasq6(tidx); + fpair = factor_lj * value; + if (EVFLAG) + ev.evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) + + ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) * + d_table_const.deltasq6(tidx); + } else { + rsq_lookup.f = rsq; + int itable = rsq_lookup.i & d_table_const.nmask(tidx); + itable >>= d_table_const.nshiftbits(tidx); + const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable); + const double value = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + fpair = factor_lj * value; + if (EVFLAG) + ev.evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } + + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + if ((NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) { + Kokkos::atomic_fetch_add(&f(j,0),-delx*fpair); + Kokkos::atomic_fetch_add(&f(j,1),-dely*fpair); + Kokkos::atomic_fetch_add(&f(j,2),-delz*fpair); + } + + if ((NEIGHFLAG==HALF) && (NEWTON_PAIR || j < nlocal)) { + f(j,0) -= delx*fpair; + f(j,1) -= dely*fpair; + f(j,2) -= delz*fpair; + } + + if(EVFLAG) { + if (eflag) { + ev.evdwl *= factor_lj; + } + + if (evflag) ev_tally<NEIGHFLAG>(ev,i,j +,fpair,delx,dely,delz); + } + } + } + + if (NEIGHFLAG == HALFTHREAD) { + Kokkos::atomic_fetch_add(&f(i,0),fxtmp); + Kokkos::atomic_fetch_add(&f(i,1),fytmp); + Kokkos::atomic_fetch_add(&f(i,2),fztmp); + } else { + f(i,0) += fxtmp; + f(i,1) += fytmp; + f(i,2) += fztmp; + } + + return ev; +} +*/ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::create_kokkos_tables() +{ + const int tlm1 = tablength-1; + + memory->create_kokkos(d_table->nshiftbits,h_table->nshiftbits,ntables,"Table::nshiftbits"); + memory->create_kokkos(d_table->nmask,h_table->nmask,ntables,"Table::nmask"); + memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq"); + memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta"); + memory->create_kokkos(d_table->deltasq6,h_table->deltasq6,ntables,"Table::deltasq6"); + + if(tabstyle == LOOKUP) { + memory->create_kokkos(d_table->e,h_table->e,ntables,tlm1,"Table::e"); + memory->create_kokkos(d_table->f,h_table->f,ntables,tlm1,"Table::f"); + } + + if(tabstyle == LINEAR) { + memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq"); + memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e"); + memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f"); + memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de"); + memory->create_kokkos(d_table->df,h_table->df,ntables,tlm1,"Table::df"); + } + + if(tabstyle == SPLINE) { + memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq"); + memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e"); + memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f"); + memory->create_kokkos(d_table->e2,h_table->e2,ntables,tablength,"Table::e2"); + 
memory->create_kokkos(d_table->f2,h_table->f2,ntables,tablength,"Table::f2"); + } + + if(tabstyle == BITMAP) { + int ntable = 1 << tablength; + memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,ntable,"Table::rsq"); + memory->create_kokkos(d_table->e,h_table->e,ntables,ntable,"Table::e"); + memory->create_kokkos(d_table->f,h_table->f,ntables,ntable,"Table::f"); + memory->create_kokkos(d_table->de,h_table->de,ntables,ntable,"Table::de"); + memory->create_kokkos(d_table->df,h_table->df,ntables,ntable,"Table::df"); + memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq"); + } + + + + for(int i=0; i < ntables; i++) { + Table* tb = &tables[i]; + + h_table->nshiftbits[i] = tb->nshiftbits; + h_table->nmask[i] = tb->nmask; + h_table->innersq[i] = tb->innersq; + h_table->invdelta[i] = tb->invdelta; + h_table->deltasq6[i] = tb->deltasq6; + + for(int j = 0; j<h_table->rsq.dimension_1(); j++) + h_table->rsq(i,j) = tb->rsq[j]; + for(int j = 0; j<h_table->drsq.dimension_1(); j++) + h_table->drsq(i,j) = tb->drsq[j]; + for(int j = 0; j<h_table->e.dimension_1(); j++) + h_table->e(i,j) = tb->e[j]; + for(int j = 0; j<h_table->de.dimension_1(); j++) + h_table->de(i,j) = tb->de[j]; + for(int j = 0; j<h_table->f.dimension_1(); j++) + h_table->f(i,j) = tb->f[j]; + for(int j = 0; j<h_table->df.dimension_1(); j++) + h_table->df(i,j) = tb->df[j]; + for(int j = 0; j<h_table->e2.dimension_1(); j++) + h_table->e2(i,j) = tb->e2[j]; + for(int j = 0; j<h_table->f2.dimension_1(); j++) + h_table->f2(i,j) = tb->f2[j]; + } + + + Kokkos::deep_copy(d_table->nshiftbits,h_table->nshiftbits); + Kokkos::deep_copy(d_table->nmask,h_table->nmask); + Kokkos::deep_copy(d_table->innersq,h_table->innersq); + Kokkos::deep_copy(d_table->invdelta,h_table->invdelta); + Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6); + Kokkos::deep_copy(d_table->rsq,h_table->rsq); + Kokkos::deep_copy(d_table->drsq,h_table->drsq); + Kokkos::deep_copy(d_table->e,h_table->e); + Kokkos::deep_copy(d_table->de,h_table->de); + Kokkos::deep_copy(d_table->f,h_table->f); + Kokkos::deep_copy(d_table->df,h_table->df); + Kokkos::deep_copy(d_table->e2,h_table->e2); + Kokkos::deep_copy(d_table->f2,h_table->f2); + Kokkos::deep_copy(d_table->tabindex,h_table->tabindex); + + d_table_const.nshiftbits = d_table->nshiftbits; + d_table_const.nmask = d_table->nmask; + d_table_const.innersq = d_table->innersq; + d_table_const.invdelta = d_table->invdelta; + d_table_const.deltasq6 = d_table->deltasq6; + d_table_const.rsq = d_table->rsq; + d_table_const.drsq = d_table->drsq; + d_table_const.e = d_table->e; + d_table_const.de = d_table->de; + d_table_const.f = d_table->f; + d_table_const.df = d_table->df; + d_table_const.e2 = d_table->e2; + d_table_const.f2 = d_table->f2; + + + Kokkos::deep_copy(d_table->cutsq,h_table->cutsq); + update_table = 0; +} + +/* ---------------------------------------------------------------------- + allocate all arrays +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::allocate() +{ + allocated = 1; + const int nt = atom->ntypes + 1; + + memory->create(setflag,nt,nt,"pair:setflag"); + memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,nt,nt,"pair:cutsq"); + memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,nt,nt,"pair:tabindex"); + + d_table_const.cutsq = d_table->cutsq; + d_table_const.tabindex = d_table->tabindex; + memset(&setflag[0][0],0,nt*nt*sizeof(int)); + 
memset(&cutsq[0][0],0,nt*nt*sizeof(double)); + memset(&tabindex[0][0],0,nt*nt*sizeof(int)); +} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::settings(int narg, char **arg) +{ + if (narg < 2) error->all(FLERR,"Illegal pair_style command"); + + // new settings + + if (strcmp(arg[0],"lookup") == 0) tabstyle = LOOKUP; + else if (strcmp(arg[0],"linear") == 0) tabstyle = LINEAR; + else if (strcmp(arg[0],"spline") == 0) tabstyle = SPLINE; + else if (strcmp(arg[0],"bitmap") == 0) tabstyle = BITMAP; + else error->all(FLERR,"Unknown table style in pair_style command"); + + tablength = force->inumeric(FLERR,arg[1]); + if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries"); + + // optional keywords + // assert the tabulation is compatible with a specific long-range solver + + int iarg = 2; + while (iarg < narg) { + if (strcmp(arg[iarg],"ewald") == 0) ewaldflag = 1; + else if (strcmp(arg[iarg],"pppm") == 0) pppmflag = 1; + else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1; + else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1; + else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1; + else error->all(FLERR,"Illegal pair_style command"); + iarg++; + } + + // delete old tables, since cannot just change settings + + for (int m = 0; m < ntables; m++) free_table(&tables[m]); + memory->sfree(tables); + + if (allocated) { + memory->destroy(setflag); + + d_table_const.tabindex = d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d(); + h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d(); + + d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d(); + h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d(); + } + allocated = 0; + + ntables = 0; + tables = NULL; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::coeff(int narg, char **arg) +{ + if (narg != 4 && narg != 5) error->all(FLERR,"Illegal pair_coeff command"); + if (!allocated) allocate(); + + int ilo,ihi,jlo,jhi; + force->bounds(arg[0],atom->ntypes,ilo,ihi); + force->bounds(arg[1],atom->ntypes,jlo,jhi); + + int me; + MPI_Comm_rank(world,&me); + tables = (Table *) + memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables"); + Table *tb = &tables[ntables]; + null_table(tb); + if (me == 0) read_table(tb,arg[2],arg[3]); + bcast_table(tb); + + // set table cutoff + + if (narg == 5) tb->cut = force->numeric(FLERR,arg[4]); + else if (tb->rflag) tb->cut = tb->rhi; + else tb->cut = tb->rfile[tb->ninput-1]; + + // error check on table parameters + // insure cutoff is within table + // for BITMAP tables, file values can be in non-ascending order + + if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length"); + double rlo,rhi; + if (tb->rflag == 0) { + rlo = tb->rfile[0]; + rhi = tb->rfile[tb->ninput-1]; + } else { + rlo = tb->rlo; + rhi = tb->rhi; + } + if (tb->cut <= rlo || tb->cut > rhi) + error->all(FLERR,"Invalid pair table cutoff"); + if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff"); + + // match = 1 if don't need to spline read-in tables + // this is only the case if r values needed by final tables + // exactly match r values read from 
file + // for tabstyle SPLINE, always need to build spline tables + + tb->match = 0; + if (tabstyle == LINEAR && tb->ninput == tablength && + tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1; + if (tabstyle == BITMAP && tb->ninput == 1 << tablength && + tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1; + if (tb->rflag == BMP && tb->match == 0) + error->all(FLERR,"Bitmapped table in file does not match requested table"); + + // spline read-in values and compute r,e,f vectors within table + + if (tb->match == 0) spline_table(tb); + compute_table(tb); + + // store ptr to table in tabindex + + int count = 0; + for (int i = ilo; i <= ihi; i++) { + for (int j = MAX(jlo,i); j <= jhi; j++) { + tabindex[i][j] = ntables; + setflag[i][j] = 1; + count++; + } + } + + if (count == 0) error->all(FLERR,"Illegal pair_coeff command"); + ntables++; +} + +/* ---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PairTableKokkos<DeviceType>::init_one(int i, int j) +{ + if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set"); + + tabindex[j][i] = tabindex[i][j]; + + if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) { + m_cutsq[j][i] = m_cutsq[i][j] = tables[tabindex[i][j]].cut*tables[tabindex[i][j]].cut; + } + + return tables[tabindex[i][j]].cut; +} + +/* ---------------------------------------------------------------------- + read a table section from a tabulated potential file + only called by proc 0 + this function sets these values in Table: + ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::read_table(Table *tb, char *file, char *keyword) +{ + char line[MAXLINE]; + + // open file + + FILE *fp = force->open_potential(file); + if (fp == NULL) { + char str[128]; + sprintf(str,"Cannot open file %s",file); + error->one(FLERR,str); + } + + // loop until section found with matching keyword + + while (1) { + if (fgets(line,MAXLINE,fp) == NULL) + error->one(FLERR,"Did not find keyword in table file"); + if (strspn(line," \t\n\r") == strlen(line)) continue; // blank line + if (line[0] == '#') continue; // comment + char *word = strtok(line," \t\n\r"); + if (strcmp(word,keyword) == 0) break; // matching keyword + fgets(line,MAXLINE,fp); // no match, skip section + param_extract(tb,line); + fgets(line,MAXLINE,fp); + for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp); + } + + // read args on 2nd line of section + // allocate table arrays for file values + + fgets(line,MAXLINE,fp); + param_extract(tb,line); + memory->create(tb->rfile,tb->ninput,"pair:rfile"); + memory->create(tb->efile,tb->ninput,"pair:efile"); + memory->create(tb->ffile,tb->ninput,"pair:ffile"); + + // setup bitmap parameters for table to read in + + tb->ntablebits = 0; + int masklo,maskhi,nmask,nshiftbits; + if (tb->rflag == BMP) { + while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++; + if (1 << tb->ntablebits != tb->ninput) + error->one(FLERR,"Bitmapped table is incorrect length in table file"); + init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits); + } + + // read r,e,f table values from file + // if rflag set, compute r + // if rflag not set, use r from file + + int itmp; + double rtmp; + union_int_float_t rsq_lookup; + + 
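+  // illustrative sketch of the expected file layout (hypothetical values,
+  // added only as a reading aid):
+  //
+  //   MY_TABLE                <- keyword given in the pair_coeff command
+  //   N 4 R 1.0 4.0           <- parameter line parsed by param_extract()
+  //                           <- blank line
+  //   1 1.0 2.50 -1.20        <- index, r, energy, force
+  //   2 2.0 0.75 -0.55
+  //   3 3.0 0.10 -0.12
+  //   4 4.0 0.00  0.00
+  //
+  // the first fgets() below consumes the blank line; the loop then reads the
+  // N data lines and recomputes r when an R/RSQ/BITMAP spacing was requested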
fgets(line,MAXLINE,fp); + for (int i = 0; i < tb->ninput; i++) { + fgets(line,MAXLINE,fp); + sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]); + + if (tb->rflag == RLINEAR) + rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1); + else if (tb->rflag == RSQ) { + rtmp = tb->rlo*tb->rlo + + (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1); + rtmp = sqrt(rtmp); + } else if (tb->rflag == BMP) { + rsq_lookup.i = i << nshiftbits; + rsq_lookup.i |= masklo; + if (rsq_lookup.f < tb->rlo*tb->rlo) { + rsq_lookup.i = i << nshiftbits; + rsq_lookup.i |= maskhi; + } + rtmp = sqrtf(rsq_lookup.f); + } + + tb->rfile[i] = rtmp; + } + + // close file + + fclose(fp); +} + +/* ---------------------------------------------------------------------- + broadcast read-in table info from proc 0 to other procs + this function communicates these values in Table: + ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::bcast_table(Table *tb) +{ + MPI_Bcast(&tb->ninput,1,MPI_INT,0,world); + + int me; + MPI_Comm_rank(world,&me); + if (me > 0) { + memory->create(tb->rfile,tb->ninput,"pair:rfile"); + memory->create(tb->efile,tb->ninput,"pair:efile"); + memory->create(tb->ffile,tb->ninput,"pair:ffile"); + } + + MPI_Bcast(tb->rfile,tb->ninput,MPI_DOUBLE,0,world); + MPI_Bcast(tb->efile,tb->ninput,MPI_DOUBLE,0,world); + MPI_Bcast(tb->ffile,tb->ninput,MPI_DOUBLE,0,world); + + MPI_Bcast(&tb->rflag,1,MPI_INT,0,world); + if (tb->rflag) { + MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world); + MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world); + } + MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world); + if (tb->fpflag) { + MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world); + MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world); + } +} + +/* ---------------------------------------------------------------------- + build spline representation of e,f over entire range of read-in table + this function sets these values in Table: e2file,f2file +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::spline_table(Table *tb) +{ + memory->create(tb->e2file,tb->ninput,"pair:e2file"); + memory->create(tb->f2file,tb->ninput,"pair:f2file"); + + double ep0 = - tb->ffile[0]; + double epn = - tb->ffile[tb->ninput-1]; + spline(tb->rfile,tb->efile,tb->ninput,ep0,epn,tb->e2file); + + if (tb->fpflag == 0) { + tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]); + tb->fphi = (tb->ffile[tb->ninput-1] - tb->ffile[tb->ninput-2]) / + (tb->rfile[tb->ninput-1] - tb->rfile[tb->ninput-2]); + } + + double fp0 = tb->fplo; + double fpn = tb->fphi; + spline(tb->rfile,tb->ffile,tb->ninput,fp0,fpn,tb->f2file); +} + +/* ---------------------------------------------------------------------- + extract attributes from parameter line in table section + format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi + N is required, other params are optional +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::param_extract(Table *tb, char *line) +{ + tb->ninput = 0; + tb->rflag = NONE; + tb->fpflag = 0; + + char *word = strtok(line," \t\n\r\f"); + while (word) { + if (strcmp(word,"N") == 0) { + word = strtok(NULL," \t\n\r\f"); + tb->ninput = atoi(word); + } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 || + strcmp(word,"BITMAP") == 0) { + if 
(strcmp(word,"R") == 0) tb->rflag = RLINEAR; + else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ; + else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP; + word = strtok(NULL," \t\n\r\f"); + tb->rlo = atof(word); + word = strtok(NULL," \t\n\r\f"); + tb->rhi = atof(word); + } else if (strcmp(word,"FP") == 0) { + tb->fpflag = 1; + word = strtok(NULL," \t\n\r\f"); + tb->fplo = atof(word); + word = strtok(NULL," \t\n\r\f"); + tb->fphi = atof(word); + } else { + printf("WORD: %s\n",word); + error->one(FLERR,"Invalid keyword in pair table parameters"); + } + word = strtok(NULL," \t\n\r\f"); + } + + if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N"); +} + +/* ---------------------------------------------------------------------- + compute r,e,f vectors from splined values +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::compute_table(Table *tb) +{ + update_table = 1; + int tlm1 = tablength-1; + + // inner = inner table bound + // cut = outer table bound + // delta = table spacing in rsq for N-1 bins + + double inner; + if (tb->rflag) inner = tb->rlo; + else inner = tb->rfile[0]; + tb->innersq = inner*inner; + tb->delta = (tb->cut*tb->cut - tb->innersq) / tlm1; + tb->invdelta = 1.0/tb->delta; + + // direct lookup tables + // N-1 evenly spaced bins in rsq from inner to cut + // e,f = value at midpt of bin + // e,f are N-1 in length since store 1 value at bin midpt + // f is converted to f/r when stored in f[i] + // e,f are never a match to read-in values, always computed via spline interp + + if (tabstyle == LOOKUP) { + memory->create(tb->e,tlm1,"pair:e"); + memory->create(tb->f,tlm1,"pair:f"); + + double r,rsq; + for (int i = 0; i < tlm1; i++) { + rsq = tb->innersq + (i+0.5)*tb->delta; + r = sqrt(rsq); + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + } + } + + // linear tables + // N-1 evenly spaced bins in rsq from inner to cut + // rsq,e,f = value at lower edge of bin + // de,df values = delta from lower edge to upper edge of bin + // rsq,e,f are N in length so de,df arrays can compute difference + // f is converted to f/r when stored in f[i] + // e,f can match read-in values, else compute via spline interp + + if (tabstyle == LINEAR) { + memory->create(tb->rsq,tablength,"pair:rsq"); + memory->create(tb->e,tablength,"pair:e"); + memory->create(tb->f,tablength,"pair:f"); + memory->create(tb->de,tlm1,"pair:de"); + memory->create(tb->df,tlm1,"pair:df"); + + double r,rsq; + for (int i = 0; i < tablength; i++) { + rsq = tb->innersq + i*tb->delta; + r = sqrt(rsq); + tb->rsq[i] = rsq; + if (tb->match) { + tb->e[i] = tb->efile[i]; + tb->f[i] = tb->ffile[i]/r; + } else { + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + } + } + + for (int i = 0; i < tlm1; i++) { + tb->de[i] = tb->e[i+1] - tb->e[i]; + tb->df[i] = tb->f[i+1] - tb->f[i]; + } + } + + // cubic spline tables + // N-1 evenly spaced bins in rsq from inner to cut + // rsq,e,f = value at lower edge of bin + // e2,f2 = spline coefficient for each bin + // rsq,e,f,e2,f2 are N in length so have N-1 spline bins + // f is converted to f/r after e is splined + // e,f can match read-in values, else compute via spline interp + + if (tabstyle == SPLINE) { + memory->create(tb->rsq,tablength,"pair:rsq"); + memory->create(tb->e,tablength,"pair:e"); + 
memory->create(tb->f,tablength,"pair:f"); + memory->create(tb->e2,tablength,"pair:e2"); + memory->create(tb->f2,tablength,"pair:f2"); + + tb->deltasq6 = tb->delta*tb->delta / 6.0; + + double r,rsq; + for (int i = 0; i < tablength; i++) { + rsq = tb->innersq + i*tb->delta; + r = sqrt(rsq); + tb->rsq[i] = rsq; + if (tb->match) { + tb->e[i] = tb->efile[i]; + tb->f[i] = tb->ffile[i]/r; + } else { + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r); + } + } + + // ep0,epn = dh/dg at inner and at cut + // h(r) = e(r) and g(r) = r^2 + // dh/dg = (de/dr) / 2r = -f/2r + + double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq)); + double epn = - tb->f[tlm1] / (2.0 * tb->cut); + spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2); + + // fp0,fpn = dh/dg at inner and at cut + // h(r) = f(r)/r and g(r) = r^2 + // dh/dg = (1/r df/dr - f/r^2) / 2r + // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1)) + + double fp0,fpn; + double secant_factor = 0.1; + if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) / + (2.0 * sqrt(tb->innersq)); + else { + double rsq1 = tb->innersq; + double rsq2 = rsq1 + secant_factor*tb->delta; + fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) / + sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta); + } + + if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn = + (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut); + else { + double rsq2 = tb->cut * tb->cut; + double rsq1 = rsq2 - secant_factor*tb->delta; + fpn = (tb->f[tlm1] / sqrt(rsq2) - + splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) / + sqrt(rsq1)) / (secant_factor*tb->delta); + } + + for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]); + spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2); + } + + // bitmapped linear tables + // 2^N bins from inner to cut, spaced in bitmapped manner + // f is converted to f/r when stored in f[i] + // e,f can match read-in values, else compute via spline interp + + if (tabstyle == BITMAP) { + double r; + union_int_float_t rsq_lookup; + int masklo,maskhi; + + // linear lookup tables of length ntable = 2^n + // stored value = value at lower edge of bin + + init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits); + int ntable = 1 << tablength; + int ntablem1 = ntable - 1; + + memory->create(tb->rsq,ntable,"pair:rsq"); + memory->create(tb->e,ntable,"pair:e"); + memory->create(tb->f,ntable,"pair:f"); + memory->create(tb->de,ntable,"pair:de"); + memory->create(tb->df,ntable,"pair:df"); + memory->create(tb->drsq,ntable,"pair:drsq"); + + union_int_float_t minrsq_lookup; + minrsq_lookup.i = 0 << tb->nshiftbits; + minrsq_lookup.i |= maskhi; + + for (int i = 0; i < ntable; i++) { + rsq_lookup.i = i << tb->nshiftbits; + rsq_lookup.i |= masklo; + if (rsq_lookup.f < tb->innersq) { + rsq_lookup.i = i << tb->nshiftbits; + rsq_lookup.i |= maskhi; + } + r = sqrtf(rsq_lookup.f); + tb->rsq[i] = rsq_lookup.f; + if (tb->match) { + tb->e[i] = tb->efile[i]; + tb->f[i] = tb->ffile[i]/r; + } else { + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + } + minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f); + } + + tb->innersq = minrsq_lookup.f; + + for (int i = 0; i < ntablem1; i++) { + tb->de[i] = tb->e[i+1] - tb->e[i]; + tb->df[i] = tb->f[i+1] - tb->f[i]; + tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]); + } + + // get the delta values 
for the last table entries + // tables are connected periodically between 0 and ntablem1 + + tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1]; + tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1]; + tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]); + + // get the correct delta values at itablemax + // smallest r is in bin itablemin + // largest r is in bin itablemax, which is itablemin-1, + // or ntablem1 if itablemin=0 + + // deltas at itablemax only needed if corresponding rsq < cut*cut + // if so, compute deltas between rsq and cut*cut + // if tb->match, data at cut*cut is unavailable, so we'll take + // deltas at itablemax-1 as a good approximation + + double e_tmp,f_tmp; + int itablemin = minrsq_lookup.i & tb->nmask; + itablemin >>= tb->nshiftbits; + int itablemax = itablemin - 1; + if (itablemin == 0) itablemax = ntablem1; + int itablemaxm1 = itablemax - 1; + if (itablemax == 0) itablemaxm1 = ntablem1; + rsq_lookup.i = itablemax << tb->nshiftbits; + rsq_lookup.i |= maskhi; + if (rsq_lookup.f < tb->cut*tb->cut) { + if (tb->match) { + tb->de[itablemax] = tb->de[itablemaxm1]; + tb->df[itablemax] = tb->df[itablemaxm1]; + tb->drsq[itablemax] = tb->drsq[itablemaxm1]; + } else { + rsq_lookup.f = tb->cut*tb->cut; + r = sqrtf(rsq_lookup.f); + e_tmp = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + tb->de[itablemax] = e_tmp - tb->e[itablemax]; + tb->df[itablemax] = f_tmp - tb->f[itablemax]; + tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]); + } + } + } +} + +/* ---------------------------------------------------------------------- + set all ptrs in a table to NULL, so can be freed safely +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::null_table(Table *tb) +{ + tb->rfile = tb->efile = tb->ffile = NULL; + tb->e2file = tb->f2file = NULL; + tb->rsq = tb->drsq = tb->e = tb->de = NULL; + tb->f = tb->df = tb->e2 = tb->f2 = NULL; +} + +/* ---------------------------------------------------------------------- + free all arrays in a table +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::free_table(Table *tb) +{ + memory->destroy(tb->rfile); + memory->destroy(tb->efile); + memory->destroy(tb->ffile); + memory->destroy(tb->e2file); + memory->destroy(tb->f2file); + + memory->destroy(tb->rsq); + memory->destroy(tb->drsq); + memory->destroy(tb->e); + memory->destroy(tb->de); + memory->destroy(tb->f); + memory->destroy(tb->df); + memory->destroy(tb->e2); + memory->destroy(tb->f2); +} + +/* ---------------------------------------------------------------------- + spline and splint routines modified from Numerical Recipes +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::spline(double *x, double *y, int n, + double yp1, double ypn, double *y2) +{ + int i,k; + double p,qn,sig,un; + double *u = new double[n]; + + if (yp1 > 0.99e30) y2[0] = u[0] = 0.0; + else { + y2[0] = -0.5; + u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1); + } + for (i = 1; i < n-1; i++) { + sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]); + p = sig*y2[i-1] + 2.0; + y2[i] = (sig-1.0) / p; + u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]); + u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p; + } + if (ypn > 0.99e30) qn = un = 0.0; + else { + qn = 0.5; + un 
= (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2])); + } + y2[n-1] = (un-qn*u[n-2]) / (qn*y2[n-2] + 1.0); + for (k = n-2; k >= 0; k--) y2[k] = y2[k]*y2[k+1] + u[k]; + + delete [] u; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +double PairTableKokkos<DeviceType>::splint(double *xa, double *ya, double *y2a, int n, double x) +{ + int klo,khi,k; + double h,b,a,y; + + klo = 0; + khi = n-1; + while (khi-klo > 1) { + k = (khi+klo) >> 1; + if (xa[k] > x) khi = k; + else klo = k; + } + h = xa[khi]-xa[klo]; + a = (xa[khi]-x) / h; + b = (x-xa[klo]) / h; + y = a*ya[klo] + b*ya[khi] + + ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0; + return y; +} + +/* ---------------------------------------------------------------------- + proc 0 writes to restart file +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::write_restart(FILE *fp) +{ + write_restart_settings(fp); +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::read_restart(FILE *fp) +{ + read_restart_settings(fp); + allocate(); +} + +/* ---------------------------------------------------------------------- + proc 0 writes to restart file +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::write_restart_settings(FILE *fp) +{ + fwrite(&tabstyle,sizeof(int),1,fp); + fwrite(&tablength,sizeof(int),1,fp); + fwrite(&ewaldflag,sizeof(int),1,fp); + fwrite(&pppmflag,sizeof(int),1,fp); + fwrite(&msmflag,sizeof(int),1,fp); + fwrite(&dispersionflag,sizeof(int),1,fp); + fwrite(&tip4pflag,sizeof(int),1,fp); +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::read_restart_settings(FILE *fp) +{ + if (comm->me == 0) { + fread(&tabstyle,sizeof(int),1,fp); + fread(&tablength,sizeof(int),1,fp); + fread(&ewaldflag,sizeof(int),1,fp); + fread(&pppmflag,sizeof(int),1,fp); + fread(&msmflag,sizeof(int),1,fp); + fread(&dispersionflag,sizeof(int),1,fp); + fread(&tip4pflag,sizeof(int),1,fp); + } + MPI_Bcast(&tabstyle,1,MPI_INT,0,world); + MPI_Bcast(&tablength,1,MPI_INT,0,world); + MPI_Bcast(&ewaldflag,1,MPI_INT,0,world); + MPI_Bcast(&pppmflag,1,MPI_INT,0,world); + MPI_Bcast(&msmflag,1,MPI_INT,0,world); + MPI_Bcast(&dispersionflag,1,MPI_INT,0,world); + MPI_Bcast(&tip4pflag,1,MPI_INT,0,world); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +double PairTableKokkos<DeviceType>::single(int i, int j, int itype, int jtype, double rsq, + double factor_coul, double factor_lj, + double &fforce) +{ + int itable; + double fraction,value,a,b,phi; + int tlm1 = tablength - 1; + + Table *tb = &tables[tabindex[itype][jtype]]; + if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff"); + + if (tabstyle == LOOKUP) { + itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta); + if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); + fforce = factor_lj * tb->f[itable]; + } else if (tabstyle == LINEAR) { 
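+    // sketch of the linear branch below: bins are evenly spaced in rsq, so with
+    //   fraction = (rsq - rsq[itable]) * invdelta
+    //   df[itable] = f[itable+1] - f[itable]
+    // the interpolated (force/r) value is f[itable] + fraction*df[itable]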
+ itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta); + if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); + fraction = (rsq - tb->rsq[itable]) * tb->invdelta; + value = tb->f[itable] + fraction*tb->df[itable]; + fforce = factor_lj * value; + } else if (tabstyle == SPLINE) { + itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta); + if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); + b = (rsq - tb->rsq[itable]) * tb->invdelta; + a = 1.0 - b; + value = a * tb->f[itable] + b * tb->f[itable+1] + + ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) * + tb->deltasq6; + fforce = factor_lj * value; + } else { + union_int_float_t rsq_lookup; + rsq_lookup.f = rsq; + itable = rsq_lookup.i & tb->nmask; + itable >>= tb->nshiftbits; + fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable]; + value = tb->f[itable] + fraction*tb->df[itable]; + fforce = factor_lj * value; + } + + if (tabstyle == LOOKUP) + phi = tb->e[itable]; + else if (tabstyle == LINEAR || tabstyle == BITMAP) + phi = tb->e[itable] + fraction*tb->de[itable]; + else + phi = a * tb->e[itable] + b * tb->e[itable+1] + + ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6; + return factor_lj*phi; +} + +/* ---------------------------------------------------------------------- + return the Coulomb cutoff for tabled potentials + called by KSpace solvers which require that all pairwise cutoffs be the same + loop over all tables not just those indexed by tabindex[i][j] since + no way to know which tables are active since pair::init() not yet called +------------------------------------------------------------------------- */ + +template<class DeviceType> +void *PairTableKokkos<DeviceType>::extract(const char *str, int &dim) +{ + if (strcmp(str,"cut_coul") != 0) return NULL; + if (ntables == 0) error->all(FLERR,"All pair coeffs are not set"); + + double cut_coul = tables[0].cut; + for (int m = 1; m < ntables; m++) + if (tables[m].cut != cut_coul) + error->all(FLERR, + "Pair table cutoffs must all be equal to use with KSpace"); + dim = 0; + return &tables[0].cut; +} + +template<class DeviceType> +void PairTableKokkos<DeviceType>::init_style() +{ + neighbor->request(this); + neighflag = lmp->kokkos->neighflag; + int irequest = neighbor->nrequest - 1; + + neighbor->requests[irequest]-> + kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value && + !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + neighbor->requests[irequest]-> + kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + + if (neighflag == FULL) { + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == HALF || neighflag == HALFTHREAD) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 1; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == N2) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == FULLCLUSTER) { + neighbor->requests[irequest]->full_cluster = 1; + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + } else { + error->all(FLERR,"Cannot use chosen neighbor list style with table/kk"); + } +} + +/* +template <class DeviceType> template<int NEIGHFLAG> +KOKKOS_INLINE_FUNCTION +void PairTableKokkos<DeviceType>:: +ev_tally(EV_FLOAT &ev, const int &i,
const int &j, const F_FLOAT &fpair, + const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const +{ + const int EFLAG = eflag; + const int NEWTON_PAIR = newton_pair; + const int VFLAG = vflag_either; + + if (EFLAG) { + if (eflag_atom) { + E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (NEWTON_PAIR || i < nlocal) eatom[i] += epairhalf; + if (NEWTON_PAIR || j < nlocal) eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (vflag_global) { + if (NEIGHFLAG) { + if (NEWTON_PAIR) { + ev.v[0] += v0; + ev.v[1] += v1; + ev.v[2] += v2; + ev.v[3] += v3; + ev.v[4] += v4; + ev.v[5] += v5; + } else { + if (i < nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + if (j < nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + } else { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + + if (vflag_atom) { + if (NEWTON_PAIR || i < nlocal) { + d_vatom(i,0) += 0.5*v0; + d_vatom(i,1) += 0.5*v1; + d_vatom(i,2) += 0.5*v2; + d_vatom(i,3) += 0.5*v3; + d_vatom(i,4) += 0.5*v4; + d_vatom(i,5) += 0.5*v5; + } + if (NEWTON_PAIR || (NEIGHFLAG && j < nlocal)) { + d_vatom(j,0) += 0.5*v0; + d_vatom(j,1) += 0.5*v1; + d_vatom(j,2) += 0.5*v2; + d_vatom(j,3) += 0.5*v3; + d_vatom(j,4) += 0.5*v4; + d_vatom(j,5) += 0.5*v5; + } + } + } +} +*/ +template<class DeviceType> +void PairTableKokkos<DeviceType>::cleanup_copy() { + // WHY needed: this prevents parent copy from deallocating any arrays + allocated = 0; + cutsq = NULL; + eatom = NULL; + vatom = NULL; + h_table=NULL; d_table=NULL; +} + +template class PairTableKokkos<LMPDeviceType>; +#if DEVICE==2 +template class PairTableKokkos<LMPHostType>; +#endif + diff --git a/src/KOKKOS/pair_table_kokkos.h b/src/KOKKOS/pair_table_kokkos.h new file mode 100644 index 000000000..317703c89 --- /dev/null +++ b/src/KOKKOS/pair_table_kokkos.h @@ -0,0 +1,352 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(table/kk,PairTableKokkos<LMPDeviceType>) +PairStyle(table/kk/device,PairTableKokkos<LMPDeviceType>) +PairStyle(table/kk/host,PairTableKokkos<LMPHostType>) + +#else + +#ifndef LMP_PAIR_TABLE_KOKKOS_H +#define LMP_PAIR_TABLE_KOKKOS_H + +#include "pair.h" +#include "pair_kokkos.h" +#include "neigh_list_kokkos.h" +#include "atom_kokkos.h" + +namespace LAMMPS_NS { + +template<class Device,int TABSTYLE> +struct S_TableCompute { + enum {TabStyle = TABSTYLE}; +}; + +template <class DeviceType, int NEIGHFLAG, int TABSTYLE> +class PairTableComputeFunctor; + +template<class DeviceType> +class PairTableKokkos : public Pair { + public: + + enum {COUL_FLAG=0}; + typedef DeviceType device_type; + + PairTableKokkos(class LAMMPS *); + virtual ~PairTableKokkos(); + + virtual void compute(int, int); + + template<int TABSTYLE> + void compute_style(int, int); + + /*template<int EVFLAG, int NEIGHFLAG, int NEWTON_PAIR, int TABSTYLE> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const int& i, + const NeighListKokkos<DeviceType> &list) const; +*/ + void settings(int, char **); + void coeff(int, char **); + double init_one(int, int); + void write_restart(FILE *); + void read_restart(FILE *); + void write_restart_settings(FILE *); + void read_restart_settings(FILE *); + double single(int, int, int, int, double, double, double, double &); + void *extract(const char *, int &); + + void init_style(); + + + protected: + enum{LOOKUP,LINEAR,SPLINE,BITMAP}; + + int tabstyle,tablength; + /*struct TableDeviceConst { + typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq; + typename ArrayTypes<DeviceType>::t_int_2d_randomread tabindex; + typename ArrayTypes<DeviceType>::t_int_1d_randomread nshiftbits,nmask; + typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread innersq,invdelta,deltasq6; + typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2; + };*/ + //Its faster not to use texture fetch if the number of tables is less than 32! 
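+  // NOTE: TableDeviceConst mirrors TableDevice, but only the large 2d
+  // interpolation arrays keep the *_randomread (random-access / texture fetch)
+  // memory trait; the small per-table 1d arrays use plain views, per the
+  // comment above about texture fetch not paying off for few tables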
+ struct TableDeviceConst { + typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq; + typename ArrayTypes<DeviceType>::t_int_2d tabindex; + typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask; + typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6; + typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2; + }; + + struct TableDevice { + typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq; + typename ArrayTypes<DeviceType>::t_int_2d tabindex; + typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask; + typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6; + typename ArrayTypes<DeviceType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2; + }; + + struct TableHost { + typename ArrayTypes<LMPHostType>::t_ffloat_2d cutsq; + typename ArrayTypes<LMPHostType>::t_int_2d tabindex; + typename ArrayTypes<LMPHostType>::t_int_1d nshiftbits,nmask; + typename ArrayTypes<LMPHostType>::t_ffloat_1d innersq,invdelta,deltasq6; + typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2; + }; + + struct Table { + int ninput,rflag,fpflag,match,ntablebits; + int nshiftbits,nmask; + double rlo,rhi,fplo,fphi,cut; + double *rfile,*efile,*ffile; + double *e2file,*f2file; + double innersq,delta,invdelta,deltasq6; + double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2; + }; + int ntables; + Table *tables; + TableDeviceConst d_table_const; + TableDevice* d_table; + TableHost* h_table; + + int **tabindex; + F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1]; + + typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq; + + void allocate(); + void read_table(Table *, char *, char *); + void param_extract(Table *, char *); + void bcast_table(Table *); + void spline_table(Table *); + void compute_table(Table *); + void null_table(Table *); + void free_table(Table *); + void spline(double *, double *, int, double, double, double *); + double splint(double *, double *, double *, int, double); + + typename ArrayTypes<DeviceType>::t_x_array_randomread x; + typename ArrayTypes<DeviceType>::t_x_array_const c_x; + typename ArrayTypes<DeviceType>::t_f_array f; + typename ArrayTypes<DeviceType>::t_int_1d_randomread type; + typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; + typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + + protected: + int nlocal,nall,eflag,vflag,neighflag,newton_pair; + class AtomKokkos *atomKK; + int update_table; + void create_kokkos_tables(); + void cleanup_copy(); + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + return 0; + } + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class 
PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,LOOKUP> >; + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,LINEAR> >; + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,SPLINE> >; + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,BITMAP> >; + friend class 
PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,BITMAP> >; +/*template<int FULL_NEIGH> + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const; +*/ +}; +/* +template <class DeviceType, int NEIGHFLAG, int TABSTYLE> +struct PairTableComputeFunctor { + typedef DeviceType device_type ; + typedef EV_FLOAT value_type; + + PairTableKokkos<DeviceType> c; + NeighListKokkos<DeviceType> list; + + PairTableComputeFunctor(PairTableKokkos<DeviceType>* c_ptr, + NeighListKokkos<DeviceType>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairTableComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (c.newton_pair) c.template compute_item<0,NEIGHFLAG,1,TABSTYLE>(i,list); + else c.template compute_item<0,NEIGHFLAG,0,TABSTYLE>(i,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &energy_virial) const { + if (c.newton_pair) + energy_virial += c.template compute_item<1,NEIGHFLAG,1,TABSTYLE>(i,list); + else + energy_virial += c.template compute_item<1,NEIGHFLAG,0,TABSTYLE>(i,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } +}; + +*/ + + + + + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Pair distance < table inner cutoff + +Two atoms are closer together than the pairwise table allows. + +E: Pair distance > table outer cutoff + +Two atoms are further apart than the pairwise table allows. + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Unknown table style in pair_style command + +Style of table is invalid for use with pair_style table command. + +E: Illegal number of pair table entries + +There must be at least 2 table entries. + +E: Invalid pair table length + +Length of read-in pair table is invalid + +E: Invalid pair table cutoff + +Cutoffs in pair_coeff command are not valid with read-in pair table. + +E: Bitmapped table in file does not match requested table + +Setting for bitmapped table in pair_coeff command must match table +in file exactly. + +E: All pair coeffs are not set + +All pair coefficients must be set in the data file or by the +pair_coeff command before running a simulation. + +E: Cannot open file %s + +The specified file cannot be opened. Check that the path and name are +correct. If the file is a compressed file, also check that the gzip +executable can be found and run. + +E: Did not find keyword in table file + +Keyword used in pair_coeff command was not found in table file. + +E: Bitmapped table is incorrect length in table file + +Number of table entries is not a correct power of 2. + +E: Invalid keyword in pair table parameters + +Keyword used in list of table parameters is not recognized. 
+ +E: Pair table parameters did not set N + +List of pair table parameters must include N setting. + +E: Pair table cutoffs must all be equal to use with KSpace + +When using pair style table with a long-range KSpace solver, the +cutoffs for all atom type pairs must all be the same, since the +long-range solver starts at that cutoff. + +*/ diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp new file mode 100644 index 000000000..2883cb06e --- /dev/null +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -0,0 +1,443 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "string.h" +#include "verlet_kokkos.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "atom.h" +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "dihedral.h" +#include "improper.h" +#include "kspace.h" +#include "output.h" +#include "update.h" +#include "modify.h" +#include "compute.h" +#include "fix.h" +#include "timer.h" +#include "memory.h" +#include "error.h" + +#include <ctime> + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +VerletKokkos::VerletKokkos(LAMMPS *lmp, int narg, char **arg) : + Verlet(lmp, narg, arg) +{ + atomKK = (AtomKokkos *) atom; +} + +/* ---------------------------------------------------------------------- + setup before run +------------------------------------------------------------------------- */ + +void VerletKokkos::setup() +{ + if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n"); + update->setupflag = 1; + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + atomKK->modified(Host,ALL_MASK); + + atomKK->setup(); + modify->setup_pre_exchange(); + // debug + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + if (triclinic) domain->x2lamda(atomKK->nlocal); + domain->pbc(); + + atomKK->sync(Host,ALL_MASK); + + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + if (atomKK->sortfreq > 0) atomKK->sort(); + comm->borders(); + if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); + + atomKK->sync(Host,ALL_MASK); + + domain->image_check(); + domain->box_too_small_check(); + modify->setup_pre_neighbor(); + + atomKK->modified(Host,ALL_MASK); + + neighbor->build(); + neighbor->ncalls = 0; + + // compute all forces + + ev_set(update->ntimestep); + force_clear(); + modify->setup_pre_force(vflag); + + if (pair_compute_flag) force->pair->compute(eflag,vflag); + else if (force->pair) force->pair->compute_dummy(eflag,vflag); + + if (atomKK->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + if (force->kspace) { + 
force->kspace->setup(); + if (kspace_compute_flag) force->kspace->compute(eflag,vflag); + else force->kspace->compute_dummy(eflag,vflag); + } + + if (force->newton) comm->reverse_comm(); + + modify->setup(vflag); + output->setup(); + update->setupflag = 0; +} + +/* ---------------------------------------------------------------------- + setup without output + flag = 0 = just force calculation + flag = 1 = reneighbor and force calculation +------------------------------------------------------------------------- */ + +void VerletKokkos::setup_minimal(int flag) +{ + update->setupflag = 1; + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + if (flag) { + atomKK->modified(Host,ALL_MASK); + + modify->setup_pre_exchange(); + // debug + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + + if (triclinic) domain->x2lamda(atomKK->nlocal); + domain->pbc(); + + atomKK->sync(Host,ALL_MASK); + + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + comm->borders(); + if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); + + atomKK->sync(Host,ALL_MASK); + + domain->image_check(); + domain->box_too_small_check(); + modify->setup_pre_neighbor(); + + atomKK->modified(Host,ALL_MASK); + + neighbor->build(); + neighbor->ncalls = 0; + } + + // compute all forces + + ev_set(update->ntimestep); + force_clear(); + modify->setup_pre_force(vflag); + + if (pair_compute_flag) force->pair->compute(eflag,vflag); + else if (force->pair) force->pair->compute_dummy(eflag,vflag); + + if (atomKK->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + if (force->kspace) { + force->kspace->setup(); + if (kspace_compute_flag) force->kspace->compute(eflag,vflag); + else force->kspace->compute_dummy(eflag,vflag); + } + + if (force->newton) comm->reverse_comm(); + + modify->setup(vflag); + update->setupflag = 0; +} + +/* ---------------------------------------------------------------------- + run for N steps +------------------------------------------------------------------------- */ + +void VerletKokkos::run(int n) +{ + bigint ntimestep; + int nflag,sortflag; + + int n_post_integrate = modify->n_post_integrate; + int n_pre_exchange = modify->n_pre_exchange; + int n_pre_neighbor = modify->n_pre_neighbor; + int n_pre_force = modify->n_pre_force; + int n_post_force = modify->n_post_force; + int n_end_of_step = modify->n_end_of_step; + + if (atomKK->sortfreq > 0) sortflag = 1; + else sortflag = 0; + + static double time = 0.0; + static int count = 0; + atomKK->sync(Device,ALL_MASK); + Kokkos::Impl::Timer ktimer; + + for (int i = 0; i < n; i++) { + + ntimestep = ++update->ntimestep; + ev_set(ntimestep); + + // initial time integration + + ktimer.reset(); + modify->initial_integrate(vflag); + time += ktimer.seconds(); + if (n_post_integrate) modify->post_integrate(); + + // regular communication vs neighbor list rebuild + + nflag = neighbor->decide(); + + if (nflag == 0) { + timer->stamp(); + comm->forward_comm(); + timer->stamp(TIME_COMM); + } else { + // added debug + //atomKK->sync(Host,ALL_MASK); + //atomKK->modified(Host,ALL_MASK); + + if (n_pre_exchange) modify->pre_exchange(); + // debug + //atomKK->sync(Host,ALL_MASK); + //atomKK->modified(Host,ALL_MASK); + if (triclinic) domain->x2lamda(atomKK->nlocal); + 
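+      // remap atoms into the periodic box; if the box changed, rebuild the
+      // comm setup and neighbor bins before exchanging atoms and ghosts below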
domain->pbc(); + if (domain->box_change) { + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + } + timer->stamp(); + + // added debug + //atomKK->sync(Device,ALL_MASK); + //atomKK->modified(Device,ALL_MASK); + + comm->exchange(); + if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort(); + comm->borders(); + + // added debug + //atomKK->sync(Host,ALL_MASK); + //atomKK->modified(Host,ALL_MASK); + + if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); + + timer->stamp(TIME_COMM); + if (n_pre_neighbor) modify->pre_neighbor(); + neighbor->build(); + timer->stamp(TIME_NEIGHBOR); + } + + // force computations + // important for pair to come before bonded contributions + // since some bonded potentials tally pairwise energy/virial + // and Pair:ev_tally() needs to be called before any tallying + + force_clear(); + // added for debug + //atomKK->k_x.sync<LMPHostType>(); + //atomKK->k_f.sync<LMPHostType>(); + //atomKK->k_f.modify<LMPHostType>(); + if (n_pre_force) modify->pre_force(vflag); + + timer->stamp(); + + if (pair_compute_flag) { + force->pair->compute(eflag,vflag); + timer->stamp(TIME_PAIR); + } + + if (atomKK->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + timer->stamp(TIME_BOND); + } + + if (kspace_compute_flag) { + force->kspace->compute(eflag,vflag); + timer->stamp(TIME_KSPACE); + } + + // reverse communication of forces + + if (force->newton) { + atomKK->sync(Host,F_MASK); + comm->reverse_comm(); + atomKK->modified(Host,F_MASK); + timer->stamp(TIME_COMM); + } + + // force modifications, final time integration, diagnostics + + ktimer.reset(); + + if (n_post_force) modify->post_force(vflag); + modify->final_integrate(); + if (n_end_of_step) modify->end_of_step(); + + time += ktimer.seconds(); + + // all output + + if (ntimestep == output->next) { + atomKK->sync(Host,ALL_MASK); + + timer->stamp(); + output->write(ntimestep); + timer->stamp(TIME_OUTPUT); + } + } +} + +/* ---------------------------------------------------------------------- + clear force on own & ghost atoms + clear other arrays as needed +------------------------------------------------------------------------- */ + +void VerletKokkos::force_clear() +{ + int i; + + if (external_force_clear) return; + + // clear force on all particles + // if either newton flag is set, also include ghosts + // when using threads always clear all forces. 
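+  // the force view is zeroed via memset_kokkos() on whichever side (host or
+  // device) holds the newer copy and is flagged as modified there, so no
+  // host/device transfer is forced just to clear forces; the remaining
+  // per-atom arrays (torque, erforce, de, drho) are still cleared on the host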
+ + if (neighbor->includegroup == 0) { + int nall; + if (force->newton) nall = atomKK->nlocal + atomKK->nghost; + else nall = atomKK->nlocal; + + size_t nbytes = sizeof(double) * nall; + + if (nbytes) { + if (atomKK->k_f.modified_host > atomKK->k_f.modified_device) { + memset_kokkos(atomKK->k_f.view<LMPHostType>()); + atomKK->modified(Host,F_MASK); + } else { + memset_kokkos(atomKK->k_f.view<LMPDeviceType>()); + atomKK->modified(Device,F_MASK); + } + if (torqueflag) memset(&(atomKK->torque[0][0]),0,3*nbytes); + if (erforceflag) memset(&(atomKK->erforce[0]), 0, nbytes); + if (e_flag) memset(&(atomKK->de[0]), 0, nbytes); + if (rho_flag) memset(&(atomKK->drho[0]), 0, nbytes); + } + + // neighbor includegroup flag is set + // clear force only on initial nfirst particles + // if either newton flag is set, also include ghosts + + } else { + int nall = atomKK->nfirst; + if (atomKK->k_f.modified_host > atomKK->k_f.modified_device) { + memset_kokkos(atomKK->k_f.view<LMPHostType>()); + atomKK->modified(Host,F_MASK); + } else { + memset_kokkos(atomKK->k_f.view<LMPDeviceType>()); + atomKK->modified(Device,F_MASK); + } + if (torqueflag) { + double **torque = atomKK->torque; + for (i = 0; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (erforceflag) { + double *erforce = atomKK->erforce; + for (i = 0; i < nall; i++) erforce[i] = 0.0; + } + + if (e_flag) { + double *de = atomKK->de; + for (i = 0; i < nall; i++) de[i] = 0.0; + } + + if (rho_flag) { + double *drho = atomKK->drho; + for (i = 0; i < nall; i++) drho[i] = 0.0; + } + + if (force->newton) { + nall = atomKK->nlocal + atomKK->nghost; + + if (torqueflag) { + double **torque = atomKK->torque; + for (i = atomKK->nlocal; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (erforceflag) { + double *erforce = atomKK->erforce; + for (i = atomKK->nlocal; i < nall; i++) erforce[i] = 0.0; + } + + if (e_flag) { + double *de = atomKK->de; + for (i = 0; i < nall; i++) de[i] = 0.0; + } + + if (rho_flag) { + double *drho = atomKK->drho; + for (i = 0; i < nall; i++) drho[i] = 0.0; + } + } + } +} diff --git a/src/KOKKOS/verlet_kokkos.h b/src/KOKKOS/verlet_kokkos.h new file mode 100644 index 000000000..63531bda2 --- /dev/null +++ b/src/KOKKOS/verlet_kokkos.h @@ -0,0 +1,48 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef INTEGRATE_CLASS + +IntegrateStyle(verlet/kk,VerletKokkos) + +#else + +#ifndef LMP_VERLET_KOKKOS_H +#define LMP_VERLET_KOKKOS_H + +#include "verlet.h" + +namespace LAMMPS_NS { + +class VerletKokkos : public Verlet { + public: + VerletKokkos(class LAMMPS *, int, char **); + ~VerletKokkos() {} + void setup(); + void setup_minimal(int); + void run(int); + + protected: + class AtomKokkos *atomKK; + + void force_clear(); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/MAKE/Makefile.cuda b/src/MAKE/Makefile.cuda new file mode 100755 index 000000000..61b1738ba --- /dev/null +++ b/src/MAKE/Makefile.cuda @@ -0,0 +1,111 @@ +# cuda = RedHat Linux box, nvcc for Kokkos, MPICH2, FFTW + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = nvcc +CCFLAGS = -g -O3 -arch=sm_20 +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = g++ +LINKFLAGS = -g -O +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX +MPI_PATH = +MPI_LIB = -lmpich -lmpl -lpthread + +# FFT library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 6) +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_FFTW +FFT_PATH = +FFT_LIB = -lfftw + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
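+# typical usage (illustrative; the MPI/FFT/JPEG settings above must match the
+# local installation): from the src directory,
+#
+#   make yes-kokkos
+#   make cuda
+#
+# installs the KOKKOS package and builds lmp_cuda with this makefile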
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cu + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/Makefile b/src/Makefile index 8241135cc..f8e70a94d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,240 +1,240 @@ # LAMMPS multiple-machine Makefile SHELL = /bin/bash #.IGNORE: # Definitions ROOT = lmp EXE = $(ROOT)_$@ SRC = $(wildcard *.cpp) INC = $(wildcard *.h) OBJ = $(SRC:.cpp=.o) # Package variables PACKAGE = asphere body class2 colloid dipole fld gpu granular kim \ - kspace manybody mc meam misc molecule mpiio opt peri poems \ + kokkos kspace manybody mc meam misc molecule mpiio opt peri poems \ reax replica rigid shock srd voronoi xtc PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \ user-cuda user-eff user-fep user-lb user-misc user-molfile \ user-omp user-phonon user-qmmm user-reaxc user-sph PACKLIB = gpu kim meam poems reax voronoi \ user-atc user-awpmd user-colvars user-qmmm user-cuda user-molfile PACKALL = $(PACKAGE) $(PACKUSER) PACKAGEUC = $(shell echo $(PACKAGE) | tr a-z A-Z) PACKUSERUC = $(shell echo $(PACKUSER) | tr a-z A-Z) YESDIR = $(shell echo $(@:yes-%=%) | tr a-z A-Z) NODIR = $(shell echo $(@:no-%=%) | tr a-z A-Z) # List of all targets help: @echo '' @echo 'make clean-all delete all object files' @echo 'make clean-machine delete object files for one machine' @echo 'make purge purge obsolete copies of package sources' @echo 'make tar create lmp_src.tar.gz of src dir and packages' @echo 'make makelib create Makefile.lib for static library build' @echo 'make makeshlib create Makefile.shlib for shared library build' @echo 'make makelist create Makefile.list used by old makes' @echo 'make -f Makefile.lib machine build LAMMPS as static library for machine' @echo 'make -f Makefile.shlib machine build LAMMPS as shared library for machine' @echo 'make -f Makefile.list machine build LAMMPS from explicit list of files' @echo 'make stubs build dummy MPI library in STUBS' @echo 'make install-python install LAMMPS wrapper in Python' @echo '' @echo 'make package list available packages' @echo 'make package-status (ps) status of all packages' @echo 'make yes-package install a single package in src dir' @echo 'make no-package remove a single package from src dir' @echo 'make yes-all install all packages in src dir' @echo 'make no-all remove all packages from src dir' @echo 'make yes-standard install all standard packages' @echo 'make no-standard remove all standard packages' @echo 'make yes-user install all user packages' @echo 'make no-user remove all user packages' @echo 'make no-lib remove all packages with external libs' @echo '' @echo 'make package-update (pu) replace src files with updated package files' @echo 'make package-overwrite replace package files with src files' @echo 'make package-diff (pd) diff src files against package files' @echo '' @echo 'make machine build LAMMPS where machine is one of:' @echo '' @files="`ls MAKE/Makefile.*`"; \ for file in $$files; do head -1 $$file; done @echo '' # Build the code .DEFAULT: @test -f MAKE/Makefile.$@ @if [ ! 
-d Obj_$@ ]; then mkdir Obj_$@; fi @$(SHELL) Make.sh style @cp MAKE/Makefile.$@ Obj_$@/Makefile @if [ ! -e Makefile.package ]; \ then cp Makefile.package.empty Makefile.package; fi @if [ ! -e Makefile.package.settings ]; \ then cp Makefile.package.settings.empty Makefile.package.settings; fi @cp Makefile.package Makefile.package.settings Obj_$@ @cd Obj_$@; \ $(MAKE) $(MFLAGS) "OBJ = $(OBJ)" "INC = $(INC)" "SHFLAGS =" \ "EXE = ../$(EXE)" ../$(EXE) # Remove machine-specific object files clean: @echo 'make clean-all delete all object files' @echo 'make clean-machine delete object files for one machine' clean-all: rm -rf Obj_* clean-%: rm -rf Obj_$(@:clean-%=%) purge: Purge.list @echo 'Purging obsolete and auto-generated source files' @for f in `grep -v '#' Purge.list` ; \ do test -f $$f && rm $$f && echo $$f || : ; \ done # Create a tarball of src dir and packages tar: @cd STUBS; $(MAKE) clean @cd ..; tar cvzf src/$(ROOT)_src.tar.gz \ src/Make* src/Package.sh src/MAKE src/*.cpp src/*.h src/STUBS \ $(patsubst %,src/%,$(PACKAGEUC)) $(patsubst %,src/%,$(PACKUSERUC)) \ --exclude=*/.svn @cd STUBS; $(MAKE) @echo "Created $(ROOT)_src.tar.gz" # Make MPI STUBS library stubs: @cd STUBS; $(MAKE) clean; $(MAKE) # Create Makefile.lib, Makefile.shlib, and Makefile.list makelib: @$(SHELL) Make.sh style @$(SHELL) Make.sh Makefile.lib makeshlib: @$(SHELL) Make.sh style @$(SHELL) Make.sh Makefile.shlib makelist: @$(SHELL) Make.sh style @$(SHELL) Make.sh Makefile.list # install LAMMPS shared lib and Python wrapper for Python usage install-python: @python ../python/install.py # Package management package: @echo 'Standard packages:' $(PACKAGE) @echo '' @echo 'User-contributed packages:' $(PACKUSER) @echo '' @echo 'make package list available packages' @echo 'make package-status (ps) status of all packages' @echo 'make yes-package install a single package in src dir' @echo 'make no-package remove a single package from src dir' @echo 'make yes-all install all packages in src dir' @echo 'make no-all remove all packages from src dir' @echo 'make yes-standard install all standard packages' @echo 'make no-standard remove all standard packages' @echo 'make yes-user install all user packages' @echo 'make no-user remove all user packages' @echo 'make no-lib remove all packages with external libs' @echo '' @echo 'make package-update (pu) replace src files with package files' @echo 'make package-overwrite replace package files with src files' @echo 'make package-diff (pd) diff src files against package file' yes-all: @for p in $(PACKALL); do $(MAKE) yes-$$p; done no-all: @for p in $(PACKALL); do $(MAKE) no-$$p; done yes-standard: @for p in $(PACKAGE); do $(MAKE) yes-$$p; done no-standard: @for p in $(PACKAGE); do $(MAKE) no-$$p; done yes-user: @for p in $(PACKUSER); do $(MAKE) yes-$$p; done no-user: @for p in $(PACKUSER); do $(MAKE) no-$$p; done no-lib: @for p in $(PACKLIB); do $(MAKE) no-$$p; done yes-%: @if [ ! -e Makefile.package ]; \ then cp Makefile.package.empty Makefile.package; fi @if [ ! -e Makefile.package.settings ]; \ then cp Makefile.package.settings.empty Makefile.package.settings; fi @if [ ! -e $(YESDIR) ]; then \ echo "Package $(@:yes-%=%) does not exist"; \ elif [ -e $(YESDIR)/Install.sh ]; then \ echo "Installing package $(@:yes-%=%)"; \ cd $(YESDIR); $(SHELL) Install.sh 1; cd ..; \ $(SHELL) Depend.sh $(YESDIR) 1; \ else \ echo "Installing package $(@:yes-%=%)"; \ cd $(YESDIR); $(SHELL) ../Install.sh 1; cd ..; \ $(SHELL) Depend.sh $(YESDIR) 1; \ fi; no-%: @if [ ! 
-e $(NODIR) ]; then \ echo "Package $(@:no-%=%) does not exist"; \ elif [ -e $(NODIR)/Install.sh ]; then \ echo "Uninstalling package $(@:no-%=%)"; \ cd $(NODIR); $(SHELL) Install.sh 0; cd ..; \ $(SHELL) Depend.sh $(NODIR) 0; \ else \ echo "Uninstalling package $(@:no-%=%)"; \ cd $(NODIR); $(SHELL) ../Install.sh 0; cd ..; \ $(SHELL) Depend.sh $(NODIR) 0; \ fi; # status = list src files that differ from package files # update = replace src files with newer package files # overwrite = overwrite package files with newer src files # diff = show differences between src and package files package-status ps: @for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p status; done @echo '' @for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p status; done package-update pu: @for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p update; done @echo '' @for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p update; done package-overwrite: @for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p overwrite; done @echo '' @for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p overwrite; done package-diff pd: @for p in $(PACKAGEUC); do $(SHELL) Package.sh $$p diff; done @echo '' @for p in $(PACKUSERUC); do $(SHELL) Package.sh $$p diff; done diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 0fd8043d8..06c61caa2 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -1,382 +1,384 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ #include "string.h" #include "stdlib.h" #include "atom_vec.h" #include "atom.h" #include "force.h" #include "domain.h" #include "error.h" using namespace LAMMPS_NS; #define DELTA 16384 #define DELTA_BONUS 8192 /* ---------------------------------------------------------------------- */ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) { nmax = 0; bonds_allow = angles_allow = dihedrals_allow = impropers_allow = 0; mass_type = dipole_type = 0; size_data_bonus = 0; cudable = kokkosable = 0; nargcopy = 0; argcopy = NULL; } /* ---------------------------------------------------------------------- */ AtomVec::~AtomVec() { for (int i = 0; i < nargcopy; i++) delete [] argcopy[i]; delete [] argcopy; } /* ---------------------------------------------------------------------- make copy of args for use by restart & replicate ------------------------------------------------------------------------- */ void AtomVec::store_args(int narg, char **arg) { nargcopy = narg; argcopy = new char*[nargcopy]; for (int i = 0; i < nargcopy; i++) { int n = strlen(arg[i]) + 1; argcopy[i] = new char[n]; strcpy(argcopy[i],arg[i]); } } /* ---------------------------------------------------------------------- no additional args by default ------------------------------------------------------------------------- */ void AtomVec::process_args(int narg, char **arg) { if (narg) error->all(FLERR,"Invalid atom_style command"); } /* ---------------------------------------------------------------------- copy of velocity remap settings from Domain ------------------------------------------------------------------------- */ void AtomVec::init() { deform_vremap = domain->deform_vremap; deform_groupbit = domain->deform_groupbit; h_rate = domain->h_rate; - if (lmp->cuda != NULL && cudable == false) + if (lmp->cuda != NULL && !cudable) error->all(FLERR,"USER-CUDA package requires a cuda enabled atom_style"); + if (lmp->kokkos != NULL && !kokkosable) + error->all(FLERR,"KOKKOS package requires a kokkos enabled atom_style"); } /* ---------------------------------------------------------------------- grow nmax so it is a multiple of DELTA ------------------------------------------------------------------------- */ void AtomVec::grow_nmax() { nmax = nmax/DELTA * DELTA; nmax += DELTA; } /* ---------------------------------------------------------------------- grow nmax_bonus so it is a multiple of DELTA_BONUS ------------------------------------------------------------------------- */ int AtomVec::grow_nmax_bonus(int nmax_bonus) { nmax_bonus = nmax_bonus/DELTA_BONUS * DELTA_BONUS; nmax_bonus += DELTA_BONUS; return nmax_bonus; } /* ---------------------------------------------------------------------- unpack one line from Velocities section of data file ------------------------------------------------------------------------- */ void AtomVec::data_vel(int m, char **values) { double **v = atom->v; v[m][0] = atof(values[0]); v[m][1] = atof(values[1]); v[m][2] = atof(values[2]); } /* ---------------------------------------------------------------------- pack velocity info for data file ------------------------------------------------------------------------- */ void AtomVec::pack_vel(double **buf) { double **v = atom->v; tagint *tag = atom->tag; int nlocal = atom->nlocal; for (int i = 0; i < nlocal; i++) { buf[i][0] = ubuf(tag[i]).d; buf[i][1] = v[i][0]; buf[i][2] = v[i][1]; buf[i][3] = v[i][2]; } } /* ---------------------------------------------------------------------- 
write velocity info to data file ------------------------------------------------------------------------- */ void AtomVec::write_vel(FILE *fp, int n, double **buf) { for (int i = 0; i < n; i++) fprintf(fp,TAGINT_FORMAT " %-1.16e %-1.16e %-1.16e\n", (tagint) ubuf(buf[i][0]).i,buf[i][1],buf[i][2],buf[i][3]); } /* ---------------------------------------------------------------------- pack bond info for data file into buf if non-NULL return count of bonds from this proc do not count/pack bonds with bondtype = 0 if bondtype is negative, flip back to positive ------------------------------------------------------------------------- */ int AtomVec::pack_bond(tagint **buf) { tagint *tag = atom->tag; int *num_bond = atom->num_bond; int **bond_type = atom->bond_type; tagint **bond_atom = atom->bond_atom; int nlocal = atom->nlocal; int newton_bond = force->newton_bond; int i,j; int m = 0; if (newton_bond) { for (i = 0; i < nlocal; i++) for (j = 0; j < num_bond[i]; j++) { if (bond_type[i][j] == 0) continue; if (buf) { buf[m][0] = MAX(bond_type[i][j],-bond_type[i][j]); buf[m][1] = tag[i]; buf[m][2] = bond_atom[i][j]; } m++; } } else { for (i = 0; i < nlocal; i++) for (j = 0; j < num_bond[i]; j++) if (tag[i] < bond_atom[i][j]) { if (bond_type[i][j] == 0) continue; if (buf) { buf[m][0] = MAX(bond_type[i][j],-bond_type[i][j]); buf[m][1] = tag[i]; buf[m][2] = bond_atom[i][j]; } m++; } } return m; } /* ---------------------------------------------------------------------- write bond info to data file ------------------------------------------------------------------------- */ void AtomVec::write_bond(FILE *fp, int n, tagint **buf, int index) { for (int i = 0; i < n; i++) { fprintf(fp,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT "\n", index,buf[i][0],buf[i][1],buf[i][2]); index++; } } /* ---------------------------------------------------------------------- pack angle info for data file into buf if non-NULL return count of angles from this proc do not count/pack angles with angletype = 0 if angletype is negative, flip back to positive ------------------------------------------------------------------------- */ int AtomVec::pack_angle(tagint **buf) { tagint *tag = atom->tag; int *num_angle = atom->num_angle; int **angle_type = atom->angle_type; tagint **angle_atom1 = atom->angle_atom1; tagint **angle_atom2 = atom->angle_atom2; tagint **angle_atom3 = atom->angle_atom3; int nlocal = atom->nlocal; int newton_bond = force->newton_bond; int i,j; int m = 0; if (newton_bond) { for (i = 0; i < nlocal; i++) for (j = 0; j < num_angle[i]; j++) { if (angle_type[i][j] == 0) continue; if (buf) { buf[m][0] = MAX(angle_type[i][j],-angle_type[i][j]); buf[m][1] = angle_atom1[i][j]; buf[m][2] = angle_atom2[i][j]; buf[m][3] = angle_atom3[i][j]; } m++; } } else { for (i = 0; i < nlocal; i++) for (j = 0; j < num_angle[i]; j++) if (tag[i] == angle_atom2[i][j]) { if (angle_type[i][j] == 0) continue; if (buf) { buf[m][0] = MAX(angle_type[i][j],-angle_type[i][j]); buf[m][1] = angle_atom1[i][j]; buf[m][2] = angle_atom2[i][j]; buf[m][3] = angle_atom3[i][j]; } m++; } } return m; } /* ---------------------------------------------------------------------- write angle info to data file ------------------------------------------------------------------------- */ void AtomVec::write_angle(FILE *fp, int n, tagint **buf, int index) { for (int i = 0; i < n; i++) { fprintf(fp,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT "\n", index,buf[i][0],buf[i][1],buf[i][2],buf[i][3]); index++; } } /* 
---------------------------------------------------------------------- pack dihedral info for data file ------------------------------------------------------------------------- */ void AtomVec::pack_dihedral(tagint **buf) { tagint *tag = atom->tag; int *num_dihedral = atom->num_dihedral; int **dihedral_type = atom->dihedral_type; tagint **dihedral_atom1 = atom->dihedral_atom1; tagint **dihedral_atom2 = atom->dihedral_atom2; tagint **dihedral_atom3 = atom->dihedral_atom3; tagint **dihedral_atom4 = atom->dihedral_atom4; int nlocal = atom->nlocal; int newton_bond = force->newton_bond; int i,j; int m = 0; if (newton_bond) { for (i = 0; i < nlocal; i++) for (j = 0; j < num_dihedral[i]; j++) { buf[m][0] = dihedral_type[i][j]; buf[m][1] = dihedral_atom1[i][j]; buf[m][2] = dihedral_atom2[i][j]; buf[m][3] = dihedral_atom3[i][j]; buf[m][4] = dihedral_atom4[i][j]; m++; } } else { for (i = 0; i < nlocal; i++) for (j = 0; j < num_dihedral[i]; j++) if (tag[i] == dihedral_atom2[i][j]) { buf[m][0] = dihedral_type[i][j]; buf[m][1] = dihedral_atom1[i][j]; buf[m][2] = dihedral_atom2[i][j]; buf[m][3] = dihedral_atom3[i][j]; buf[m][4] = dihedral_atom4[i][j]; m++; } } } /* ---------------------------------------------------------------------- write dihedral info to data file ------------------------------------------------------------------------- */ void AtomVec::write_dihedral(FILE *fp, int n, tagint **buf, int index) { for (int i = 0; i < n; i++) { fprintf(fp,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT "\n", index,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]); index++; } } /* ---------------------------------------------------------------------- pack improper info for data file ------------------------------------------------------------------------- */ void AtomVec::pack_improper(tagint **buf) { tagint *tag = atom->tag; int *num_improper = atom->num_improper; int **improper_type = atom->improper_type; tagint **improper_atom1 = atom->improper_atom1; tagint **improper_atom2 = atom->improper_atom2; tagint **improper_atom3 = atom->improper_atom3; tagint **improper_atom4 = atom->improper_atom4; int nlocal = atom->nlocal; int newton_bond = force->newton_bond; int i,j; int m = 0; if (newton_bond) { for (i = 0; i < nlocal; i++) for (j = 0; j < num_improper[i]; j++) { buf[m][0] = improper_type[i][j]; buf[m][1] = improper_atom1[i][j]; buf[m][2] = improper_atom2[i][j]; buf[m][3] = improper_atom3[i][j]; buf[m][4] = improper_atom4[i][j]; m++; } } else { for (i = 0; i < nlocal; i++) for (j = 0; j < num_improper[i]; j++) if (tag[i] == improper_atom2[i][j]) { buf[m][0] = improper_type[i][j]; buf[m][1] = improper_atom1[i][j]; buf[m][2] = improper_atom2[i][j]; buf[m][3] = improper_atom3[i][j]; buf[m][4] = improper_atom4[i][j]; m++; } } } /* ---------------------------------------------------------------------- write improper info to data file ------------------------------------------------------------------------- */ void AtomVec::write_improper(FILE *fp, int n, tagint **buf, int index) { for (int i = 0; i < n; i++) { fprintf(fp,"%d " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT "\n", index,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]); index++; } } diff --git a/src/compute_property_local.cpp b/src/compute_property_local.cpp index da34de08a..82d85d4be 100644 --- a/src/compute_property_local.cpp +++ b/src/compute_property_local.cpp @@ -1,915 +1,915 @@ /* ---------------------------------------------------------------------- 
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ #include "string.h" #include "compute_property_local.h" #include "atom.h" #include "atom_vec.h" #include "update.h" #include "force.h" #include "pair.h" #include "neighbor.h" #include "neigh_request.h" #include "neigh_list.h" #include "memory.h" #include "error.h" using namespace LAMMPS_NS; enum{NONE,NEIGH,PAIR,BOND,ANGLE,DIHEDRAL,IMPROPER}; #define DELTA 10000 /* ---------------------------------------------------------------------- */ ComputePropertyLocal::ComputePropertyLocal(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg) { if (narg < 4) error->all(FLERR,"Illegal compute property/local command"); local_flag = 1; nvalues = narg - 3; if (nvalues == 1) size_local_cols = 0; else size_local_cols = nvalues; pack_choice = new FnPtrPack[nvalues]; kindflag = NONE; int i; for (int iarg = 3; iarg < narg; iarg++) { i = iarg-3; if (strcmp(arg[iarg],"natom1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_patom1; if (kindflag != NONE && kindflag != NEIGH) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = NEIGH; } else if (strcmp(arg[iarg],"natom2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_patom2; if (kindflag != NONE && kindflag != NEIGH) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = NEIGH; } else if (strcmp(arg[iarg],"ntype1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_ptype1; if (kindflag != NONE && kindflag != NEIGH) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = NEIGH; } else if (strcmp(arg[iarg],"ntype2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_ptype2; if (kindflag != NONE && kindflag != NEIGH) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = NEIGH; } else if (strcmp(arg[iarg],"patom1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_patom1; if (kindflag != NONE && kindflag != PAIR) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = PAIR; } else if (strcmp(arg[iarg],"patom2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_patom2; if (kindflag != NONE && kindflag != PAIR) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = PAIR; } else if (strcmp(arg[iarg],"ptype1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_ptype1; if (kindflag != NONE && kindflag != PAIR) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = PAIR; } else if (strcmp(arg[iarg],"ptype2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_ptype2; if (kindflag != NONE && kindflag != PAIR) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = PAIR; } else if (strcmp(arg[iarg],"batom1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_batom1; if (kindflag != NONE && kindflag != BOND) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = BOND; } else if (strcmp(arg[iarg],"batom2") == 0) { pack_choice[i] 
= &ComputePropertyLocal::pack_batom2; if (kindflag != NONE && kindflag != BOND) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = BOND; } else if (strcmp(arg[iarg],"btype") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_btype; if (kindflag != NONE && kindflag != BOND) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = BOND; } else if (strcmp(arg[iarg],"aatom1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_aatom1; if (kindflag != NONE && kindflag != ANGLE) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = ANGLE; } else if (strcmp(arg[iarg],"aatom2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_aatom2; if (kindflag != NONE && kindflag != ANGLE) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = ANGLE; } else if (strcmp(arg[iarg],"aatom3") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_aatom3; if (kindflag != NONE && kindflag != ANGLE) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = ANGLE; } else if (strcmp(arg[iarg],"atype") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_atype; if (kindflag != NONE && kindflag != ANGLE) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = ANGLE; } else if (strcmp(arg[iarg],"datom1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_datom1; if (kindflag != NONE && kindflag != DIHEDRAL) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = DIHEDRAL; } else if (strcmp(arg[iarg],"datom2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_datom2; if (kindflag != NONE && kindflag != DIHEDRAL) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = DIHEDRAL; } else if (strcmp(arg[iarg],"datom3") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_datom3; if (kindflag != NONE && kindflag != DIHEDRAL) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = DIHEDRAL; } else if (strcmp(arg[iarg],"datom4") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_datom4; if (kindflag != NONE && kindflag != DIHEDRAL) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = DIHEDRAL; } else if (strcmp(arg[iarg],"dtype") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_dtype; if (kindflag != NONE && kindflag != DIHEDRAL) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = DIHEDRAL; } else if (strcmp(arg[iarg],"iatom1") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_iatom1; if (kindflag != NONE && kindflag != IMPROPER) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = IMPROPER; } else if (strcmp(arg[iarg],"iatom2") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_iatom2; if (kindflag != NONE && kindflag != IMPROPER) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = IMPROPER; } else if (strcmp(arg[iarg],"iatom3") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_iatom3; if (kindflag != NONE && kindflag != IMPROPER) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = IMPROPER; } else if (strcmp(arg[iarg],"iatom4") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_iatom4; if (kindflag != NONE && kindflag != IMPROPER) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = IMPROPER; } 
else if (strcmp(arg[iarg],"itype") == 0) { pack_choice[i] = &ComputePropertyLocal::pack_itype; if (kindflag != NONE && kindflag != IMPROPER) error->all(FLERR, "Compute property/local cannot use these inputs together"); kindflag = IMPROPER; } else error->all(FLERR, "Invalid keyword in compute property/local command"); } // error check if (atom->molecular == 2 && (kindflag == BOND || kindflag == ANGLE || kindflag == DIHEDRAL || kindflag == IMPROPER)) error->all(FLERR,"Compute property/local does not (yet) work " "with atom_style template"); if (kindflag == BOND && atom->avec->bonds_allow == 0) error->all(FLERR, "Compute property/local for property that isn't allocated"); if (kindflag == ANGLE && atom->avec->angles_allow == 0) error->all(FLERR, "Compute property/local for property that isn't allocated"); if (kindflag == DIHEDRAL && atom->avec->dihedrals_allow == 0) error->all(FLERR, "Compute property/local for property that isn't allocated"); if (kindflag == IMPROPER && atom->avec->impropers_allow == 0) error->all(FLERR, "Compute property/local for property that isn't allocated"); nmax = 0; vector = NULL; array = NULL; indices = NULL; } /* ---------------------------------------------------------------------- */ ComputePropertyLocal::~ComputePropertyLocal() { delete [] pack_choice; memory->destroy(vector); memory->destroy(array); memory->destroy(indices); } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::init() { if (kindflag == NEIGH || kindflag == PAIR) { if (force->pair == NULL) error->all(FLERR,"No pair style is defined for compute property/local"); if (force->pair->single_enable == 0) error->all(FLERR,"Pair style does not support compute property/local"); } // for NEIGH/PAIR need an occasional half neighbor list if (kindflag == NEIGH || kindflag == PAIR) { int irequest = neighbor->request((void *) this); neighbor->requests[irequest]->pair = 0; neighbor->requests[irequest]->compute = 1; neighbor->requests[irequest]->occasional = 1; } // do initial memory allocation so that memory_usage() is correct // cannot be done yet for NEIGH/PAIR, since neigh list does not exist if (kindflag == NEIGH) ncount = 0; else if (kindflag == PAIR) ncount = 0; else if (kindflag == BOND) ncount = count_bonds(0); else if (kindflag == ANGLE) ncount = count_angles(0); else if (kindflag == DIHEDRAL) ncount = count_dihedrals(0); else if (kindflag == IMPROPER) ncount = count_impropers(0); if (ncount > nmax) reallocate(ncount); size_local_rows = ncount; } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::init_list(int id, NeighList *ptr) { list = ptr; } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::compute_local() { invoked_local = update->ntimestep; // count local entries and generate list of indices if (kindflag == NEIGH) ncount = count_pairs(0,0); else if (kindflag == PAIR) ncount = count_pairs(0,1); else if (kindflag == BOND) ncount = count_bonds(0); else if (kindflag == ANGLE) ncount = count_angles(0); else if (kindflag == DIHEDRAL) ncount = count_dihedrals(0); else if (kindflag == IMPROPER) ncount = count_impropers(0); if (ncount > nmax) reallocate(ncount); size_local_rows = ncount; if (kindflag == NEIGH) ncount = count_pairs(1,0); else if (kindflag == PAIR) ncount = count_pairs(1,1); else if (kindflag == BOND) ncount = count_bonds(1); else if (kindflag == ANGLE) ncount = count_angles(1); else if (kindflag == DIHEDRAL) ncount = 
count_dihedrals(1); else if (kindflag == IMPROPER) ncount = count_impropers(1); // fill vector or array with local values if (nvalues == 1) { buf = vector; (this->*pack_choice[0])(0); } else { if (array) buf = &array[0][0]; for (int n = 0; n < nvalues; n++) (this->*pack_choice[n])(n); } } /* ---------------------------------------------------------------------- count pairs and compute pair info on this proc only count pair once if newton_pair is off both atom I,J must be in group if allflag is set, compute requested info about pair if forceflag = 1, pair must be within force cutoff, else neighbor cutoff ------------------------------------------------------------------------- */ int ComputePropertyLocal::count_pairs(int allflag, int forceflag) { - int i,j,m,n,ii,jj,inum,jnum,itype,jtype; + int i,j,m,ii,jj,inum,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,rsq; int *ilist,*jlist,*numneigh,**firstneigh; double **x = atom->x; int *type = atom->type; int *mask = atom->mask; int nlocal = atom->nlocal; int newton_pair = force->newton_pair; // invoke half neighbor list (will copy or build if necessary) if (allflag == 0) neighbor->build_one(list->index); inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; // loop over neighbors of my atoms // skip if I or J are not in group double **cutsq = force->pair->cutsq; - m = n = 0; + m = 0; for (ii = 0; ii < inum; ii++) { i = ilist[ii]; if (!(mask[i] & groupbit)) continue; xtmp = x[i][0]; ytmp = x[i][1]; ztmp = x[i][2]; itype = type[i]; jlist = firstneigh[i]; jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; j &= NEIGHMASK; if (!(mask[j] & groupbit)) continue; if (newton_pair == 0 && j >= nlocal) continue; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; delz = ztmp - x[j][2]; rsq = delx*delx + dely*dely + delz*delz; jtype = type[j]; if (forceflag && rsq >= cutsq[itype][jtype]) continue; if (allflag) { indices[m][0] = i; indices[m][1] = j; } m++; } } return m; } /* ---------------------------------------------------------------------- count bonds on this proc only count bond once if newton_bond is off all atoms in interaction must be in group all atoms in interaction must be known to proc if bond is deleted (type = 0), do not count if bond is turned off (type < 0), still count ------------------------------------------------------------------------- */ int ComputePropertyLocal::count_bonds(int flag) { int i,atom1,atom2; int *num_bond = atom->num_bond; tagint **bond_atom = atom->bond_atom; int **bond_type = atom->bond_type; tagint *tag = atom->tag; int *mask = atom->mask; int nlocal = atom->nlocal; int newton_bond = force->newton_bond; int m = 0; for (atom1 = 0; atom1 < nlocal; atom1++) { if (!(mask[atom1] & groupbit)) continue; for (i = 0; i < num_bond[atom1]; i++) { atom2 = atom->map(bond_atom[atom1][i]); if (atom2 < 0 || !(mask[atom2] & groupbit)) continue; if (newton_bond == 0 && tag[atom1] > tag[atom2]) continue; if (bond_type[atom1][i] == 0) continue; if (flag) { indices[m][0] = atom1; indices[m][1] = i; } m++; } } return m; } /* ---------------------------------------------------------------------- count angles on this proc only count if 2nd atom is the one storing the angle all atoms in interaction must be in group all atoms in interaction must be known to proc if angle is deleted (type = 0), do not count if angle is turned off (type < 0), still count ------------------------------------------------------------------------- */ int ComputePropertyLocal::count_angles(int flag) { int 
i,atom1,atom2,atom3; int *num_angle = atom->num_angle; tagint **angle_atom1 = atom->angle_atom1; tagint **angle_atom2 = atom->angle_atom2; tagint **angle_atom3 = atom->angle_atom3; int **angle_type = atom->angle_type; tagint *tag = atom->tag; int *mask = atom->mask; int nlocal = atom->nlocal; int m = 0; for (atom2 = 0; atom2 < nlocal; atom2++) { if (!(mask[atom2] & groupbit)) continue; for (i = 0; i < num_angle[atom2]; i++) { if (tag[atom2] != angle_atom2[atom2][i]) continue; atom1 = atom->map(angle_atom1[atom2][i]); if (atom1 < 0 || !(mask[atom1] & groupbit)) continue; atom3 = atom->map(angle_atom3[atom2][i]); if (atom3 < 0 || !(mask[atom3] & groupbit)) continue; if (angle_type[atom2][i] == 0) continue; if (flag) { indices[m][0] = atom2; indices[m][1] = i; } m++; } } return m; } /* ---------------------------------------------------------------------- count dihedrals on this proc only count if 2nd atom is the one storing the dihedral all atoms in interaction must be in group all atoms in interaction must be known to proc ------------------------------------------------------------------------- */ int ComputePropertyLocal::count_dihedrals(int flag) { int i,atom1,atom2,atom3,atom4; int *num_dihedral = atom->num_dihedral; tagint **dihedral_atom1 = atom->dihedral_atom1; tagint **dihedral_atom2 = atom->dihedral_atom2; tagint **dihedral_atom3 = atom->dihedral_atom3; tagint **dihedral_atom4 = atom->dihedral_atom4; tagint *tag = atom->tag; int *mask = atom->mask; int nlocal = atom->nlocal; int m = 0; for (atom2 = 0; atom2 < nlocal; atom2++) { if (!(mask[atom2] & groupbit)) continue; for (i = 0; i < num_dihedral[atom2]; i++) { if (tag[atom2] != dihedral_atom2[atom2][i]) continue; atom1 = atom->map(dihedral_atom1[atom2][i]); if (atom1 < 0 || !(mask[atom1] & groupbit)) continue; atom3 = atom->map(dihedral_atom3[atom2][i]); if (atom3 < 0 || !(mask[atom3] & groupbit)) continue; atom4 = atom->map(dihedral_atom4[atom2][i]); if (atom4 < 0 || !(mask[atom4] & groupbit)) continue; if (flag) { indices[m][0] = atom2; indices[m][1] = i; } m++; } } return m; } /* ---------------------------------------------------------------------- count impropers on this proc only count if 2nd atom is the one storing the improper all atoms in interaction must be in group all atoms in interaction must be known to proc ------------------------------------------------------------------------- */ int ComputePropertyLocal::count_impropers(int flag) { int i,atom1,atom2,atom3,atom4; int *num_improper = atom->num_improper; tagint **improper_atom1 = atom->improper_atom1; tagint **improper_atom2 = atom->improper_atom2; tagint **improper_atom3 = atom->improper_atom3; tagint **improper_atom4 = atom->improper_atom4; tagint *tag = atom->tag; int *mask = atom->mask; int nlocal = atom->nlocal; int m = 0; for (atom2 = 0; atom2 < nlocal; atom2++) { if (!(mask[atom2] & groupbit)) continue; for (i = 0; i < num_improper[atom2]; i++) { if (tag[atom2] != improper_atom2[atom2][i]) continue; atom1 = atom->map(improper_atom1[atom2][i]); if (atom1 < 0 || !(mask[atom1] & groupbit)) continue; atom3 = atom->map(improper_atom3[atom2][i]); if (atom3 < 0 || !(mask[atom3] & groupbit)) continue; atom4 = atom->map(improper_atom4[atom2][i]); if (atom4 < 0 || !(mask[atom4] & groupbit)) continue; if (flag) { indices[m][0] = atom2; indices[m][1] = i; } m++; } } return m; } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::reallocate(int n) { // grow vector or array and indices array while (nmax < n) nmax += 
DELTA; if (nvalues == 1) { memory->destroy(vector); memory->create(vector,nmax,"property/local:vector"); vector_local = vector; } else { memory->destroy(array); memory->create(array,nmax,nvalues,"property/local:array"); array_local = array; } memory->destroy(indices); memory->create(indices,nmax,2,"property/local:indices"); } /* ---------------------------------------------------------------------- memory usage of local data ------------------------------------------------------------------------- */ double ComputePropertyLocal::memory_usage() { double bytes = nmax*nvalues * sizeof(double); bytes += nmax*2 * sizeof(int); return bytes; } /* ---------------------------------------------------------------------- one method for every keyword compute property/local can output the atom property is packed into buf starting at n with stride nvalues customize a new keyword by adding a method ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_patom1(int n) { int i; tagint *tag = atom->tag; for (int m = 0; m < ncount; m++) { i = indices[m][0]; buf[n] = tag[i]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_patom2(int n) { int i; tagint *tag = atom->tag; for (int m = 0; m < ncount; m++) { i = indices[m][1]; buf[n] = tag[i]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_ptype1(int n) { int i; int *type = atom->type; for (int m = 0; m < ncount; m++) { i = indices[m][0]; buf[n] = type[i]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_ptype2(int n) { int i; int *type = atom->type; for (int m = 0; m < ncount; m++) { i = indices[m][1]; buf[n] = type[i]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_batom1(int n) { int i; tagint *tag = atom->tag; for (int m = 0; m < ncount; m++) { i = indices[m][0]; buf[n] = tag[i]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_batom2(int n) { int i,j; tagint **bond_atom = atom->bond_atom; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = bond_atom[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_btype(int n) { int i,j; int **bond_type = atom->bond_type; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = bond_type[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_aatom1(int n) { int i,j; tagint **angle_atom1 = atom->angle_atom1; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = angle_atom1[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_aatom2(int n) { int i,j; tagint **angle_atom2 = atom->angle_atom2; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = angle_atom2[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_aatom3(int n) { int i,j; tagint **angle_atom3 = atom->angle_atom3; for (int m 
= 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = angle_atom3[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_atype(int n) { int i,j; int **angle_type = atom->angle_type; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = angle_type[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_datom1(int n) { int i,j; tagint **dihedral_atom1 = atom->dihedral_atom1; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = dihedral_atom1[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_datom2(int n) { int i,j; tagint **dihedral_atom2 = atom->dihedral_atom2; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = dihedral_atom2[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_datom3(int n) { int i,j; tagint **dihedral_atom3 = atom->dihedral_atom3; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = dihedral_atom3[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_datom4(int n) { int i,j; tagint **dihedral_atom4 = atom->dihedral_atom4; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = dihedral_atom4[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_dtype(int n) { int i,j; int **dihedral_type = atom->dihedral_type; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = dihedral_type[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_iatom1(int n) { int i,j; tagint **improper_atom1 = atom->improper_atom1; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = improper_atom1[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_iatom2(int n) { int i,j; tagint **improper_atom2 = atom->improper_atom2; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = improper_atom2[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_iatom3(int n) { int i,j; tagint **improper_atom3 = atom->improper_atom3; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = improper_atom3[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_iatom4(int n) { int i,j; tagint **improper_atom4 = atom->improper_atom4; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = improper_atom4[i][j]; n += nvalues; } } /* ---------------------------------------------------------------------- */ void ComputePropertyLocal::pack_itype(int n) { int i,j; int **improper_type = atom->improper_type; for (int m = 0; m < ncount; m++) { i = indices[m][0]; j = indices[m][1]; buf[n] = improper_type[i][j]; n += nvalues; } } diff --git a/src/version.h b/src/version.h index 535898f61..218ca98a1 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ 
-#define LAMMPS_VERSION "27 May 2014"
+#define LAMMPS_VERSION "29 May 2014"
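
A minimal build sketch using these changes, assuming no library setup is
needed for the KOKKOS package beyond what this patch shows (the "kokkos"
package name comes from the PACKAGE list in src/Makefile above, and the
"cuda" machine name from the new src/MAKE/Makefile.cuda):

cd src
make yes-kokkos     # install the kokkos package via the standard yes-<package> target
make cuda           # build with MAKE/Makefile.cuda through the .DEFAULT rule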