diff --git a/doc/package.txt b/doc/package.txt index a74bded19..54f534313 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -1,286 +1,287 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line package command :h3 [Syntax:] package style args :pre style = {gpu} or {cuda} or {omp} :ulb,l args = arguments specific to the style :l {gpu} args = mode first last split keyword value ... mode = force or force/neigh first = ID of first GPU to be used on each node last = ID of last GPU to be used on each node split = fraction of particles assigned to the GPU zero or more keyword/value pairs may be appended keywords = {threads_per_atom} or {cellsize} or {device} {threads_per_atom} value = Nthreads Nthreads = # of GPU threads used per atom {cellsize} value = dist dist = length (distance units) in each dimension for neighbor bins {device} value = device_type - device_type = {kepler} or {fermi} or {cypress} or {generic} + device_type = {kepler} or {fermi} or {cypress} or {phi} or {intel} or {generic} {cuda} args = keyword value ... one or more keyword/value pairs may be appended keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa} {gpu/node} value = N N = number of GPUs to be used per node {gpu/node/special} values = N gpu1 .. gpuN N = number of GPUs to be used per node gpu1 .. gpuN = N IDs of the GPUs to use {timing} values = none {test} values = id id = atom-ID of a test particle {override/bpa} values = flag flag = 0 for TpA algorithm, 1 for BpA algorithm {omp} args = Nthreads mode Nthreads = # of OpenMP threads to associate with each MPI process mode = force or force/neigh (optional) :pre :ule [Examples:] package gpu force 0 0 1.0 package gpu force 0 0 0.75 package gpu force/neigh 0 0 1.0 package gpu force/neigh 0 1 -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 package omp * force/neigh package omp 4 force :pre [Description:] This command invokes package-specific settings. Currently the following packages use it: GPU, USER-CUDA, and USER-OMP. To use the accelerated GPU and USER-OMP styles, the use of the package command is required. However, as described in the "Defaults" section below, if you use the "-sf gpu" or "-sf omp" "command-line options"_Section_start.html#start_7 to enable use of these styles, then default package settings are enabled. In that case you only need to use the package command if you want to change the defaults. To use the accelerated USER-CUDA styles, the package command is not required as defaults are assigned internally. You only need to use the package command if you want to change the defaults. See "Section_accelerate"_Section_accelerate.html of the manual for more details about using these various packages for accelerating LAMMPS calculations. :line The {gpu} style invokes options associated with the use of the GPU package. The {mode} setting specifies where neighbor list calculations will be performed. If {mode} is force, neighbor list calculation is performed on the CPU. If {mode} is force/neigh, neighbor list calculation is performed on the GPU. GPU neighbor list calculation currently cannot be used with a triclinic box. GPU neighbor list calculation currently cannot be used with "hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines.
In these cases, it will typically be more efficient to only use CPU neighbor list builds. The {first} and {last} settings specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from {first} to {last} will be used. The {split} setting can be used for load balancing force calculation work between CPU and GPU cores in GPU-enabled pair styles. If 0 < {split} < 1.0, a fixed fraction of particles is offloaded to the GPU while force calculation for the other particles occurs simultaneously on the CPU. If {split}<0, the optimal fraction (based on CPU and GPU timings) is calculated every 25 timesteps. If {split} = 1.0, all force calculations for GPU-accelerated pair styles are performed on the GPU. In this case, "hybrid"_pair_hybrid.html, "bond"_bond_style.html, "angle"_angle_style.html, "dihedral"_dihedral_style.html, "improper"_improper_style.html, and "long-range"_kspace_style.html calculations can be performed on the CPU while the GPU is performing force calculations for the GPU-enabled pair style. If all CPU force computations complete before the GPU, LAMMPS will block until the GPU has finished before continuing the timestep. As an example, if you have two GPUs per node and 8 CPU cores per node, and would like to run on 4 nodes (32 cores) with dynamic balancing of force calculation across CPU and GPU cores, you could specify package gpu force/neigh 0 1 -1 :pre In this case, all CPU cores and GPU devices on the nodes would be utilized. Each GPU device would be shared by 4 CPU cores. The CPU cores would perform force calculations for some fraction of the particles at the same time the GPUs perform force calculations for the other particles. The {threads_per_atom} keyword allows control of the number of GPU threads used per atom to perform the short range force calculation. By default, the value will be chosen based on the pair style; however, the value can be set with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32. The {cellsize} keyword can be used to control the size of the cells used for binning atoms in neighbor list calculations. Setting this value is normally not needed; the optimal value is close to the default (equal to the cutoff distance for the short range interactions plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs than CPUs and this can be used to reduce the time required for long-range calculations or in some cases to eliminate them with models such as "coul/wolf"_pair_coul.html or "coul/dsf"_pair_coul.html. For very large cutoffs, it can be more efficient to use smaller values for cellsize in parallel simulations. For example, with a cutoff of 20*sigma and a neighbor skin of sigma, a cellsize of 5.25*sigma can be efficient for parallel simulations. The {device} keyword can be used to tune parameters to optimize for a specific accelerator when using OpenCL. For CUDA, the {device} keyword is ignored. Currently, the device type is limited to NVIDIA Kepler, NVIDIA Fermi, -AMD Cypress, or a generic device. More devices will be added soon. The default -device type can be specified when building LAMMPS with the GPU library. +AMD Cypress, Intel CPU, Intel Phi, or a generic device. More devices will be +added soon. The default device type can be specified when building LAMMPS with +the GPU library.
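As an illustration only (this combined command does not appear in the Examples above, and the particular values are arbitrary choices for the sketch), the keywords discussed in this section can be appended to a single package command:

package gpu force/neigh 0 1 1.0 threads_per_atom 4 device fermi :pre

This would use GPUs 0 and 1 on each node, build neighbor lists and compute all pair forces for GPU-enabled styles on the GPU, use 4 GPU threads per atom, and tune OpenCL parameters for a Fermi device.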
:line The {cuda} style invokes options associated with the use of the USER-CUDA package. The {gpu/node} keyword specifies the number {N} of GPUs to be used on each node. An MPI process with rank {K} will use the GPU (K mod N). This implies that processes should be assigned with successive ranks on each node, which is the default with most (or even all) MPI implementations. The default value for {N} is 2. The {gpu/node/special} keyword also specifies the number (N) of GPUs to be used on each node, but allows more control over their specification. An MPI process with rank {K} will use the GPU {gpuI} with I = (K mod N) + 1. This implies that processes should be assigned with successive ranks on each node, which is the default with most (or even all) MPI implementations. For example, if you have three GPUs on a machine, one of which is used for the X-Server (the GPU with the ID 1) while the others (with IDs 0 and 2) are used for computations, you would specify: package cuda gpu/node/special 2 0 2 :pre A main purpose of the {gpu/node/special} option is to allow two (or more) simulations to be run on one workstation. In that case one would set the first simulation to use GPU 0 and the second to use GPU 1. This is not necessary, though, if the GPUs are in what is called {compute exclusive} mode. Using that setting, every process will get its own GPU automatically. This {compute exclusive} mode can be set as root using the {nvidia-smi} tool which is part of the CUDA installation. Note that if the {gpu/node/special} keyword is not used, the USER-CUDA package sorts existing GPUs on each node according to their number of multiprocessors. This way, compute GPUs will be prioritized over X-Server GPUs. Use of the {timing} keyword will output detailed timing information for various subroutines. The {test} keyword will output info for the specified atom at several points during each time step. This is mainly useful for debugging purposes. Note that the simulation will be severely slowed down if this option is used. The {override/bpa} keyword can be used to specify which mode is used for pair-force evaluation. TpA = one thread per atom; BpA = one block per atom. If this keyword is not used, a short test at the beginning of each run will determine which method is more effective (the result of this test is part of the LAMMPS output). Therefore it is usually not necessary to use this keyword. :line The {omp} style invokes options associated with the use of the USER-OMP package. The first argument allows you to explicitly set the number of OpenMP threads to be allocated for each MPI process. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could run MPI on 2 cores on each node (e.g. using options for the mpirun command), and set the {Nthreads} setting to 4. This would effectively use all 8 cores on each node, since each MPI process would spawn 4 threads (one of which runs as part of the MPI process itself). For performance reasons, you should not set {Nthreads} to more threads than there are physical cores (per MPI task), but LAMMPS cannot check for this. An {Nthreads} value of '*' instructs LAMMPS to use whatever is the default for the given OpenMP environment. This is usually determined via the {OMP_NUM_THREADS} environment variable or the compiler runtime. Please note that in most cases the default for OpenMP-capable compilers is to use one thread for each available CPU core when {OMP_NUM_THREADS} is not set, which can lead to extremely bad performance.
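As a sketch of the dual quad-core example above (the executable name lmp_machine and the script name in.script are placeholders, and the exact mpirun options for placing MPI tasks on nodes vary between MPI implementations), a single such node could be driven with

mpirun -np 2 lmp_machine -sf omp -in in.script :pre

and an input script that contains, near the top,

package omp 4 force/neigh :pre

so that each of the 2 MPI tasks spawns 4 OpenMP threads, occupying all 8 cores of the node.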
Which combination of threads and MPI tasks gives the best performance is difficult to predict and can depend on many components of your input. Not all features of LAMMPS support OpenMP, and the parallel efficiency can be very different, too. The {mode} setting specifies where neighbor list calculations will be multi-threaded as well. If {mode} is force, neighbor list calculation is performed in serial. If {mode} is force/neigh, a multi-threaded neighbor list build is used. Using the force/neigh setting is almost always faster and should produce identical neighbor lists at the expense of using somewhat more memory (neighbor list pages are always allocated for all threads at the same time and each thread works on its own pages). :line [Restrictions:] This command cannot be used after the simulation box is defined by a "read_data"_read_data.html or "create_box"_create_box.html command. The cuda style of this command can only be invoked if LAMMPS was built with the USER-CUDA package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. The gpu style of this command can only be invoked if LAMMPS was built with the GPU package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. When using the "r-RESPA run style"_run_style.html, GPU-accelerated styles can only be used on the outermost RESPA level. The omp style of this command can only be invoked if LAMMPS was built with the USER-OMP package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. [Related commands:] "suffix"_suffix.html [Default:] If the "-sf gpu" "command-line switch"_Section_start.html#start_7 is used, then it is as if the command "package gpu force/neigh 0 0 1" were invoked, to specify default settings for the GPU package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. The default settings for the USER-CUDA package are "package cuda gpu/node 2". This is the case whether the "-sf cuda" "command-line switch"_Section_start.html#start_7 is used or not. If the "-sf omp" "command-line switch"_Section_start.html#start_7 is used, then it is as if the command "package omp *" were invoked, to specify default settings for the USER-OMP package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 5fffe77c8..12a18ae87 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -1,457 +1,463 @@ /*************************************************************************** nvd_device.h ------------------- W. Michael Brown Utilities for dealing with cuda devices __________________________________________________________________________ This file is part of the Geryon Unified Coprocessor Library (UCL) __________________________________________________________________________ begin : Thu Jan 21 2010 copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */ #ifndef NVD_DEVICE #define NVD_DEVICE #include <string> #include <vector> #include <iostream> #include "nvd_macros.h" #include "ucl_types.h" namespace ucl_cudadr { // -------------------------------------------------------------------------- // - COMMAND QUEUE STUFF // -------------------------------------------------------------------------- typedef CUstream command_queue; inline void ucl_sync(CUstream &stream) { CU_SAFE_CALL(cuStreamSynchronize(stream)); } struct NVDProperties { int device_id; std::string name; int major; int minor; CUDA_INT_TYPE totalGlobalMem; int multiProcessorCount; CUdevprop_st p; int kernelExecTimeoutEnabled; int integrated; int canMapHostMemory; int concurrentKernels; int ECCEnabled; int computeMode; }; /// Class for looking at device properties /** \note Calls to change the device outside of the class results in incorrect * behavior * \note There is no error checking for indexing past the number of devices **/ class UCL_Device { public: /// Collect properties for every GPU on the node /** \note You must set the active GPU with set() before using the device **/ inline UCL_Device(); inline ~UCL_Device(); /// Returns 1 (For compatibility with OpenCL) inline int num_platforms() { return 1; } /// Return a string with name and info of the current platform inline std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; } /// Delete any contexts/data and set the platform number to be used inline int set_platform(const int pid); /// Return the number of devices that support CUDA inline int num_devices() { return _properties.size(); } /// Set the CUDA device to the specified device number /** A context and default command queue will be created for the device * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * be allocated for use. clear() is called to delete any contexts and * associated data from previous calls to set(). 
**/ inline int set(int num); /// Delete any context and associated data stored from a call to set() inline void clear(); /// Get the current device number inline int device_num() { return _device; } /// Returns the default stream for the current device inline command_queue & cq() { return cq(0); } /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } /// Block until all commands in the default stream have completed inline void sync() { sync(0); } /// Block until all commands in the specified stream have completed inline void sync(const int i) { ucl_sync(cq(i)); } /// Get the number of command queues currently available on device inline int num_queues() { return _cq.size(); } /// Add a stream for device computations inline void push_command_queue() { _cq.push_back(CUstream()); CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0)); } /// Remove a stream for device computations /** \note You cannot delete the default stream **/ inline void pop_command_queue() { if (_cq.size()<2) return; CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back())); _cq.pop_back(); } /// Set the default command queue (by default this is the null stream) /** \param i index of the command queue (as added by push_command_queue()) If i is 0, the default command queue is set to the null stream **/ inline void set_command_queue(const int i) { if (i==0) _cq[0]=0; else _cq[0]=_cq[i]; } /// Get the current CUDA device name inline std::string name() { return name(_device); } /// Get the CUDA device name inline std::string name(const int i) { return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } /// Get a string telling the type of the device inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device inline bool double_precision(const int i) {return arch(i)>=1.3;} + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].multiProcessorCount; } + /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } /// Get the number of cores inline unsigned cores(const int i) { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32; else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48; else return _properties[i].multiProcessorCount*192; } /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory inline double gigabytes(const int i) { return 
static_cast<double>(_properties[i].totalGlobalMem)/1073741824; } /// Get the bytes of global memory in the current device inline size_t bytes() { return bytes(_device); } /// Get the bytes of global memory inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } // Get the gigabytes of free memory in the current device inline double free_gigabytes() { return free_gigabytes(_device); } // Get the gigabytes of free memory inline double free_gigabytes(const int i) { return static_cast<double>(free_bytes(i))/1073741824; } // Get the bytes of free memory in the current device inline size_t free_bytes() { return free_bytes(_device); } // Get the bytes of free memory inline size_t free_bytes(const int i) { CUDA_INT_TYPE dfree, dtotal; CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal)); return static_cast<size_t>(dfree); } /// Return the GPGPU compute capability for current device inline double arch() { return arch(_device); } /// Return the GPGPU compute capability inline double arch(const int i) { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;} /// Clock rate in GHz for current device inline double clock_rate() { return clock_rate(_device); } /// Clock rate in GHz inline double clock_rate(const int i) { return _properties[i].p.clockRate*1e-6;} /// Get the maximum number of threads per block inline size_t group_size() { return group_size(_device); } /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].p.maxThreadsPerBlock; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported(const int i) { return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } /// True if splitting device into equal subdevices supported inline bool fission_equal(const int i) { return false; } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts() { return fission_by_counts(_device); } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts(const int i) { return false; } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity() { return fission_by_affinity(_device); } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return false; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() { return max_sub_devices(_device); } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return 0; } /// List all devices along with all properties inline void print_all(std::ostream &out); private: int _device, _num_devices; std::vector<NVDProperties> _properties; std::vector<CUstream> _cq; CUdevice _cu_device; CUcontext _context; }; // Grabs the properties for all devices 
UCL_Device::UCL_Device() { CU_SAFE_CALL_NS(cuInit(0)); CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices)); for (int dev=0; dev<_num_devices; ++dev) { CUdevice m; CU_SAFE_CALL_NS(cuDeviceGet(&m,dev)); int major, minor; CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m)); if (major==9999) continue; _properties.push_back(NVDProperties()); _properties.back().device_id=dev; _properties.back().major=major; _properties.back().minor=minor; char namecstr[1024]; CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m)); _properties.back().name=namecstr; CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m)); CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, m)); CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m)); #if CUDA_VERSION >= 2020 CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev)); #endif #if CUDA_VERSION >= 3010 CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); #endif } _device=-1; _cq.push_back(CUstream()); _cq.back()=0; } UCL_Device::~UCL_Device() { clear(); } int UCL_Device::set_platform(const int pid) { clear(); #ifdef UCL_DEBUG assert(pid<num_platforms()); #endif return UCL_SUCCESS; } // Set the CUDA device to the specified device number int UCL_Device::set(int num) { clear(); _device=_properties[num].device_id; CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device)); CUresult err=cuCtxCreate(&_context,0,_cu_device); if (err!=CUDA_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not access accelerator number " << num << " for use.\n"; UCL_GERYON_EXIT; #endif return UCL_ERROR; } return UCL_SUCCESS; } void UCL_Device::clear() { if (_device>-1) { for (int i=1; i<num_queues(); i++) pop_command_queue(); cuCtxDestroy(_context); } _device=-1; } // List all devices along with all properties void UCL_Device::print_all(std::ostream &out) { #if CUDA_VERSION >= 2020 int driver_version; cuDriverGetVersion(&driver_version); out << "CUDA Driver Version: " << driver_version/1000 << "." 
<< driver_version%100 << std::endl; #endif if (num_devices() == 0) out << "There is no device supporting CUDA\n"; for (int i=0; i<num_devices(); ++i) { out << "\nDevice " << i << ": \"" << name(i) << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; out << " Compute capability: " << arch(i) << std::endl; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; else out << "No\n"; out << " Total amount of global memory: " << gigabytes(i) << " GB\n"; #if CUDA_VERSION >= 2000 out << " Number of compute units/multiprocessors: " << _properties[i].multiProcessorCount << std::endl; out << " Number of cores: " << cores(i) << std::endl; #endif out << " Total amount of constant memory: " << _properties[i].p.totalConstantMemory << " bytes\n"; out << " Total amount of local/shared memory per block: " << _properties[i].p.sharedMemPerBlock << " bytes\n"; out << " Total number of registers available per block: " << _properties[i].p.regsPerBlock << std::endl; out << " Warp size: " << _properties[i].p.SIMDWidth << std::endl; out << " Maximum number of threads per block: " << _properties[i].p.maxThreadsPerBlock << std::endl; out << " Maximum group size (# of threads per block) " << _properties[i].p.maxThreadsDim[0] << " x " << _properties[i].p.maxThreadsDim[1] << " x " << _properties[i].p.maxThreadsDim[2] << std::endl; out << " Maximum item sizes (# threads for each dim) " << _properties[i].p.maxGridSize[0] << " x " << _properties[i].p.maxGridSize[1] << " x " << _properties[i].p.maxGridSize[2] << std::endl; out << " Maximum memory pitch: " << max_pitch(i) << " bytes\n"; out << " Texture alignment: " << _properties[i].p.textureAlign << " bytes\n"; out << " Clock rate: " << clock_rate(i) << " GHz\n"; #if CUDA_VERSION >= 2020 out << " Run time limit on kernels: "; if (_properties[i].kernelExecTimeoutEnabled) out << "Yes\n"; else out << "No\n"; out << " Integrated: "; if (_properties[i].integrated) out << "Yes\n"; else out << "No\n"; out << " Support host page-locked memory mapping: "; if (_properties[i].canMapHostMemory) out << "Yes\n"; else out << "No\n"; out << " Compute mode: "; if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT) out << "Default\n"; // multiple threads can use device else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE) out << "Exclusive\n"; // only thread can use device else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED) out << "Prohibited\n"; // no thread can use device #if CUDART_VERSION >= 4000 else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) out << "Exclusive Process\n"; // multiple threads 1 process #endif else out << "Unknown\n"; #endif #if CUDA_VERSION >= 3010 out << " Concurrent kernel execution: "; if (_properties[i].concurrentKernels) out << "Yes\n"; else out << "No\n"; out << " Device has ECC support enabled: "; if (_properties[i].ECCEnabled) out << "Yes\n"; else out << "No\n"; #endif } } } #endif diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 79fa53d55..8dadcf2ef 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -1,598 +1,597 @@ /*************************************************************************** ocl_device.h ------------------- W. 
Michael Brown Utilities for dealing with OpenCL devices __________________________________________________________________________ This file is part of the Geryon Unified Coprocessor Library (UCL) __________________________________________________________________________ begin : Mon Dec 23 2009 copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ #ifndef OCL_DEVICE #define OCL_DEVICE #include <string> #include <vector> #include <iostream> #ifdef __APPLE__ #include <OpenCL/cl.h> #include <OpenCL/cl_platform.h> #else #include <CL/cl.h> #include <CL/cl_platform.h> #endif #include "ocl_macros.h" #include "ucl_types.h" namespace ucl_opencl { // -------------------------------------------------------------------------- // - COMMAND QUEUE STUFF // -------------------------------------------------------------------------- typedef cl_command_queue command_queue; typedef cl_context context_type; inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } inline bool _shared_mem_device(cl_device_type &device_type) { return (device_type==CL_DEVICE_TYPE_CPU); } struct OCLProperties { std::string name; cl_device_type device_type; cl_ulong global_mem; cl_ulong shared_mem; cl_ulong const_mem; cl_uint compute_units; cl_uint clock; size_t work_group_size; size_t work_item_size[3]; bool double_precision; int alignment; size_t timer_resolution; bool ecc_support; std::string c_version; bool partition_equal, partition_counts, partition_affinity; cl_uint max_sub_devices; }; /// Class for looking at data parallel device properties /** \note Calls to change the device outside of the class results in incorrect * behavior * \note There is no error checking for indexing past the number of devices **/ class UCL_Device { public: /// Collect properties for every device on the node /** \note You must set the active GPU with set() before using the device **/ inline UCL_Device(); inline ~UCL_Device(); /// Return the number of platforms (0 if error or no platforms) inline int num_platforms() { return _num_platforms; } /// Return a string with name and info of the current platform inline std::string platform_name(); /// Delete any contexts/data and set the platform number to be used inline int set_platform(const int pid); /// Return the number of devices that support OpenCL inline int num_devices() { return _num_devices; } /// Set the OpenCL device to the specified device number /** A context and default command queue will be created for the device * * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * be allocated for use. clear() is called to delete any contexts and * associated data from previous calls to set(). 
**/ inline int set(int num); /// Delete any context and associated data stored from a call to set() inline void clear(); /// Get the current device number inline int device_num() { return _device; } /// Returns the context for the current device inline cl_context & context() { return _context; } /// Returns the default stream for the current device inline command_queue & cq() { return cq(_default_cq); } /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } /// Set the default command queue /** \param i index of the command queue (as added by push_command_queue()) If i is 0, the command queue created with device initialization is used **/ inline void set_command_queue(const int i) { _default_cq=i; } /// Block until all commands in the default stream have completed inline void sync() { sync(_default_cq); } /// Block until all commands in the specified stream have completed inline void sync(const int i) { ucl_sync(cq(i)); } /// Get the number of command queues currently available on device inline int num_queues() { return _cq.size(); } /// Add a command queue for device computations (with profiling enabled) inline void push_command_queue() { cl_int errorv; _cq.push_back(cl_command_queue()); _cq.back()=clCreateCommandQueue(_context,_cl_device, CL_QUEUE_PROFILING_ENABLE,&errorv); if (errorv!=CL_SUCCESS) { std::cerr << "Could not create command queue on device: " << name() << std::endl; UCL_GERYON_EXIT; } } /// Remove a stream for device computations /** \note You cannot delete the default stream **/ inline void pop_command_queue() { if (_cq.size()<2) return; CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); _cq.pop_back(); } /// Get the current OpenCL device name inline std::string name() { return name(_device); } /// Get the OpenCL device name inline std::string name(const int i) { return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } /// Get a string telling the type of the device inline std::string device_type_name(const int i); /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i); /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return _shared_mem_device(_properties[i].device_type); } /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device inline bool double_precision(const int i) {return _properties[i].double_precision;} - /// Get the number of cores in the current device - inline unsigned cores() { return cores(_device); } - /// Get the number of cores - inline unsigned cores(const int i) - { if (device_type(i)==UCL_CPU) return _properties[i].compute_units; - else return _properties[i].compute_units*8; } - + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].compute_units; } + /// Get the gigabytes of global memory in the current device inline double 
gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory inline double gigabytes(const int i) { return static_cast<double>(_properties[i].global_mem)/1073741824; } /// Get the bytes of global memory in the current device inline size_t bytes() { return bytes(_device); } /// Get the bytes of global memory inline size_t bytes(const int i) { return _properties[i].global_mem; } /// Return the GPGPU revision number for current device //inline double revision() { return revision(_device); } /// Return the GPGPU revision number //inline double revision(const int i) // { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;} /// Clock rate in GHz for current device inline double clock_rate() { return clock_rate(_device); } /// Clock rate in GHz inline double clock_rate(const int i) { return _properties[i].clock*1e-3;} /// Return the address alignment in bytes inline int alignment() { return alignment(_device); } /// Return the address alignment in bytes inline int alignment(const int i) { return _properties[i].alignment; } /// Return the timer resolution inline size_t timer_resolution() { return timer_resolution(_device); } /// Return the timer resolution inline size_t timer_resolution(const int i) { return _properties[i].timer_resolution; } /// Get the maximum number of threads per block inline size_t group_size() { return group_size(_device); } /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].work_group_size; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes inline size_t max_pitch(const int i) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported(const int i) { return true; } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } /// True if splitting device into equal subdevices supported inline bool fission_equal(const int i) { return _properties[i].partition_equal; } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts() { return fission_by_counts(_device); } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts(const int i) { return _properties[i].partition_counts; } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity() { return fission_by_affinity(_device); } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return _properties[i].partition_affinity; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() { return max_sub_devices(_device); } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return _properties[i].max_sub_devices; } /// List all devices along with all properties inline void print_all(std::ostream &out); /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } private: int _num_platforms; // Number of platforms int _platform; // 
UCL_Device ID for current platform cl_platform_id _cl_platform; // OpenCL ID for current platform cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms cl_context _context; // Context used for accessing the device std::vector<cl_command_queue> _cq;// The default command queue for this device int _device; // UCL_Device ID for current device cl_device_id _cl_device; // OpenCL ID for current device std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices int _num_devices; // Number of devices std::vector<OCLProperties> _properties; // Properties for each device inline void add_properties(cl_device_id); inline int create_context(); int _default_cq; }; // Grabs the properties for all devices UCL_Device::UCL_Device() { _device=-1; // --- Get Number of Platforms cl_uint nplatforms; cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); if (errorv!=CL_SUCCESS) { _num_platforms=0; return; } else _num_platforms=static_cast<int>(nplatforms); set_platform(0); } UCL_Device::~UCL_Device() { clear(); } void UCL_Device::clear() { if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); _cq.pop_back(); } CL_DESTRUCT_CALL(clReleaseContext(_context)); } _device=-1; } int UCL_Device::set_platform(int pid) { clear(); cl_int errorv; _cl_device=0; _device=-1; _num_devices=0; _default_cq=0; #ifdef UCL_DEBUG assert(pid<num_platforms()); #endif _platform=pid; _cl_platform=_cl_platforms[_platform]; // --- Get Number of Devices cl_uint n; errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n); _num_devices=n; if (errorv!=CL_SUCCESS || _num_devices==0) { _num_devices=0; return UCL_ERROR; } cl_device_id device_list[_num_devices]; CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, &n)); // --- Store properties for each device for (int i=0; i<_num_devices; i++) { _cl_devices.push_back(device_list[i]); add_properties(device_list[i]); } return UCL_SUCCESS; } int UCL_Device::create_context() { cl_int errorv; cl_context_properties props[3]; props[0]=CL_CONTEXT_PLATFORM; props[1]=_platform; props[2]=0; _context=clCreateContext(0,1,&_cl_device,NULL,NULL,&errorv); if (errorv!=CL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not access accelerator number " << _device << " for use.\n"; UCL_GERYON_EXIT; #endif return UCL_ERROR; } push_command_queue(); _default_cq=0; return UCL_SUCCESS; } void UCL_Device::add_properties(cl_device_id device_list) { OCLProperties op; char buffer[1024]; cl_bool ans_bool; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL)); op.name=buffer; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(op.global_mem),&op.global_mem,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE, sizeof(op.shared_mem),&op.shared_mem,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(op.const_mem),&op.const_mem,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE, sizeof(op.device_type),&op.device_type,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(op.compute_units),&op.compute_units, NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(op.clock),&op.clock,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(op.work_group_size),&op.work_group_size, NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(op.work_item_size[0]),op.work_item_size, 
NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint),&op.alignment,NULL)); op.alignment/=8; // Determine if double precision is supported cl_uint double_width; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(double_width),&double_width,NULL)); if (double_width==0) op.double_precision=false; else op.double_precision=true; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t),&op.timer_resolution,NULL)); op.ecc_support=false; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(ans_bool),&ans_bool,NULL)); if (ans_bool==CL_TRUE) op.ecc_support=true; op.c_version=""; op.partition_equal=false; op.partition_counts=false; op.partition_affinity=false; #ifdef CL_VERSION_1_2 size_t return_bytes; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024, buffer,NULL)); op.c_version=buffer; cl_device_partition_property pinfo[4]; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_PROPERTIES, 4*sizeof(cl_device_partition_property), pinfo,&return_bytes)); int nprops=return_bytes/sizeof(cl_device_partition_property); for (int i=0; i<nprops; i++) { if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY) op.partition_equal=true; else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS) op.partition_counts=true; else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) op.partition_affinity=true; } CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, sizeof(cl_uint),&op.max_sub_devices,NULL)); #endif _properties.push_back(op); } std::string UCL_Device::platform_name() { char info[1024]; CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info, NULL)); std::string ans=std::string(info)+' '; CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info, NULL)); ans+=std::string(info)+' '; CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info, NULL)); ans+=std::string(info); return ans; } // Get a string telling the type of the device std::string UCL_Device::device_type_name(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return "CPU"; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) return "GPU"; else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) return "ACCELERATOR"; else return "DEFAULT"; } // Get a string telling the type of the device int UCL_Device::device_type(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return UCL_CPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) return UCL_GPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) return UCL_ACCELERATOR; else return UCL_DEFAULT; } // Set the CUDA device to the specified device number int UCL_Device::set(int num) { clear(); cl_device_id device_list[_num_devices]; cl_uint n; CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, device_list,&n)); _device=num; _cl_device=device_list[_device]; return create_context(); } // List all devices along with all properties void UCL_Device::print_all(std::ostream &out) { if (num_devices() == 0) out << "There is no device supporting OpenCL\n"; for (int i=0; i<num_devices(); ++i) { out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; else out << "No\n"; out << " Total amount of global memory: " << gigabytes(i) << " GB\n"; 
out << " Number of compute units/multiprocessors: " << _properties[i].compute_units << std::endl; //out << " Number of cores: " // << cores(i) << std::endl; out << " Total amount of constant memory: " << _properties[i].const_mem << " bytes\n"; out << " Total amount of local/shared memory per block: " << _properties[i].shared_mem << " bytes\n"; //out << " Total number of registers available per block: " // << _properties[i].regsPerBlock << std::endl; //out << " Warp size: " // << _properties[i].warpSize << std::endl; out << " Maximum group size (# of threads per block) " << _properties[i].work_group_size << std::endl; out << " Maximum item sizes (# threads for each dim) " << _properties[i].work_item_size[0] << " x " << _properties[i].work_item_size[1] << " x " << _properties[i].work_item_size[2] << std::endl; //out << " Maximum sizes of each dimension of a grid: " // << _properties[i].maxGridSize[0] << " x " // << _properties[i].maxGridSize[1] << " x " // << _properties[i].maxGridSize[2] << std::endl; //out << " Maximum memory pitch: " // << _properties[i].memPitch) << " bytes\n"; //out << " Texture alignment: " // << _properties[i].textureAlignment << " bytes\n"; out << " Clock rate: " << clock_rate(i) << " GHz\n"; //out << " Concurrent copy and execution: "; out << " ECC support: "; if (_properties[i].ecc_support) out << "Yes\n"; else out << "No\n"; out << " Device fission into equal partitions: "; if (fission_equal(i)) out << "Yes\n"; else out << "No\n"; out << " Device fission by counts: "; if (fission_by_counts(i)) out << "Yes\n"; else out << "No\n"; out << " Device fission by affinity: "; if (fission_by_affinity(i)) out << "Yes\n"; else out << "No\n"; out << " Maximum subdevices from fission: " << max_sub_devices(i) << std::endl; } } } #endif diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 641087a6c..18bcf3c10 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -1,486 +1,488 @@ /*************************************************************************** base_ellipsoid.cpp ------------------- W. 
Michael Brown (ORNL) Base class for acceleration of ellipsoid potentials __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : Thu May 5 2011 email : brownw@ornl.gov ***************************************************************************/ #include "lal_base_ellipsoid.h" #include <cstdlib> using namespace LAMMPS_AL; #if defined(USE_OPENCL) #include "ellipsoid_nbor_cl.h" #elif defined(USE_CUDART) const char *ellipsoid_nbor=0; #else #include "ellipsoid_nbor_cubin.h" #endif #define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> global_device; template <class numtyp, class acctyp> BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { device=&global_device; ans=new Answer<numtyp,acctyp>(); nbor=new Neighbor(); } template <class numtyp, class acctyp> BaseEllipsoidT::~BaseEllipsoid() { delete ans; delete nbor; } template <class numtyp, class acctyp> int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const { return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ nbor->bytes_per_atom(max_nbors); } template <class numtyp, class acctyp> int BaseEllipsoidT::init_base(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const int ntypes, int **h_form, const void *ellipsoid_program, const void *lj_program, const char *k_name, const bool ellip_sphere) { screen=_screen; _ellipsoid_sphere=ellip_sphere; int gpu_nbor=0; if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH) gpu_nbor=1; else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) gpu_nbor=2; int _gpu_host=0; int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; _threads_per_atom=device->threads_per_atom(); int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,true, 1); if (success!=0) return success; ucl_device=device->gpu; atom=&device->atom; _block_size=device->block_ellipse(); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_lj.init(*ucl_device); time_nbor1.init(*ucl_device); time_ellipsoid.init(*ucl_device); time_nbor2.init(*ucl_device); time_ellipsoid2.init(*ucl_device); time_nbor3.init(*ucl_device); time_ellipsoid3.init(*ucl_device); time_lj.zero(); time_nbor1.zero(); time_ellipsoid.zero(); time_nbor2.zero(); time_ellipsoid2.zero(); time_nbor3.zero(); time_ellipsoid3.zero(); // See if we want fast GB-sphere or sphere-sphere calculations _host_form=h_form; _multiple_forms=false; for (int i=1; i<ntypes; i++) for (int j=i; j<ntypes; j++) if (_host_form[i][j]!=ELLIPSE_ELLIPSE) _multiple_forms=true; if (_multiple_forms && host_nlocal>0) return -8; if (_multiple_forms && gpu_nbor!=0) return -9; if (_multiple_forms) ans->force.zero(); // Memory for ilist ordered by particle type if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) return -3; _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); neigh_tex.bind_float(atom->x,4); pos_tex.bind_float(atom->x,4); quat_tex.bind_float(atom->quat,4); lj_pos_tex.bind_float(atom->x,4); lj_quat_tex.bind_float(atom->quat,4); return 0; } template <class numtyp, class acctyp> void 
BaseEllipsoidT::estimate_gpu_overhead() { device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); } template <class numtyp, class acctyp> void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); host_olist.clear(); if (_compiled) { k_nbor_fast.clear(); k_nbor.clear(); k_ellipsoid.clear(); k_ellipsoid_sphere.clear(); k_sphere_ellipsoid.clear(); k_lj_fast.clear(); k_lj.clear(); delete nbor_program; delete ellipsoid_program; delete lj_program; _compiled=false; } time_nbor1.clear(); time_ellipsoid.clear(); time_nbor2.clear(); time_ellipsoid2.clear(); time_nbor3.clear(); time_ellipsoid3.clear(); time_lj.clear(); hd_balancer.clear(); nbor->clear(); ans->clear(); device->clear(); } template <class numtyp, class acctyp> void BaseEllipsoidT::output_times() { // Output any timing information acc_timers(); double single[10], times[10]; single[0]=atom->transfer_time()+ans->transfer_time(); single[1]=nbor->time_nbor.total_seconds()+nbor->time_hybrid1.total_seconds()+ nbor->time_hybrid2.total_seconds(); single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+ time_nbor3.total_seconds()+nbor->time_nbor.total_seconds(); single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+ time_ellipsoid3.total_seconds(); if (_multiple_forms) single[4]=time_lj.total_seconds(); else single[4]=0; single[5]=atom->cast_time()+ans->cast_time(); single[6]=_gpu_overhead; single[7]=_driver_overhead; single[8]=ans->cpu_idle_time(); single[9]=nbor->bin_time(); MPI_Reduce(single,times,10,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=atom->max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); double max_mb=mpi_max_bytes/(1024*1024); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; if (device->replica_me()==0) if (screen && times[5]>0.0) { int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1) { + if (device->procs_per_gpu()==1 && t_time>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } if (nbor->gpu_nbor()==2) fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); - fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); + if (times[6]>0) + fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } _max_bytes=0.0; } // 
--------------------------------------------------------------------------- // Pack neighbors to limit thread divergence for lj-lj and ellipse // --------------------------------------------------------------------------- template<class numtyp, class acctyp> void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high, const bool shared_types, int ntypes) { int stride=nbor->nbor_pitch(); if (shared_types) { k_nbor_fast.set_size(GX,BX); k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, &inum, &nbor->dev_packed, &form_low, &form_high); } else { k_nbor.set_size(GX,BX); k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, &start, &inum, &nbor->dev_packed, &form_low, &form_high); } } // --------------------------------------------------------------------------- // Copy neighbor list from host // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void BaseEllipsoidT::reset_nbors(const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; int mn=nbor->max_nbor_loop(osize,numj,ilist); resize_atom(nall,success); resize_local(inum,0,mn,osize,success); if (!success) return; if (_multiple_forms) { int p=0; for (int i=0; i<osize; i++) { int itype=type[ilist[i]]; if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) { host_olist[p]=ilist[i]; p++; } } _max_last_ellipse=p; _last_ellipse=std::min(inum,_max_last_ellipse); for (int i=0; i<osize; i++) { int itype=type[ilist[i]]; if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) { host_olist[p]=ilist[i]; p++; } } nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); return; } _last_ellipse=inum; _max_last_ellipse=inum; nbor->get_host(inum,ilist,numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } // --------------------------------------------------------------------------- // Build neighbor list on device // --------------------------------------------------------------------------- template <class numtyp, class acctyp> inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success) { success=true; resize_atom(nall,success); resize_local(inum,host_inum,nbor->max_nbors(),0,success); if (!success) return; atom->cast_copy_x(host_x,host_type); int mn; nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); nbor->copy_unpacked(inum,mn); _last_ellipse=inum; _max_last_ellipse=inum; double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. 
// --------------------------------------------------------------------------- template <class numtyp, class acctyp> int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { acc_timers(); if (inum_full==0) { host_start=0; zero_timers(); return NULL; } int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; if (ago==0) { reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh, success); if (!success) return NULL; } int *list; if (_multiple_forms) list=host_olist.begin(); else list=ilist; atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); atom->add_quat_data(); loop(eflag,vflag); ans->copy_answers(eflag,vflag,eatom,vatom,list); device->add_ans_object(ans); hd_balancer.stop_timer(); return list; } // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template <class numtyp, class acctyp> int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { acc_timers(); if (inum_full==0) { host_start=0; zero_timers(); return NULL; } hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } atom->add_quat_data(); *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); loop(eflag,vflag); ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); return nbor->host_jlist.begin()-host_start; } template <class numtyp, class acctyp> double BaseEllipsoidT::host_memory_usage_base() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ 4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>); } template <class numtyp, class acctyp> void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const void *ellipsoid_string, const void *lj_string, const char *kname, const bool e_s) { if (_compiled) return; std::string kns=kname; std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid"; std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere"; std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; std::string flags=device->compile_string(); nbor_program=new UCL_Program(dev); nbor_program->load_string(ellipsoid_nbor,flags.c_str()); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); 
k_nbor.set_function(*nbor_program,"kernel_nbor"); neigh_tex.get_texture(*nbor_program,"pos_tex"); ellipsoid_program=new UCL_Program(dev); ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); k_ellipsoid.set_function(*ellipsoid_program,kname); pos_tex.get_texture(*ellipsoid_program,"pos_tex"); quat_tex.get_texture(*ellipsoid_program,"quat_tex"); lj_program=new UCL_Program(dev); lj_program->load_string(lj_string,flags.c_str()); k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); k_lj.set_function(*lj_program,s_lj.c_str()); if (e_s) k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str()); lj_pos_tex.get_texture(*lj_program,"pos_tex"); lj_quat_tex.get_texture(*lj_program,"quat_tex"); _compiled=true; } template class BaseEllipsoid<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index 1a736bf02..314854760 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** beck_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to beck acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_beck.h" using namespace std; using namespace LAMMPS_AL; static Beck<PRECISION,ACC_PRECISION> BLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int beck_gpu_init(const int ntypes, double **cutsq, double **aa, double **alpha, double **beta, double **AA, double **BB, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BLMF.clear(); gpu_mode=BLMF.device->gpu_mode(); double gpu_split=BLMF.device->particle_split(); int first_gpu=BLMF.device->first_device(); int last_gpu=BLMF.device->last_device(); int world_me=BLMF.device->world_me(); int gpu_rank=BLMF.device->gpu_rank(); int procs_per_gpu=BLMF.device->procs_per_gpu(); BLMF.device->init_message(screen,"beck",first_gpu,last_gpu); bool message=false; if (BLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, 
AA, BB, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BLMF.estimate_gpu_overhead(); return init_ok; } void beck_gpu_clear() { BLMF.clear(); } int ** beck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void beck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { BLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double beck_gpu_bytes() { return BLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index e8ac4eff0..3d8810fca 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -1,132 +1,132 @@ /*************************************************************************** born_coul_long_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to born/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_born_coul_long.h" using namespace std; using namespace LAMMPS_AL; static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { BORNCLMF.clear(); gpu_mode=BORNCLMF.device->gpu_mode(); double gpu_split=BORNCLMF.device->particle_split(); int first_gpu=BORNCLMF.device->first_device(); int last_gpu=BORNCLMF.device->last_device(); int world_me=BORNCLMF.device->world_me(); int gpu_rank=BORNCLMF.device->gpu_rank(); int procs_per_gpu=BORNCLMF.device->procs_per_gpu(); BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu); bool message=false; if (BORNCLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and 
compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BORNCLMF.estimate_gpu_overhead(); return init_ok; } void borncl_gpu_clear() { BORNCLMF.clear(); } int** borncl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void borncl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BORNCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double borncl_gpu_bytes() { return BORNCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index 3e779d099..bffab31d1 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -1,134 +1,134 @@ /*************************************************************************** born_coul_wolf_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to born/coul/wolf acceleration routines. 
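compile_kernels() earlier in this patch loads the device source from an in-memory string and then binds each kernel by name through the UCL_Program / UCL_Kernel wrappers. When the library is built against OpenCL, the underlying steps are roughly the ones in this sketch (error handling omitted, kernel source is a toy placeholder):

  #include <CL/cl.h>
  #include <cstdio>

  static const char *src =
    "__kernel void kernel_nbor(__global int *out) {"
    "  out[get_global_id(0)] = (int)get_global_id(0); }";

  int main() {
    cl_platform_id plat; cl_device_id dev;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);

    // Build a program from a source string, then look up a kernel by name:
    // the same two steps performed by UCL_Program::load_string() and
    // UCL_Kernel::set_function() above.
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, NULL);
    clBuildProgram(prog, 1, &dev, "", NULL, NULL);
    cl_kernel k = clCreateKernel(prog, "kernel_nbor", NULL);

    std::printf("kernel bound: %p\n", (void *)k);
    clReleaseKernel(k); clReleaseProgram(prog); clReleaseContext(ctx);
    return 0;
  }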
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_born_coul_wolf.h" using namespace std; using namespace LAMMPS_AL; static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double alf, const double e_shift, const double f_shift) { BORNCWMF.clear(); gpu_mode=BORNCWMF.device->gpu_mode(); double gpu_split=BORNCWMF.device->particle_split(); int first_gpu=BORNCWMF.device->first_device(); int last_gpu=BORNCWMF.device->last_device(); int world_me=BORNCWMF.device->world_me(); int gpu_rank=BORNCWMF.device->gpu_rank(); int procs_per_gpu=BORNCWMF.device->procs_per_gpu(); BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu); bool message=false; if (BORNCWMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BORNCWMF.estimate_gpu_overhead(); return init_ok; } void borncw_gpu_clear() { BORNCWMF.clear(); } int** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const 
double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void borncw_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BORNCWMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double borncw_gpu_bytes() { return BORNCWMF.host_memory_usage(); } diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 7785353a8..a3fcfad4e 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -1,124 +1,124 @@ /*************************************************************************** born_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to born acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_born.h" using namespace std; using namespace LAMMPS_AL; static Born<PRECISION,ACC_PRECISION> BORNMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BORNMF.clear(); gpu_mode=BORNMF.device->gpu_mode(); double gpu_split=BORNMF.device->particle_split(); int first_gpu=BORNMF.device->first_device(); int last_gpu=BORNMF.device->last_device(); int world_me=BORNMF.device->world_me(); int gpu_rank=BORNMF.device->gpu_rank(); int procs_per_gpu=BORNMF.device->procs_per_gpu(); BORNMF.device->init_message(screen,"born",first_gpu,last_gpu); bool message=false; if (BORNMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + 
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BORNMF.estimate_gpu_overhead(); return init_ok; } void born_gpu_clear() { BORNMF.clear(); } int ** born_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void born_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { BORNMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double born_gpu_bytes() { return BORNMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index ac3e6b891..be5dc718d 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -1,131 +1,131 @@ /*************************************************************************** buck_coul_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck/coul/cut acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck_coul.h" using namespace std; using namespace LAMMPS_AL; static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { BUCKCMF.clear(); gpu_mode=BUCKCMF.device->gpu_mode(); double gpu_split=BUCKCMF.device->particle_split(); int first_gpu=BUCKCMF.device->first_device(); int last_gpu=BUCKCMF.device->last_device(); int world_me=BUCKCMF.device->world_me(); int gpu_rank=BUCKCMF.device->gpu_rank(); int procs_per_gpu=BUCKCMF.device->procs_per_gpu(); BUCKCMF.device->init_message(screen,"buck",first_gpu,last_gpu); bool message=false; if (BUCKCMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKCMF.estimate_gpu_overhead(); return init_ok; } void buckc_gpu_clear() { BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BUCKCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, 
host_q, boxlo, prd); } void buckc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BUCKCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double buckc_gpu_bytes() { return BUCKCMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index d9328a921..9f614ffb1 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -1,130 +1,130 @@ /*************************************************************************** buck_coul_long_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck_coul_long.h" using namespace std; using namespace LAMMPS_AL; static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { BUCKCLMF.clear(); gpu_mode=BUCKCLMF.device->gpu_mode(); double gpu_split=BUCKCLMF.device->particle_split(); int first_gpu=BUCKCLMF.device->first_device(); int last_gpu=BUCKCLMF.device->last_device(); int world_me=BUCKCLMF.device->world_me(); int gpu_rank=BUCKCLMF.device->gpu_rank(); int procs_per_gpu=BUCKCLMF.device->procs_per_gpu(); BUCKCLMF.device->init_message(screen,"buck/coul/long",first_gpu,last_gpu); bool message=false; if (BUCKCLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core 
%d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKCLMF.estimate_gpu_overhead(); return init_ok; } void buckcl_gpu_clear() { BUCKCLMF.clear(); } int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BUCKCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BUCKCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double buckcl_gpu_bytes() { return BUCKCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 9f7f725ae..4b25e7356 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** buck_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck.h" using namespace std; using namespace LAMMPS_AL; static Buck<PRECISION,ACC_PRECISION> BUCKMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BUCKMF.clear(); gpu_mode=BUCKMF.device->gpu_mode(); double gpu_split=BUCKMF.device->particle_split(); int first_gpu=BUCKMF.device->first_device(); int last_gpu=BUCKMF.device->last_device(); int world_me=BUCKMF.device->world_me(); int gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); BUCKMF.device->init_message(screen,"buck",first_gpu,last_gpu); bool message=false; if (BUCKMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKMF.estimate_gpu_overhead(); return init_ok; } void buck_gpu_clear() { BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void buck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { 
BUCKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double buck_gpu_bytes() { return BUCKMF.host_memory_usage(); } diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp index 6d64c3043..0100ac890 100644 --- a/lib/gpu/lal_cg_cmm_ext.cpp +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** cg_cmm.h ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/sdk pair acceleration routines __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_cg_cmm.h" using namespace std; using namespace LAMMPS_AL; static CGCMM<PRECISION,ACC_PRECISION> CMMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { CMMMF.clear(); gpu_mode=CMMMF.device->gpu_mode(); double gpu_split=CMMMF.device->particle_split(); int first_gpu=CMMMF.device->first_device(); int last_gpu=CMMMF.device->last_device(); int world_me=CMMMF.device->world_me(); int gpu_rank=CMMMF.device->gpu_rank(); int procs_per_gpu=CMMMF.device->procs_per_gpu(); CMMMF.device->init_message(screen,"lj/sdk",first_gpu,last_gpu); bool message=false; if (CMMMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CMMMF.estimate_gpu_overhead(); return init_ok; } void cmm_gpu_clear() { CMMMF.clear(); } int** cmm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int 
&host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void cmm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double cmm_gpu_bytes() { return CMMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp index ca7aab70c..9197e75e0 100644 --- a/lib/gpu/lal_cg_cmm_long_ext.cpp +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** cg_cmm_long.h ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/sdk/coul/long acceleration functions __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_cg_cmm_long.h" using namespace std; using namespace LAMMPS_AL; static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { CMMLMF.clear(); gpu_mode=CMMLMF.device->gpu_mode(); double gpu_split=CMMLMF.device->particle_split(); int first_gpu=CMMLMF.device->first_device(); int last_gpu=CMMLMF.device->last_device(); int world_me=CMMLMF.device->world_me(); int gpu_rank=CMMLMF.device->gpu_rank(); int procs_per_gpu=CMMLMF.device->procs_per_gpu(); CMMLMF.device->init_message(screen,"lj/sdk/coul/long",first_gpu,last_gpu); bool message=false; if (CMMLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); CMMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core 
%d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CMMLMF.estimate_gpu_overhead(); return init_ok; } void cmml_gpu_clear() { CMMLMF.clear(); } int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q,boxlo,prd); } void cmml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double cmml_gpu_bytes() { return CMMLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 5d544dc87..8f359e361 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -1,135 +1,135 @@ /*************************************************************************** charmm_long_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to charmm/coul/long acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_charmm_long.h" using namespace std; using namespace LAMMPS_AL; static CHARMMLong<PRECISION,ACC_PRECISION> CRMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald, const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic) { CRMLMF.clear(); gpu_mode=CRMLMF.device->gpu_mode(); double gpu_split=CRMLMF.device->particle_split(); int first_gpu=CRMLMF.device->first_device(); int last_gpu=CRMLMF.device->last_device(); int world_me=CRMLMF.device->world_me(); int gpu_rank=CRMLMF.device->gpu_rank(); int procs_per_gpu=CRMLMF.device->procs_per_gpu(); CRMLMF.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu); bool message=false; if (CRMLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,sigma,mix_arithmetic); CRMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, sigma, mix_arithmetic); CRMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CRMLMF.estimate_gpu_overhead(); return init_ok; } void crml_gpu_clear() { CRMLMF.clear(); } int** crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double 
*host_q, double *boxlo, double *prd) { return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void crml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double crml_gpu_bytes() { return CRMLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index d4e8a2092..50c95c1e2 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -1,127 +1,127 @@ /*************************************************************************** colloid_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to colloid acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_colloid.h" using namespace std; using namespace LAMMPS_AL; static Colloid<PRECISION,ACC_PRECISION> COLLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double **host_a12, double **host_a1, double **host_a2, double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { COLLMF.clear(); gpu_mode=COLLMF.device->gpu_mode(); double gpu_split=COLLMF.device->particle_split(); int first_gpu=COLLMF.device->first_device(); int last_gpu=COLLMF.device->last_device(); int world_me=COLLMF.device->world_me(); int gpu_rank=COLLMF.device->gpu_rank(); int procs_per_gpu=COLLMF.device->procs_per_gpu(); COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu); bool message=false; if (COLLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core 
%d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) COLLMF.estimate_gpu_overhead(); return init_ok; } void colloid_gpu_clear() { COLLMF.clear(); } int ** colloid_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void colloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { COLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double colloid_gpu_bytes() { return COLLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp index 19879cb9c..851df0f53 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -1,125 +1,125 @@ /*************************************************************************** coul_dsf_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to coul/dsf acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : 8/15/2012 email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_coul_dsf.h" using namespace std; using namespace LAMMPS_AL; static CoulDSF<PRECISION,ACC_PRECISION> CDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cdsf_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { CDMF.clear(); gpu_mode=CDMF.device->gpu_mode(); double gpu_split=CDMF.device->particle_split(); int first_gpu=CDMF.device->first_device(); int last_gpu=CDMF.device->last_device(); int world_me=CDMF.device->world_me(); int gpu_rank=CDMF.device->gpu_rank(); int procs_per_gpu=CDMF.device->procs_per_gpu(); CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu); bool message=false; if (CDMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CDMF.estimate_gpu_overhead(); return init_ok; } void cdsf_gpu_clear() { CDMF.clear(); } int** cdsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool 
&success, double *host_q, const int nlocal, double *boxlo, double *prd) { CDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double cdsf_gpu_bytes() { return CDMF.host_memory_usage(); } diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index f6ce0c1d7..156de0be0 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -1,123 +1,123 @@ /*************************************************************************** coul_long_ext.cpp ------------------- Axel Kohlmeyer (Temple) Functions for LAMMPS access to coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : July 2011 email : a.kohlmeyer@temple.edu ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_coul_long.h" using namespace std; using namespace LAMMPS_AL; static CoulLong<PRECISION,ACC_PRECISION> CLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cl_gpu_init(const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { CLMF.clear(); gpu_mode=CLMF.device->gpu_mode(); double gpu_split=CLMF.device->particle_split(); int first_gpu=CLMF.device->first_device(); int last_gpu=CLMF.device->last_device(); int world_me=CLMF.device->world_me(); int gpu_rank=CLMF.device->gpu_rank(); int procs_per_gpu=CLMF.device->procs_per_gpu(); CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu); bool message=false; if (CLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CLMF.estimate_gpu_overhead(); return init_ok; } void cl_gpu_clear() { CLMF.clear(); } int** cl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int 
&host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void cl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double cl_gpu_bytes() { return CLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 99e259213..e3f1037ba 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -1,720 +1,729 @@ /*************************************************************************** device.cpp ------------------- W. Michael Brown (ORNL) Class for management of the device where the computations are performed __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include "lal_device.h" #include "lal_precision.h" #include <map> #include <math.h> #ifdef _OPENMP #include <omp.h> #endif #if defined(USE_OPENCL) #include "device_cl.h" #elif defined(USE_CUDART) const char *device=0; #else #include "device_cubin.h" #endif using namespace LAMMPS_AL; #define DeviceT Device<numtyp, acctyp> template <class numtyp, class acctyp> DeviceT::Device() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), _last_device(0), _compiled(false) { } template <class numtyp, class acctyp> DeviceT::~Device() { clear_device(); } template <class numtyp, class acctyp> int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double p_split, const int nthreads, const int t_per_atom, const double cell_size, char *ocl_vendor) { _nthreads=nthreads; #ifdef _OPENMP omp_set_num_threads(nthreads); #endif _threads_per_atom=t_per_atom; _threads_per_charge=t_per_atom; if (_device_init) return 0; _device_init=true; _comm_world=replica; //world; _comm_replica=replica; _first_device=first_gpu; _last_device=last_gpu; _gpu_mode=gpu_mode; _particle_split=p_split; _cell_size=cell_size; // Get the rank/size within the world MPI_Comm_rank(_comm_world,&_world_me); MPI_Comm_size(_comm_world,&_world_size); // Get the rank/size within the replica MPI_Comm_rank(_comm_replica,&_replica_me); MPI_Comm_size(_comm_replica,&_replica_size); // Get the names of all nodes int name_length; char node_name[MPI_MAX_PROCESSOR_NAME]; char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; MPI_Get_processor_name(node_name,&name_length); MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); // Get the number of procs per node std::map<std::string,int> name_map; std::map<std::string,int>::iterator np; for (int i=0; i<_world_size; i++) { std::string 
i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]); np=name_map.find(i_string); if (np==name_map.end()) name_map[i_string]=1; else np->second++; } int procs_per_node=name_map.begin()->second; // Assign a unique id to each node int split_num=0, split_id=0; for (np=name_map.begin(); np!=name_map.end(); ++np) { if (np->first==node_string) split_id=split_num; split_num++; } // Set up a per node communicator and find rank within MPI_Comm node_comm; MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/ (last_gpu-first_gpu+1))); int my_gpu=node_rank/_procs_per_gpu+first_gpu; // Time on the device only if 1 proc per gpu _time_device=true; if (_procs_per_gpu>1) _time_device=false; // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return -2; #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; gpu->push_command_queue(); gpu->set_command_queue(1); _long_range_precompute=0; if (set_ocl_params(ocl_vendor)!=0) return -11; int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) flag=compile_kernels(); gpu_barrier(); } return flag; } template <class numtyp, class acctyp> int DeviceT::set_ocl_params(char *ocl_vendor) { #ifdef USE_OPENCL std::string s_vendor=OCL_DEFAULT_VENDOR; if (ocl_vendor!=NULL) s_vendor=ocl_vendor; if (s_vendor=="none") s_vendor="generic"; if (s_vendor=="kepler") { _ocl_vendor_name="NVIDIA Kepler"; #if defined (__APPLE__) || defined(MACOSX) _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; #else _ocl_vendor_string="-DKEPLER_OCL"; #endif } else if (s_vendor=="fermi") { _ocl_vendor_name="NVIDIA Fermi"; _ocl_vendor_string="-DFERMI_OCL"; } else if (s_vendor=="cypress") { _ocl_vendor_name="AMD Cypress"; _ocl_vendor_string="-DCYPRESS_OCL"; + } else if (s_vendor=="phi") { + _ocl_vendor_name="Intel Phi"; + _ocl_vendor_string="-DPHI_OCL"; + } else if (s_vendor=="intel") { + _ocl_vendor_name="Intel CPU"; + _ocl_vendor_string="-DINTEL_OCL"; } else if (s_vendor=="generic") { _ocl_vendor_name="GENERIC"; _ocl_vendor_string="-DGENERIC_OCL"; } else { _ocl_vendor_name="CUSTOM"; _ocl_vendor_string="-DUSE_OPENCL"; int token_count=0; std::string params[13]; char *pch = strtok(ocl_vendor,"\" "); while (pch != NULL) { if (token_count==13) return -11; params[token_count]=pch; token_count++; pch = strtok(NULL,"\" "); } _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ " -DTHREADS_PER_ATOM="+params[1]+ " -DTHREADS_PER_CHARGE="+params[2]+ " -DBLOCK_PAIR="+params[3]+ " -DMAX_SHARED_TYPES="+params[4]+ " -DBLOCK_NBOR_BUILD="+params[5]+ " -DBLOCK_BIO_PAIR="+params[6]+ " -DBLOCK_ELLIPSE="+params[7]+ " -DWARP_SIZE="+params[8]+ " -DPPPM_BLOCK_1D="+params[9]+ " -DBLOCK_CELL_2D="+params[10]+ " -DBLOCK_CELL_ID="+params[11]+ " -DMAX_BIO_SHARED_TYPES="+params[12]; } _ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+ std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string; #endif return 0; } template <class numtyp, class acctyp> int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot, const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, const int threads_per_atom) { if (!_device_init) 
return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; // Counts of data transfers for timing overhead estimates _data_in_estimate=0; _data_out_estimate=1; // Initial number of local particles int ef_nlocal=nlocal; if (_particle_split<1.0 && _particle_split>0.0) ef_nlocal=static_cast<int>(_particle_split*nlocal); int gpu_nbor=0; if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH) gpu_nbor=1; else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH) gpu_nbor=2; #ifndef USE_CUDPP if (gpu_nbor==1) gpu_nbor=2; #endif if (_init_count==0) { // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0)) return -3; _data_in_estimate++; if (charge) _data_in_estimate++; if (rot) _data_in_estimate++; } else { if (atom.charge()==false && charge) _data_in_estimate++; if (atom.quaternion()==false && rot) _data_in_estimate++; if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial)) return -3; } if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device, compile_string())) return -3; if (_cell_size<0.0) nbor->cell_size(cell_size,cell_size); else nbor->cell_size(_cell_size,cell_size); _init_count++; return 0; } template <class numtyp, class acctyp> int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; if (_init_count==0) { // Initialize atom and nbor data if (!atom.init(nall,true,false,*gpu,false,false)) return -3; } else if (!atom.add_fields(true,false,false,false)) return -3; if (!ans.init(nlocal,true,false,*gpu)) return -3; _init_count++; return 0; } template <class numtyp, class acctyp> void DeviceT::set_single_precompute (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) { _long_range_precompute=1; pppm_single=pppm; } template <class numtyp, class acctyp> void DeviceT::set_double_precompute (PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm) { _long_range_precompute=2; pppm_double=pppm; } template <class numtyp, class acctyp> void DeviceT::init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu) { #if defined(USE_OPENCL) std::string fs=""; #elif defined(USE_CUDART) std::string fs=""; #else std::string fs=toa(gpu->free_gigabytes())+"/"; #endif if (_replica_me == 0 && screen) { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); - fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); + fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); #ifdef _OPENMP fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); #endif #ifdef USE_OPENCL fprintf(screen,"- with OpenCL Parameters for: %s\n", _ocl_vendor_name.c_str()); #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); int last=last_gpu+1; if (last>gpu->num_devices()) last=gpu->num_devices(); for (int i=first_gpu; i<last; i++) { std::string sname; if (i==first_gpu) - sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; else - sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + 
sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { if (sizeof(ACC_PRECISION)==4) sname+="Single Precision)"; else sname+="Mixed Precision)"; } else sname+="Double Precision)"; - fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); + fprintf(screen,"Device %d: %s\n",i,sname.c_str()); } fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n\n"); } } template <class numtyp, class acctyp> void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL; UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; UCL_Timer over_timer(*gpu); if (_data_in_estimate>0) { host_data_in=new UCL_H_Vec<int>[_data_in_estimate]; dev_data_in=new UCL_D_Vec<int>[_data_in_estimate]; timers_in=new UCL_Timer[_data_in_estimate]; } if (_data_out_estimate>0) { host_data_out=new UCL_H_Vec<int>[_data_out_estimate]; dev_data_out=new UCL_D_Vec<int>[_data_out_estimate]; timers_out=new UCL_Timer[_data_out_estimate]; } if (kernel_calls>0) { kernel_data=new UCL_D_Vec<int>[kernel_calls]; timers_kernel=new UCL_Timer[kernel_calls]; } for (int i=0; i<_data_in_estimate; i++) { host_data_in[i].alloc(1,*gpu); dev_data_in[i].alloc(1,*gpu); timers_in[i].init(*gpu); } for (int i=0; i<_data_out_estimate; i++) { host_data_out[i].alloc(1,*gpu); dev_data_out[i].alloc(1,*gpu); timers_out[i].init(*gpu); } for (int i=0; i<kernel_calls; i++) { kernel_data[i].alloc(1,*gpu); timers_kernel[i].init(*gpu); } gpu_overhead=0.0; gpu_driver_overhead=0.0; for (int i=0; i<10; i++) { gpu->sync(); gpu_barrier(); over_timer.start(); gpu->sync(); gpu_barrier(); double driver_time=MPI_Wtime(); for (int i=0; i<_data_in_estimate; i++) { timers_in[i].start(); ucl_copy(dev_data_in[i],host_data_in[i],true); timers_in[i].stop(); } for (int i=0; i<kernel_calls; i++) { timers_kernel[i].start(); zero(kernel_data[i],1); timers_kernel[i].stop(); } for (int i=0; i<_data_out_estimate; i++) { timers_out[i].start(); ucl_copy(host_data_out[i],dev_data_out[i],true); timers_out[i].stop(); } over_timer.stop(); double time=over_timer.seconds(); driver_time=MPI_Wtime()-driver_time; if (time_device()) { for (int i=0; i<_data_in_estimate; i++) timers_in[i].add_to_total(); for (int i=0; i<kernel_calls; i++) timers_kernel[i].add_to_total(); for (int i=0; i<_data_out_estimate; i++) timers_out[i].add_to_total(); } double mpi_time, mpi_driver_time; MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); gpu_overhead+=mpi_time; gpu_driver_overhead+=mpi_driver_time; } gpu_overhead/=10.0; gpu_driver_overhead/=10.0; if (_data_in_estimate>0) { delete [] host_data_in; delete [] dev_data_in; delete [] timers_in; } if (_data_out_estimate>0) { delete [] host_data_out; delete [] dev_data_out; delete [] timers_out; } if (kernel_calls>0) { delete [] kernel_data; delete [] timers_kernel; } } template <class numtyp, class acctyp> void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans, Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, const double driver_overhead, const int threads_per_atom, FILE *screen) { double single[9], times[9]; single[0]=atom.transfer_time()+ans.transfer_time(); 
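// [Editorial annotation; not part of the original source or of this patch.]
// Descriptive note on the timing slots filled just below: the nine entries of
// single[] are reduced (summed) with MPI_Reduce over the replica communicator
// and printed as per-process averages in the table that follows. From the
// assignments and the fprintf labels further down:
//   single[0] host/device data transfer     single[1] neighbor list copy
//   single[2] neighbor build or unpack       single[3] pair force kernel
//   single[4] data cast/pack                 single[5] device overhead
//   single[6] CPU driver overhead            single[7] CPU idle time
//   single[8] CPU neighbor binning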
single[1]=nbor.time_nbor.total_seconds()+nbor.time_hybrid1.total_seconds()+ nbor.time_hybrid2.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); single[4]=atom.cast_time()+ans.cast_time(); single[5]=gpu_overhead; single[6]=driver_overhead; single[7]=ans.cpu_idle_time(); single[8]=nbor.bin_time(); MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]; if (replica_me()==0) if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device()) { + if (time_device() && t_time>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } if (nbor.gpu_nbor()==2) fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); - fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); + if (times[5]>0) + fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } } template <class numtyp, class acctyp> void DeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer &time_map, UCL_Timer &time_rho, UCL_Timer &time_interp, Answer<numtyp,acctyp> &ans, const double max_bytes, const double cpu_time, const double idle_time, FILE *screen) { double single[8], times[8]; single[0]=time_out.total_seconds(); single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); single[2]=time_map.total_seconds(); single[3]=time_rho.total_seconds(); single[4]=time_interp.total_seconds(); single[5]=ans.transfer_time()+ans.cast_time(); single[6]=cpu_time; single[7]=idle_time; MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; if (replica_me()==0) if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if 
(time_device()) { + if (time_device() && t_time>0) { fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Total rho: %.4f s.\n", (times[0]+times[2]+times[3])/_replica_size); fprintf(screen,"Total interp: %.4f s.\n", (times[1]+times[4])/_replica_size); fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); } fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } } template <class numtyp, class acctyp> void DeviceT::clear() { if (_init_count>0) { _long_range_precompute=0; _init_count--; if (_init_count==0) { atom.clear(); _neighbor_shared.clear(); } } } template <class numtyp, class acctyp> void DeviceT::clear_device() { while (_init_count>0) clear(); if (_compiled) { k_zero.clear(); k_info.clear(); delete dev_program; _compiled=false; } if (_device_init) { delete gpu; _device_init=false; } } template <class numtyp, class acctyp> int DeviceT::compile_kernels() { int flag=0; if (_compiled) return flag; dev_program=new UCL_Program(*gpu); int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) return -4; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; UCL_Vector<int,int> gpu_lib_data(15,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); _ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch())) return -4; #endif _num_mem_threads=gpu_lib_data[1]; _warp_size=gpu_lib_data[2]; if (_threads_per_atom<1) _threads_per_atom=gpu_lib_data[3]; if (_threads_per_charge<1) _threads_per_charge=gpu_lib_data[13]; _pppm_max_spline=gpu_lib_data[4]; _pppm_block=gpu_lib_data[5]; _block_pair=gpu_lib_data[6]; _max_shared_types=gpu_lib_data[7]; _block_cell_2d=gpu_lib_data[8]; _block_cell_id=gpu_lib_data[9]; _block_nbor_build=gpu_lib_data[10]; _block_bio_pair=gpu_lib_data[11]; _max_bio_shared_types=gpu_lib_data[12]; _block_ellipse=gpu_lib_data[14]; if (static_cast<size_t>(_block_pair)>gpu->group_size()) _block_pair=gpu->group_size(); if (static_cast<size_t>(_block_bio_pair)>gpu->group_size()) _block_bio_pair=gpu->group_size(); if (_threads_per_atom>_warp_size) _threads_per_atom=_warp_size; if (_warp_size%_threads_per_atom!=0) _threads_per_atom=1; if (_threads_per_atom & (_threads_per_atom - 1)) _threads_per_atom=1; if (_threads_per_charge>_warp_size) _threads_per_charge=_warp_size; if (_warp_size%_threads_per_charge!=0) _threads_per_charge=1; if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; return flag; } template <class numtyp, class acctyp> double DeviceT::host_memory_usage() const { return atom.host_memory_usage()+4*sizeof(numtyp)+ sizeof(Device<numtyp,acctyp>); } template class Device<PRECISION,ACC_PRECISION>; Device<PRECISION,ACC_PRECISION> global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const 
int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, const int t_per_atom, const double cell_size, char *opencl_vendor) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, particle_split,nthreads,t_per_atom, cell_size,opencl_vendor); } void lmp_clear_device() { global_device.clear_device(); } double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 05d8fd9f7..48db22d99 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** dipole_lj_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to dipole/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_dipole_lj.h" using namespace std; using namespace LAMMPS_AL; static DipoleLJ<PRECISION,ACC_PRECISION> DPLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { DPLMF.clear(); gpu_mode=DPLMF.device->gpu_mode(); double gpu_split=DPLMF.device->particle_split(); int first_gpu=DPLMF.device->first_device(); int last_gpu=DPLMF.device->last_device(); int world_me=DPLMF.device->world_me(); int gpu_rank=DPLMF.device->gpu_rank(); int procs_per_gpu=DPLMF.device->procs_per_gpu(); DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu); bool message=false; if (DPLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 
300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) DPLMF.estimate_gpu_overhead(); return init_ok; } void dpl_gpu_clear() { DPLMF.clear(); } int** dpl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); } void dpl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, nlocal,boxlo,prd); } double dpl_gpu_bytes() { return DPLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 53ef66fca..ad2bf10b0 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** dipole_lj_sf_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to dipole/sf acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_dipole_lj_sf.h" using namespace std; using namespace LAMMPS_AL; static DipoleLJSF<PRECISION,ACC_PRECISION> DPLSFMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { DPLSFMF.clear(); gpu_mode=DPLSFMF.device->gpu_mode(); double gpu_split=DPLSFMF.device->particle_split(); int first_gpu=DPLSFMF.device->first_device(); int last_gpu=DPLSFMF.device->last_device(); int world_me=DPLSFMF.device->world_me(); int gpu_rank=DPLSFMF.device->gpu_rank(); int procs_per_gpu=DPLSFMF.device->procs_per_gpu(); DPLSFMF.device->init_message(screen,"dipole/sf",first_gpu,last_gpu); bool message=false; if (DPLSFMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) DPLSFMF.estimate_gpu_overhead(); return init_ok; } void dplsf_gpu_clear() { DPLSFMF.clear(); } int** dplsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); } void 
dplsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { DPLSFMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, nlocal,boxlo,prd); } double dplsf_gpu_bytes() { return DPLSFMF.host_memory_usage(); } diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 687a04529..e603f9cff 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -1,143 +1,143 @@ /*************************************************************************** eam_ext.cpp ------------------- Trung Dac Nguyen, W. Michael Brown (ORNL) Functions for LAMMPS access to buck acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_eam.h" using namespace std; using namespace LAMMPS_AL; static EAM<PRECISION,ACC_PRECISION> EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); double gpu_split=EAMMF.device->particle_split(); int first_gpu=EAMMF.device->first_device(); int last_gpu=EAMMF.device->last_device(); int world_me=EAMMF.device->world_me(); int gpu_rank=EAMMF.device->gpu_rank(); int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; fp_size=sizeof(PRECISION); EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; if (EAMMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing 
Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) EAMMF.estimate_gpu_overhead(); return init_ok; } void eam_gpu_clear() { EAMMF.clear(); } int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); } void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { EAMMF.compute(ago,inum_full,nlocal,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, fp_ptr); } void eam_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { EAMMF.compute2(ilist, eflag, vflag, eatom, vatom); } double eam_gpu_bytes() { return EAMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_eam_lj_ext.cpp b/lib/gpu/lal_eam_lj_ext.cpp index ea86a7be9..2719b5106 100644 --- a/lib/gpu/lal_eam_lj_ext.cpp +++ b/lib/gpu/lal_eam_lj_ext.cpp @@ -1,153 +1,153 @@ // ************************************************************************** // lal_eam_ext.cpp // ------------------- // W. 
Michael Brown, Trung Dac Nguyen (ORNL) // // Class for acceleration of the eam pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_eam_lj.h" using namespace std; using namespace LAMMPS_AL; static EAMLJ<PRECISION,ACC_PRECISION> EAMLJMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int eam_lj_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double **host_cutljsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMLJMF.clear(); gpu_mode=EAMLJMF.device->gpu_mode(); double gpu_split=EAMLJMF.device->particle_split(); int first_gpu=EAMLJMF.device->first_device(); int last_gpu=EAMLJMF.device->last_device(); int world_me=EAMLJMF.device->world_me(); int gpu_rank=EAMLJMF.device->gpu_rank(); int procs_per_gpu=EAMLJMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; fp_size=sizeof(PRECISION); EAMLJMF.device->init_message(screen,"eam_lj",first_gpu,last_gpu); bool message=false; if (EAMLJMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=EAMLJMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, host_cutljsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMLJMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=EAMLJMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, host_cutljsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMLJMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) EAMLJMF.estimate_gpu_overhead(); return init_ok; } void eam_lj_gpu_clear() { 
EAMLJMF.clear(); } int ** eam_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { return EAMLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); } void eam_lj_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { EAMLJMF.compute(ago,inum_full,nlocal,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, fp_ptr); } void eam_lj_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { EAMLJMF.compute2(ilist, eflag, vflag, eatom, vatom); } double eam_lj_gpu_bytes() { return EAMLJMF.host_memory_usage(); } diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 4d95c7cfb..8adb96855 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** gauss_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to gauss acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_gauss.h" using namespace std; using namespace LAMMPS_AL; static Gauss<PRECISION,ACC_PRECISION> GLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, double **host_b, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GLMF.clear(); gpu_mode=GLMF.device->gpu_mode(); double gpu_split=GLMF.device->particle_split(); int first_gpu=GLMF.device->first_device(); int last_gpu=GLMF.device->last_device(); int world_me=GLMF.device->world_me(); int gpu_rank=GLMF.device->gpu_rank(); int procs_per_gpu=GLMF.device->procs_per_gpu(); GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu); bool message=false; if (GLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); GLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - 
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) GLMF.estimate_gpu_overhead(); return init_ok; } void gauss_gpu_clear() { GLMF.clear(); } int ** gauss_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void gauss_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { GLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double gauss_gpu_bytes() { return GLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp index 047250363..a18f4bcd7 100644 --- a/lib/gpu/lal_gayberne_ext.cpp +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -1,141 +1,141 @@ /*************************************************************************** gayberne_ext.cpp ------------------- W. 
Michael Brown LAMMPS Wrappers for Gay-Berne Acceleration __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_gayberne.h" using namespace std; using namespace LAMMPS_AL; static GayBerne<PRECISION,ACC_PRECISION> GBMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, const double mu, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); gpu_mode=GBMF.device->gpu_mode(); double gpu_split=GBMF.device->particle_split(); int first_gpu=GBMF.device->first_device(); int last_gpu=GBMF.device->last_device(); int world_me=GBMF.device->world_me(); int gpu_rank=GBMF.device->gpu_rank(); int procs_per_gpu=GBMF.device->procs_per_gpu(); GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu); bool message=false; if (GBMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, host_lshape, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, host_lshape, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) GBMF.estimate_gpu_overhead(); return init_ok; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void gb_gpu_clear() { GBMF.clear(); } int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, 
double **host_quat); int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } int * gb_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Return memory usage // --------------------------------------------------------------------------- double gb_gpu_bytes() { return GBMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index 4fe188057..ce3a5a87b 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** lj96_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj96/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj96.h" using namespace std; using namespace LAMMPS_AL; static LJ96<PRECISION,ACC_PRECISION> LJ96MF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJ96MF.clear(); gpu_mode=LJ96MF.device->gpu_mode(); double gpu_split=LJ96MF.device->particle_split(); int first_gpu=LJ96MF.device->first_device(); int last_gpu=LJ96MF.device->last_device(); int world_me=LJ96MF.device->world_me(); int gpu_rank=LJ96MF.device->gpu_rank(); int procs_per_gpu=LJ96MF.device->procs_per_gpu(); LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu); bool message=false; if (LJ96MF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int 
i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJ96MF.estimate_gpu_overhead(); return init_ok; } void lj96_gpu_clear() { LJ96MF.clear(); } int** lj96_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void lj96_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lj96_gpu_bytes() { return LJ96MF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index 7e637d4c9..6152dc27c 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** lj_class2_long_ext.cpp ------------------- W. 
Michael Brown LAMMPS Wrappers for COMMPASS LJ long Acceleration __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : Mon May 16 2011 email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_class2_long.h" using namespace std; using namespace LAMMPS_AL; static LJClass2Long<PRECISION,ACC_PRECISION> C2CLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { C2CLMF.clear(); gpu_mode=C2CLMF.device->gpu_mode(); double gpu_split=C2CLMF.device->particle_split(); int first_gpu=C2CLMF.device->first_device(); int last_gpu=C2CLMF.device->last_device(); int world_me=C2CLMF.device->world_me(); int gpu_rank=C2CLMF.device->gpu_rank(); int procs_per_gpu=C2CLMF.device->procs_per_gpu(); C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu); bool message=false; if (C2CLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); C2CLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); C2CLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) C2CLMF.estimate_gpu_overhead(); return init_ok; } void c2cl_gpu_clear() { C2CLMF.clear(); } int** c2cl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, 
eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void c2cl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double c2cl_gpu_bytes() { return C2CLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 3e5aab4d9..3658d8a5b 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** lj_coul_debye_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to lj/cut/coul/debye acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul_debye.h" using namespace std; using namespace LAMMPS_AL; static LJCoulDebye<PRECISION,ACC_PRECISION> LJCDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double kappa) { LJCDMF.clear(); gpu_mode=LJCDMF.device->gpu_mode(); double gpu_split=LJCDMF.device->particle_split(); int first_gpu=LJCDMF.device->first_device(); int last_gpu=LJCDMF.device->last_device(); int world_me=LJCDMF.device->world_me(); int gpu_rank=LJCDMF.device->gpu_rank(); int procs_per_gpu=LJCDMF.device->procs_per_gpu(); LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu); bool message=false; if (LJCDMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); LJCDMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core 
%d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); LJCDMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCDMF.estimate_gpu_overhead(); return init_ok; } void ljcd_gpu_clear() { LJCDMF.clear(); } int** ljcd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljcd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double ljcd_gpu_bytes() { return LJCDMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index b0dec6f07..1faf52899 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** lj_coul_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul.h" using namespace std; using namespace LAMMPS_AL; static LJCoul<PRECISION,ACC_PRECISION> LJCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { LJCMF.clear(); gpu_mode=LJCMF.device->gpu_mode(); double gpu_split=LJCMF.device->particle_split(); int first_gpu=LJCMF.device->first_device(); int last_gpu=LJCMF.device->last_device(); int world_me=LJCMF.device->world_me(); int gpu_rank=LJCMF.device->gpu_rank(); int procs_per_gpu=LJCMF.device->procs_per_gpu(); LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); bool message=false; if (LJCMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCMF.estimate_gpu_overhead(); return init_ok; } void ljc_gpu_clear() { LJCMF.clear(); } int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljc_gpu_compute(const int ago, const int inum_full, const 
int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double ljc_gpu_bytes() { return LJCMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index f0724a8a9..a7a485b4c 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** lj_coul_long_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul_long.h" using namespace std; using namespace LAMMPS_AL; static LJCoulLong<PRECISION,ACC_PRECISION> LJCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { LJCLMF.clear(); gpu_mode=LJCLMF.device->gpu_mode(); double gpu_split=LJCLMF.device->particle_split(); int first_gpu=LJCLMF.device->first_device(); int last_gpu=LJCLMF.device->last_device(); int world_me=LJCLMF.device->world_me(); int gpu_rank=LJCLMF.device->gpu_rank(); int procs_per_gpu=LJCLMF.device->procs_per_gpu(); LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu); bool message=false; if (LJCLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, 
special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCLMF.estimate_gpu_overhead(); return init_ok; } void ljcl_gpu_clear() { LJCLMF.clear(); } int** ljcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double ljcl_gpu_bytes() { return LJCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index 3480b14f3..af4c93bb8 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -1,131 +1,131 @@ /*************************************************************************** lj_coul_msm_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul/msm acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul_msm.h" using namespace std; using namespace LAMMPS_AL; static LJCoulMSM<PRECISION,ACC_PRECISION> LJCMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_gcons, double **host_dgcons, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const int order, const double qqrd2e) { LJCMLMF.clear(); gpu_mode=LJCMLMF.device->gpu_mode(); double gpu_split=LJCMLMF.device->particle_split(); int first_gpu=LJCMLMF.device->first_device(); int last_gpu=LJCMLMF.device->last_device(); int world_me=LJCMLMF.device->world_me(); int gpu_rank=LJCMLMF.device->gpu_rank(); int procs_per_gpu=LJCMLMF.device->procs_per_gpu(); LJCMLMF.device->init_message(screen,"lj/cut/coul/msm",first_gpu,last_gpu); bool message=false; if (LJCMLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); LJCMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); LJCMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCMLMF.estimate_gpu_overhead(); return init_ok; } void ljcm_gpu_clear() { LJCMLMF.clear(); } int** ljcm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, 
special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljcm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double ljcm_gpu_bytes() { return LJCMLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 52b56f38c..5f1f17ee5 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -1,132 +1,132 @@ /*************************************************************************** lj_dsf_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : 7/12/2012 email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_dsf.h" using namespace std; using namespace LAMMPS_AL; static LJDSF<PRECISION,ACC_PRECISION> LJDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { LJDMF.clear(); gpu_mode=LJDMF.device->gpu_mode(); double gpu_split=LJDMF.device->particle_split(); int first_gpu=LJDMF.device->first_device(); int last_gpu=LJDMF.device->last_device(); int world_me=LJDMF.device->world_me(); int gpu_rank=LJDMF.device->gpu_rank(); int procs_per_gpu=LJDMF.device->procs_per_gpu(); LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu); bool message=false; if (LJDMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); LJDMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing 
Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); LJDMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJDMF.estimate_gpu_overhead(); return init_ok; } void ljd_gpu_clear() { LJDMF.clear(); } int** ljd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double ljd_gpu_bytes() { return LJDMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 54bb3f62f..c6939cdf9 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** lj_expand_ext.cpp ------------------- Inderaj Bains (NVIDIA) Functions for LAMMPS access to lj/expand acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : ibains@nvidia.com ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_expand.h" using namespace std; using namespace LAMMPS_AL; static LJExpand<PRECISION,ACC_PRECISION> LJEMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double **shift, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJEMF.clear(); gpu_mode=LJEMF.device->gpu_mode(); double gpu_split=LJEMF.device->particle_split(); int first_gpu=LJEMF.device->first_device(); int last_gpu=LJEMF.device->last_device(); int world_me=LJEMF.device->world_me(); int gpu_rank=LJEMF.device->gpu_rank(); int procs_per_gpu=LJEMF.device->procs_per_gpu(); LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu); bool message=false; if (LJEMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJEMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJEMF.estimate_gpu_overhead(); return init_ok; } void lje_gpu_clear() { LJEMF.clear(); } int** lje_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void lje_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { 
LJEMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lje_gpu_bytes() { return LJEMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp index 1dc47ccbb..650eadbfc 100644 --- a/lib/gpu/lal_lj_ext.cpp +++ b/lib/gpu/lal_lj_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** lj_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj.h" using namespace std; using namespace LAMMPS_AL; static LJ<PRECISION,ACC_PRECISION> LJLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJLMF.clear(); gpu_mode=LJLMF.device->gpu_mode(); double gpu_split=LJLMF.device->particle_split(); int first_gpu=LJLMF.device->first_device(); int last_gpu=LJLMF.device->last_device(); int world_me=LJLMF.device->world_me(); int gpu_rank=LJLMF.device->gpu_rank(); int procs_per_gpu=LJLMF.device->procs_per_gpu(); LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu); bool message=false; if (LJLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJLMF.estimate_gpu_overhead(); return init_ok; } void ljl_gpu_clear() { LJLMF.clear(); } int ** ljl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double 
cpu_time, bool &success) { return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void ljl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double ljl_gpu_bytes() { return LJLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index 52668cf52..d3f17a92b 100644 --- a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -1,124 +1,124 @@ /*************************************************************************** mie_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to mie acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_mie.h" using namespace std; using namespace LAMMPS_AL; static Mie<PRECISION,ACC_PRECISION> MLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MLMF.clear(); gpu_mode=MLMF.device->gpu_mode(); double gpu_split=MLMF.device->particle_split(); int first_gpu=MLMF.device->first_device(); int last_gpu=MLMF.device->last_device(); int world_me=MLMF.device->world_me(); int gpu_rank=MLMF.device->gpu_rank(); int procs_per_gpu=MLMF.device->procs_per_gpu(); MLMF.device->init_message(screen,"mie",first_gpu,last_gpu); bool message=false; if (MLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3, host_mie4, host_gamA, host_gamR, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); MLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3, host_mie4, host_gamA, host_gamR, offset, special_lj, inum, nall, 300, 
maxspecial, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) MLMF.estimate_gpu_overhead(); return init_ok; } void mie_gpu_clear() { MLMF.clear(); } int ** mie_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void mie_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { MLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double mie_gpu_bytes() { return MLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 787f49b39..5277ba43f 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** morse.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to morse acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_morse.h" using namespace std; using namespace LAMMPS_AL; static Morse<PRECISION,ACC_PRECISION> MORMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mor_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MORMF.clear(); gpu_mode=MORMF.device->gpu_mode(); double gpu_split=MORMF.device->particle_split(); int first_gpu=MORMF.device->first_device(); int last_gpu=MORMF.device->last_device(); int world_me=MORMF.device->world_me(); int gpu_rank=MORMF.device->gpu_rank(); int procs_per_gpu=MORMF.device->procs_per_gpu(); MORMF.device->init_message(screen,"morse",first_gpu,last_gpu); bool message=false; if (MORMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); MORMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if 
(last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); MORMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) MORMF.estimate_gpu_overhead(); return init_ok; } void mor_gpu_clear() { MORMF.clear(); } int** mor_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void mor_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { MORMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double mor_gpu_bytes() { return MORMF.host_memory_usage(); } diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index 08f2c94e9..6e5a82af5 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -1,178 +1,178 @@ /*************************************************************************** pppm_ext.cpp ------------------- W. 
Michael Brown (ORNL) Functions for LAMMPS access to PPPM acceleration routines __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_pppm.h" using namespace std; using namespace LAMMPS_AL; static PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF; static PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- template <class grdtyp, class memtyp> grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, grdtyp **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, int &success) { pppm.clear(0.0); int first_gpu=pppm.device->first_device(); int last_gpu=pppm.device->last_device(); int world_me=pppm.device->world_me(); int gpu_rank=pppm.device->gpu_rank(); int procs_per_gpu=pppm.device->procs_per_gpu(); pppm.device->init_message(screen,"pppm",first_gpu,last_gpu); bool message=false; if (pppm.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } success=0; grdtyp * host_brick=NULL; if (world_me==0) host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out, nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); pppm.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, split,success); pppm.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); return host_brick; } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, float **rho_coeff, float **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, const bool respa, int &success) { float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); if (split==false && respa==false) PPPMF.device->set_single_precompute(&PPPMF); return b; } void pppm_gpu_clear_f(const 
double cpu_time) { PPPMF.clear(cpu_time); } int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } void pppm_gpu_interp_f(const float qqrd2e_scale) { PPPMF.interp(qqrd2e_scale); } double pppm_gpu_bytes_f() { return PPPMF.host_memory_usage(); } void pppm_gpu_forces_f(double **f) { double etmp; PPPMF.atom->data_unavail(); PPPMF.ans->get_answers(f,NULL,NULL,NULL,NULL,etmp); } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, split,success); if (split==false && respa==false) PPPMD.device->set_double_precompute(&PPPMD); return b; } void pppm_gpu_clear_d(const double cpu_time) { PPPMD.clear(cpu_time); } int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } void pppm_gpu_interp_d(const double qqrd2e_scale) { PPPMD.interp(qqrd2e_scale); } double pppm_gpu_bytes_d() { return PPPMD.host_memory_usage(); } void pppm_gpu_forces_d(double **f) { double etmp; PPPMD.atom->data_unavail(); PPPMD.ans->get_answers(f,NULL,NULL,NULL,NULL,etmp); } diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 05e3e1d57..226483b94 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -1,477 +1,527 @@ // ************************************************************************** // preprocessor.cu // ------------------- // W. Michael Brown (ORNL) // // Device code for CUDA-specific preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions // // Note: It is assumed that constants with the same names are defined with // the same values in all files. 
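//
// As a simplified illustration of the thread-geometry constants documented
// below (variable names here are hypothetical; the actual kernels differ in
// detail), a pair kernel assigns THREADS_PER_ATOM consecutive threads to each
// atom, strides the neighbor loop across that group, and then reduces the
// partial results:
//
//   int tid    = THREAD_ID_X;                      // thread index within the block
//   int offset = tid & (THREADS_PER_ATOM-1);       // lane within this atom's group
//   for (int jj = offset; jj < numj; jj += THREADS_PER_ATOM) {
//     // accumulate partial force/energy for neighbor jj
//   }
//   // reduce the THREADS_PER_ATOM partial sums (e.g. with shared memory or
//   // shfl_xor) before storing the per-atom result
//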
// // ARCH // Definition: Architecture number for accelerator // MEM_THREADS // Definition: Number of threads with sequential ids accessing memory // simultaneously on multiprocessor // WARP_SIZE: // Definition: Number of threads guaranteed to be on the same instruction // THREADS_PER_ATOM // Definition: Default number of threads assigned per atom for pair styles // Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // THREADS_PER_CHARGE // Definition: Default number of threads assigned per atom for pair styles // with charge // Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM // PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params that can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR // BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose // BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds // BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params that can be stored in shared memory // Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 // //*************************************************************************/ // ------------------------------------------------------------------------- // CUDA DEFINITIONS // ------------------------------------------------------------------------- #ifdef NV_KERNEL #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x #define BLOCK_ID_Y blockIdx.y #define BLOCK_SIZE_X blockDim.x #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ #define __global #define restrict __restrict__ #define atom_add atomicAdd #define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ #else #define ARCH 100 #endif #if (ARCH < 200) #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 16 #define BLOCK_NBOR_BUILD 64 #define BLOCK_PAIR 64 #define BLOCK_BIO_PAIR 64 #define MAX_SHARED_TYPES 8 #else #if (ARCH < 300) #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 128 #define BLOCK_BIO_PAIR 128 #define MAX_SHARED_TYPES 8 #else #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 256 #define BLOCK_BIO_PAIR 256 #define BLOCK_ELLIPSE 128 #define MAX_SHARED_TYPES 11 #ifdef _SINGLE_SINGLE #define shfl_xor __shfl_xor #else ucl_inline double shfl_xor(double var, int laneMask, int width) { int2 tmp; tmp.x = __double2hiint(var); tmp.y = __double2loint(var); tmp.x = __shfl_xor(tmp.x,laneMask,width); tmp.y = __shfl_xor(tmp.y,laneMask,width); return __hiloint2double(tmp.x,tmp.y); } #endif #endif #endif #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define 
MAX_BIO_SHARED_TYPES 128 #ifdef _DOUBLE_DOUBLE #define fetch4(ans,i,pos_tex) { \ int4 xy = tex1Dfetch(pos_tex,i*2); \ int4 zt = tex1Dfetch(pos_tex,i*2+1); \ ans.x=__hiloint2double(xy.y, xy.x); \ ans.y=__hiloint2double(xy.w, xy.z); \ ans.z=__hiloint2double(zt.y, zt.x); \ ans.w=__hiloint2double(zt.w, zt.z); \ } #define fetch(ans,i,q_tex) { \ int2 qt = tex1Dfetch(q_tex,i); \ ans=__hiloint2double(qt.y, qt.x); \ } #else #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); #endif #if (__CUDA_ARCH__ < 200) #define fast_mul __mul24 #define MEM_THREADS 16 #else #define fast_mul(X,Y) (X)*(Y) #define MEM_THREADS 32 #endif #ifdef CUDA_PRE_THREE struct __builtin_align__(16) _double4 { double x, y, z, w; }; typedef struct _double4 double4; #endif #ifdef _DOUBLE_DOUBLE #define ucl_exp exp #define ucl_powr pow #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #else #define ucl_atan atanf #define ucl_cbrt cbrtf #define ucl_ceil ceilf #define ucl_abs fabsf #define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrtf #define ucl_sqrt sqrtf #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp expf #define ucl_powr powf #else #define ucl_exp __expf #define ucl_powr __powf #endif #endif #endif // ------------------------------------------------------------------------- // NVIDIA GENERIC OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef NV_GENERIC_OCL #define USE_OPENCL #define fast_mul mul24 #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // NVIDIA FERMI OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef FERMI_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 128 #define MAX_SHARED_TYPES 11 #define BLOCK_NBOR_BUILD 128 #define BLOCK_BIO_PAIR 128 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // NVIDIA KEPLER OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef KEPLER_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 256 #define MAX_SHARED_TYPES 11 #define BLOCK_NBOR_BUILD 128 #define BLOCK_BIO_PAIR 256 #define BLOCK_ELLIPSE 128 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #ifndef NO_OCL_PTX #define ARCH 300 #ifdef _SINGLE_SINGLE inline float shfl_xor(float var, int laneMask, int width) { float ret; int c; c = ((WARP_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); return ret; } #else #pragma OPENCL EXTENSION cl_khr_fp64 : enable inline double shfl_xor(double var, int laneMask, int width) { int c = ((WARP_SIZE-width) << 8) | 0x1f; int x,y,x2,y2; double ans; asm volatile ("mov.b64 {%0, %1}, 
%2;" : "=r"(y), "=r"(x) : "d"(var)); asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c)); asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c)); asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); return ans; } #endif #endif #endif // ------------------------------------------------------------------------- // AMD CYPRESS OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef CYPRESS_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 128 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 64 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif +// ------------------------------------------------------------------------- +// INTEL CPU OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef INTEL_OCL + +#define USE_OPENCL +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 1 +#define MAX_SHARED_TYPES 0 +#define BLOCK_NBOR_BUILD 4 +#define BLOCK_BIO_PAIR 2 +#define BLOCK_ELLIPSE 2 + +#define WARP_SIZE 1 +#define PPPM_BLOCK_1D 32 +#define BLOCK_CELL_2D 1 +#define BLOCK_CELL_ID 2 +#define MAX_BIO_SHARED_TYPES 0 + +#endif + +// ------------------------------------------------------------------------- +// INTEL PHI OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef PHI_OCL + +#define USE_OPENCL +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 16 +#define MAX_SHARED_TYPES 0 +#define BLOCK_NBOR_BUILD 16 +#define BLOCK_BIO_PAIR 16 +#define BLOCK_ELLIPSE 16 + +#define WARP_SIZE 1 +#define PPPM_BLOCK_1D 32 +#define BLOCK_CELL_2D 4 +#define BLOCK_CELL_ID 16 +#define MAX_BIO_SHARED_TYPES 0 + +#endif + // ------------------------------------------------------------------------- // GENERIC OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef GENERIC_OCL #define USE_OPENCL #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 1 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // OPENCL Stuff for All Hardware // ------------------------------------------------------------------------- #ifdef USE_OPENCL #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 #ifndef cl_amd_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif #endif #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifndef fast_mul #define fast_mul(X,Y) (X)*(Y) #endif #ifndef ARCH #define ARCH 0 #endif #ifndef DRIVER #define DRIVER 0 #endif #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define GLOBAL_SIZE_X get_global_size(0) #define THREAD_ID_Y get_local_id(1) #define BLOCK_ID_Y get_group_id(1) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define ucl_inline inline #define 
fetch4(ans,i,x) ans=x[i] #define fetch(ans,i,q) ans=q[i] #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #ifdef _DOUBLE_DOUBLE #define NO_HARDWARE_TRANSCENDENTALS #endif #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp exp #define ucl_powr powr #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #else #define ucl_exp native_exp #define ucl_powr native_powr #define ucl_rsqrt native_rsqrt #define ucl_sqrt native_sqrt #define ucl_recip native_recip #endif #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- +#ifndef PPPM_MAX_SPLINE #define PPPM_MAX_SPLINE 8 +#endif #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double #define acctyp4 double4 #endif #ifdef _SINGLE_DOUBLE #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp double #define acctyp4 double4 #endif #ifndef numtyp #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp float #define acctyp4 float4 #endif #define EWALD_F (numtyp)1.12837917 #define EWALD_P (numtyp)0.3275911 #define A1 (numtyp)0.254829592 #define A2 (numtyp)-0.284496736 #define A3 (numtyp)1.421413741 #define A4 (numtyp)-1.453152027 #define A5 (numtyp)1.061405429 #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; #ifndef BLOCK_ELLIPSE #define BLOCK_ELLIPSE BLOCK_PAIR #endif diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp index 09e4c15c4..18bb1f681 100644 --- a/lib/gpu/lal_re_squared_ext.cpp +++ b/lib/gpu/lal_re_squared_ext.cpp @@ -1,138 +1,138 @@ /*************************************************************************** re_squared_ext.cpp ------------------- W. 
Michael Brown LAMMPS Wrappers for RE-Squared Acceleration __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_re_squared.h" using namespace std; using namespace LAMMPS_AL; static RESquared<PRECISION,ACC_PRECISION> REMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { REMF.clear(); gpu_mode=REMF.device->gpu_mode(); double gpu_split=REMF.device->particle_split(); int first_gpu=REMF.device->first_device(); int last_gpu=REMF.device->last_device(); int world_me=REMF.device->world_me(); int gpu_rank=REMF.device->gpu_rank(); int procs_per_gpu=REMF.device->procs_per_gpu(); REMF.device->init_message(screen,"resquared",first_gpu,last_gpu); bool message=false; if (REMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); REMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); REMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) REMF.estimate_gpu_overhead(); return init_ok; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void re_gpu_clear() { REMF.clear(); } int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double 
*sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } int * re_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Return memory usage // --------------------------------------------------------------------------- double re_gpu_bytes() { return REMF.host_memory_usage(); } diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 3efeece42..effae9639 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** soft_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to soft acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_soft.h" using namespace std; using namespace LAMMPS_AL; static Soft<PRECISION,ACC_PRECISION> SLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, double **host_cut, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { SLMF.clear(); gpu_mode=SLMF.device->gpu_mode(); double gpu_split=SLMF.device->particle_split(); int first_gpu=SLMF.device->first_device(); int last_gpu=SLMF.device->last_device(); int world_me=SLMF.device->world_me(); int gpu_rank=SLMF.device->gpu_rank(); int procs_per_gpu=SLMF.device->procs_per_gpu(); SLMF.device->init_message(screen,"soft",first_gpu,last_gpu); bool message=false; if (SLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); SLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - 
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) SLMF.estimate_gpu_overhead(); return init_ok; } void soft_gpu_clear() { SLMF.clear(); } int ** soft_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void soft_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double soft_gpu_bytes() { return SLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index 33abf073d..e23570e02 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** sw_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to sw acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : Tue March 26, 2013 email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_sw.h" using namespace std; using namespace LAMMPS_AL; static SW<PRECISION,ACC_PRECISION> SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sw_gpu_init(const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, const double sw_epsilon, const double sw_sigma, const double sw_lambda, const double sw_gamma, const double sw_costheta, const double sw_biga, const double sw_bigb, const double sw_powerp, const double sw_powerq, const double sw_cut, const double sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); double gpu_split=SWMF.device->particle_split(); int first_gpu=SWMF.device->first_device(); int last_gpu=SWMF.device->last_device(); int world_me=SWMF.device->world_me(); int gpu_rank=SWMF.device->gpu_rank(); int procs_per_gpu=SWMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu); bool message=false; if (SWMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) SWMF.estimate_gpu_overhead(); return init_ok; } void sw_gpu_clear() { SWMF.clear(); } int ** sw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void sw_gpu_compute(const int ago, const int nlocal, const int nall, const int nlist, double **host_x, int *host_type, int *ilist, int *numj, int 
**firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double sw_gpu_bytes() { return SWMF.host_memory_usage(); } diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index 2186e33de..55fe9318b 100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** lal_table.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to table acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_table.h" using namespace std; using namespace LAMMPS_AL; static Table<PRECISION,ACC_PRECISION> TBMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, double **table_data, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength) { TBMF.clear(); gpu_mode=TBMF.device->gpu_mode(); double gpu_split=TBMF.device->particle_split(); int first_gpu=TBMF.device->first_device(); int last_gpu=TBMF.device->last_device(); int world_me=TBMF.device->world_me(); int gpu_rank=TBMF.device->gpu_rank(); int procs_per_gpu=TBMF.device->procs_per_gpu(); TBMF.device->init_message(screen,"table",first_gpu,last_gpu); bool message=false; if (TBMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) TBMF.estimate_gpu_overhead(); return init_ok; } void table_gpu_clear() { TBMF.clear(); } int ** table_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int 
**nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return TBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void table_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { TBMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double table_gpu_bytes() { return TBMF.host_memory_usage(); } diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index c43166c33..a1bd0ee1f 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -1,123 +1,123 @@ /*************************************************************************** yukawa_colloid_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to colloid acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_yukawa_colloid.h" using namespace std; using namespace LAMMPS_AL; static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, double **host_offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, const double kappa) { YKCOLLMF.clear(); gpu_mode=YKCOLLMF.device->gpu_mode(); double gpu_split=YKCOLLMF.device->particle_split(); int first_gpu=YKCOLLMF.device->first_device(); int last_gpu=YKCOLLMF.device->last_device(); int world_me=YKCOLLMF.device->world_me(); int gpu_rank=YKCOLLMF.device->gpu_rank(); int procs_per_gpu=YKCOLLMF.device->procs_per_gpu(); YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu); bool message=false; if (YKCOLLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); 
fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) YKCOLLMF.estimate_gpu_overhead(); return init_ok; } void ykcolloid_gpu_clear() { YKCOLLMF.clear(); } int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_rad) { return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_rad); } void ykcolloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_rad) { YKCOLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time, success,host_rad); } double ykcolloid_gpu_bytes() { return YKCOLLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 36f390ab9..5737bc102 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** yukawa_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to yukawa acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_yukawa.h" using namespace std; using namespace LAMMPS_AL; static Yukawa<PRECISION,ACC_PRECISION> YKMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, double **host_a, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { YKMF.clear(); gpu_mode=YKMF.device->gpu_mode(); double gpu_split=YKMF.device->particle_split(); int first_gpu=YKMF.device->first_device(); int last_gpu=YKMF.device->last_device(); int world_me=YKMF.device->world_me(); int gpu_rank=YKMF.device->gpu_rank(); int procs_per_gpu=YKMF.device->procs_per_gpu(); YKMF.device->init_message(screen,"yukawa",first_gpu,last_gpu); bool message=false; if (YKMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) YKMF.estimate_gpu_overhead(); return init_ok; } void yukawa_gpu_clear() { YKMF.clear(); } int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { YKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double yukawa_gpu_bytes() { return 
YKMF.host_memory_usage(); }
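
The _DOUBLE_DOUBLE variants of the fetch() and fetch4() macros in the preprocessor hunk above rebuild each double-precision value from two 32-bit halves returned by an integer texture fetch. A minimal host-side C sketch of that same reassembly, using a hypothetical helper host_hiloint2double() in place of CUDA's __hiloint2double() (the helper and the main() driver below are illustrative only and are not part of the library), is:

/* Host-side sketch only: host_hiloint2double() is a hypothetical stand-in
   for CUDA's __hiloint2double(hi,lo); it does not exist in the library. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double host_hiloint2double(int32_t hi, int32_t lo) {
  /* pack the high and low 32-bit words and reinterpret the bits as a double */
  uint64_t bits = ((uint64_t)(uint32_t)hi << 32) | (uint32_t)lo;
  double d;
  memcpy(&d, &bits, sizeof d);
  return d;
}

int main(void) {
  double x = 3.14159;                             /* value stored in the texture   */
  uint64_t bits;
  memcpy(&bits, &x, sizeof bits);
  int32_t lo = (int32_t)(bits & 0xffffffffu);     /* low word, as in qt.x          */
  int32_t hi = (int32_t)(bits >> 32);             /* high word, as in qt.y         */
  printf("%.5f\n", host_hiloint2double(hi, lo));  /* prints 3.14159                */
  return 0;
}

On the device, fetch4() applies this reassembly twice per int4 read: the first tex1Dfetch supplies the x and y components and the second supplies z and w.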
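
The shfl_xor() wrappers defined for the Kepler OpenCL path above are the kind of primitive used for butterfly reductions across the work-items cooperating on one atom when THREADS_PER_ATOM is greater than 1. A minimal sketch of that reduction pattern, assuming the _SINGLE_SINGLE Kepler OpenCL definitions are in effect and using a hypothetical helper name warp_sum() that is not part of the library, is:

/* Sketch only: assumes shfl_xor(float,int,int) from the KEPLER_OCL block
   above is in scope; warp_sum() is a hypothetical helper. */
inline float warp_sum(float val, int width) {
  /* butterfly reduction: after log2(width) exchanges, every lane in the
     width-sized group holds the sum over that group */
  for (int offset = width/2; offset > 0; offset >>= 1)
    val += shfl_xor(val, offset, width);
  return val;
}

With width equal to THREADS_PER_ATOM, each group of cooperating work-items could combine its partial force or energy terms this way without a round trip through local memory.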