diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia
index 47e1fefbe..4bb2c3fef 100644
--- a/lib/gpu/Makefile.nvidia
+++ b/lib/gpu/Makefile.nvidia
@@ -1,75 +1,65 @@
 #***************************************************************************
 #                               Makefile
 #                          -------------------
 #                           W. Michael Brown
 #
 #  _________________________________________________________________________
 #    Build for the LAMMPS GPU Force Library
 #
 #  _________________________________________________________________________
 #
 #    begin                : Tue June 23 2009
 #    copyright            : (C) 2009 by W. Michael Brown
 #    email                : wmbrown@sandia.gov
 # ***************************************************************************/
 
 BIN_DIR = .
 OBJ_DIR = .
 AR = ar
 CUDA_CPP  = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
 CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
 CUDA_PREC = -D_SINGLE_SINGLE
 CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)
 
 CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
 
 CUDA_LIB = $(OBJ_DIR)/libgpu.a
 
 # Headers for CUDA Stuff
-NVC_H  = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h
+NVC_H  = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
 # Headers for Pair Stuff
 PAIR_H  = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h
+# Dependencies for the texture tar (all texture-using kernel sources compiled as one unit)
+TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
+        lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
+        gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
 
 ALL_H = $(NVC_H) $(PAIR_H)
 
 EXECS = $(BIN_DIR)/nvc_get_devices
-OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/gb_gpu.cu_o \
-       $(OBJ_DIR)/gb_gpu_memory.cu_o $(OBJ_DIR)/lj_gpu.cu_o \
-       $(OBJ_DIR)/lj_gpu_memory.cu_o $(OBJ_DIR)/pair_gpu_atom.cu_o \
-       $(OBJ_DIR)/pair_gpu_nbor.cu_o
+OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \
+       $(OBJ_DIR)/pair_tex_tar.cu_o
 
 all: $(CUDA_LIB) $(EXECS)
 
 $(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
 	$(CUDA) -o $@ -c nvc_device.cu
 
-$(OBJ_DIR)/pair_gpu_atom.cu_o: pair_gpu_atom.cu pair_gpu_texture.h pair_gpu_atom.h $(NVC_H)
-	$(CUDA) -o $@ -c pair_gpu_atom.cu
-
 $(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
 	$(CUDA) -o $@ -c pair_gpu_nbor.cu
 
-$(OBJ_DIR)/lj_gpu_memory.cu_o: lj_gpu_memory.cu lj_gpu_memory.h $(ALL_H)
-	$(CUDA) -o $@ -c lj_gpu_memory.cu
-
-$(OBJ_DIR)/lj_gpu.cu_o: lj_gpu.cu $(ALL_H) lj_gpu_memory.h lj_gpu_kernel.h
-	$(CUDA) -o $@ -c lj_gpu.cu
-
-$(OBJ_DIR)/gb_gpu_memory.cu_o: gb_gpu_memory.cu gb_gpu_memory.h $(ALL_H)
-	$(CUDA) -o $@ -c gb_gpu_memory.cu
-
-$(OBJ_DIR)/gb_gpu.cu_o: gb_gpu.cu $(ALL_H) gb_gpu_memory.h gb_gpu_kernel.h gb_gpu_extra.h
-	$(CUDA) -o $@ -c gb_gpu.cu
+$(OBJ_DIR)/pair_tex_tar.cu_o: pair_tex_tar.cu $(TAR_H)
+	$(CUDA) -o $@ -c pair_tex_tar.cu
 
 $(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
 	$(CUDA) -o $@ nvc_get_devices.cu $(CUDA_LINK) $(OBJ_DIR)/nvc_device.o
 
 $(CUDA_LIB): $(OBJS)
 	$(AR) -crusv $(CUDA_LIB) $(OBJS)
 
 clean:
 	rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo
 
 veryclean: clean
 	rm -rf *~ *.linkinfo
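For context: the new single pair_tex_tar.cu_o object replaces the per-kernel objects above. Presumably pair_tex_tar.cu is just an aggregate translation unit that #includes the texture-using sources listed in TAR_H, so that the file-scope CUDA texture references declared in pair_gpu_texture.h are visible to every kernel at compile time. A minimal sketch, assuming that layout (the file itself is not shown in this patch):

    // pair_tex_tar.cu -- hypothetical sketch of the aggregate translation unit
    // built by the pair_tex_tar.cu_o rule above.  Compiling the texture-using
    // sources as one unit lets all kernels share the textures bound in
    // pair_gpu_texture.h.
    #include "pair_gpu_atom.cu"
    #include "lj_gpu.cu"
    #include "gb_gpu.cu"

This would also explain why gb_gpu.cu below switches from including gb_gpu_memory.h to gb_gpu_memory.cu: each top-level .cu now pulls in its own implementation, so the aggregate file only needs the three includes above.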
 
diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu
index cde56d569..174161331 100644
--- a/lib/gpu/gb_gpu.cu
+++ b/lib/gpu/gb_gpu.cu
@@ -1,518 +1,518 @@
 /***************************************************************************
                                   gb_gpu.cu
                              -------------------
                                W. Michael Brown
 
   Gay-Berne anisotropic potential GPU calculation
 
    *** Force decomposition by Atom Version ***
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Jun 23 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #include <iostream>
 #include <cassert>
 #include "nvc_macros.h"
 #include "nvc_timer.h"
 #include "nvc_device.h"
-#include "gb_gpu_memory.h"
+#include "gb_gpu_memory.cu"
 #include "gb_gpu_kernel.h"
 
 using namespace std;
 
 static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
 #define GBMT GB_GPU_Memory<numtyp,acctyp>
 
 // ---------------------------------------------------------------------------
 // Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
 // -- Only pack neighbors matching the specified inclusive range of forms
 // -- Only pack neighbors within cutoff
 // ---------------------------------------------------------------------------
 template<class numtyp>
 __global__ void kernel_pack_nbor(int *dev_nbor, const int nbor_pitch, 
                                  const int start, const int inum, 
                                  const int *dev_ij, const int form_low, 
                                  const int form_high, const int nall) {
                                 
   // ii indexes the atom whose neighbor list this thread packs
   int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;
 
   if (ii<inum) {
     int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
     nbor+=nbor_pitch;
     int *nbor_newj=nbor;
     nbor+=nbor_pitch;
   
     numtyp ix=_x_<numtyp>(i,0);
     numtyp iy=_x_<numtyp>(i,1);
     numtyp iz=_x_<numtyp>(i,2);
     int itype=_x_<numtyp>(i,7);
 
     int newj=0;  
     for ( ; list<list_end; list++) {
       int j=*list;
       if (j>=nall)
         j%=nall;
       int jtype=_x_<numtyp>(j,7);
       
       if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
         // Compute r12;
         numtyp rsq=_x_<numtyp>(j,0)-ix;
         rsq*=rsq;
         numtyp t=_x_<numtyp>(j,1)-iy;
         rsq+=t*t;
         t=_x_<numtyp>(j,2)-iz;
         rsq+=t*t;
 
         if (rsq< _cutsq_<numtyp>(itype,jtype)) {
           *nbor=j;
           nbor+=nbor_pitch;
           newj++;
         }
       }
     }
     *nbor_newj=newj;
   }
 }
 
 // ---------------------------------------------------------------------------
 // Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
 // -- Only pack neighbors matching the specified inclusive range of forms
 // -- Only pack neighbors within cutoff
 // -- Fast version of routine that uses shared memory for LJ constants
 // ---------------------------------------------------------------------------
 template<class numtyp>
 __global__ void kernel_pack_nbor_fast(int *dev_nbor, const int nbor_pitch, 
                                       const int start, const int inum, 
                                       const int *dev_ij, const int form_low, 
                                       const int form_high, const int nall) {
                                 
   int ii=threadIdx.x;
   __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     int itype=ii/MAX_SHARED_TYPES;
     int jtype=ii%MAX_SHARED_TYPES;
     cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
     form[ii]=_form_(itype,jtype);
   }
   ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
 
   if (ii<inum) {
     int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
     nbor+=nbor_pitch;
     int *nbor_newj=nbor;
     nbor+=nbor_pitch;
   
     numtyp ix=_x_<numtyp>(i,0);
     numtyp iy=_x_<numtyp>(i,1);
     numtyp iz=_x_<numtyp>(i,2);
     int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
 
     int newj=0;  
     for ( ; list<list_end; list++) {
       int j=*list;
       if (j>=nall)
         j%=nall;
       int mtype=itype+_x_<numtyp>(j,7);
       
       if (form[mtype]>=form_low && form[mtype]<=form_high) {
         // Compute r12;
         numtyp rsq=_x_<numtyp>(j,0)-ix;
         rsq*=rsq;
         numtyp t=_x_<numtyp>(j,1)-iy;
         rsq+=t*t;
         t=_x_<numtyp>(j,2)-iz;
         rsq+=t*t;
 
         if (rsq<cutsq[mtype]) {
           *nbor=j;
           nbor+=nbor_pitch;
           newj++;
         }
       }
     }
     *nbor_newj=newj;
   }
 }
 
 template<class numtyp, class acctyp>
 void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, 
                 const int inum, const int form_low, const int form_high) {
   if (gbm.shared_types)
     kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
           (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
            gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
   else
     kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
           (gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), start, inum,
            gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
 }
 
 // ---------------------------------------------------------------------------
 // Convert something to a string
 // ---------------------------------------------------------------------------
 #include <sstream>
 template <class t>
 inline string gb_gpu_toa(const t& in) {
   ostringstream o;
   o.precision(2);
   o << in;
   return o.str();
 }
 
 // ---------------------------------------------------------------------------
 // Return string with GPU info
 // ---------------------------------------------------------------------------
 string gb_gpu_name(const int id, const int max_nbors) {
   string name=GBMF[0].gpu.name(id)+", "+
               gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
               gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
               gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHz, "+
               gb_gpu_toa(GBMF[0].get_max_atoms(GBMF[0].gpu.bytes(id),
                                                max_nbors))+" Atoms";
   return name;
 }
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
                   const double upsilon, const double mu, double **shape,
                   double **well, double **cutsq, double **sigma, 
                   double **epsilon, double *host_lshape, int **form,
                   double **host_lj1, double **host_lj2, double **host_lj3, 
                   double **host_lj4, double **offset, double *special_lj,
                   const int max_nbors, const int thread, const int gpu_id) {
   assert(thread<MAX_GPU_THREADS);
   
   if (GBMF[thread].gpu.num_devices()==0)
     return 0;                   
 
   ij_size=IJ_SIZE;
   return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
                            well, cutsq, sigma, epsilon, host_lshape, form,
                            host_lj1, host_lj2, host_lj3, host_lj4, offset,
                            special_lj, max_nbors, false, gpu_id);
 }
 
 // ---------------------------------------------------------------------------
 // Clear memory on host and device
 // ---------------------------------------------------------------------------
 void gb_gpu_clear(const int thread) {
   GBMF[thread].clear();
 }
 
 // ---------------------------------------------------------------------------
 // copy atom positions, quaternions, and optionally types to device
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x, 
                          double **host_quat, const int *host_type, 
                          const bool rebuild, cudaStream_t &stream) {
   atom.time_atom.start();
   atom.reset_write_buffer();
  
   // Rows 0-2 of dev_x hold the position; rows 3-6 the quaternion (row 7 gets the type on a rebuild)
   atom.add_atom_data(host_x[0],3);
   atom.add_atom_data(host_x[0]+1,3);
   atom.add_atom_data(host_x[0]+2,3);
   atom.add_atom_data(host_quat[0],4);
   atom.add_atom_data(host_quat[0]+1,4);
   atom.add_atom_data(host_quat[0]+2,4);
   atom.add_atom_data(host_quat[0]+3,4);
    
   int csize=7;
   
   // If a rebuild occurred, copy type data
   if (rebuild) {
     atom.add_atom_data(host_type);
     csize++;
   }
   
   atom.copy_atom_data(csize,stream);
   atom.time_atom.stop();
 }
 
 void gb_gpu_atom(double **host_x, double **host_quat, 
                  const int *host_type, const bool rebuild, const int thread) {
   _gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
                GBMF[thread].pair_stream);
 }
 
 // ---------------------------------------------------------------------------
 // Signal that we need to transfer a new neighbor list
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
 int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal, 
                           const int inum, int *ilist, const int *numj,
                           const int *type, bool &success) {
   if (nall>gbm.max_atoms) {
     success=false;
     return 0;
   }
   success=true;
     
   gbm.nbor.time_nbor.start();
 
   gbm.atom.nall(nall);
   gbm.atom.inum(inum);
 
   if (gbm.multiple_forms) {
     int p=0, acc=0;
     for (int i=0; i<inum; i++) {
       int itype=type[ilist[i]];
       if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
         gbm.host_olist[p]=ilist[i];
         gbm.nbor.host_ij[p]=numj[ilist[i]];
         gbm.nbor.host_ij[p+inum]=acc;
         acc+=numj[ilist[i]];
         p++;
       }
     }
     gbm.last_ellipse=p;
     for (int i=0; i<inum; i++) {
       int itype=type[ilist[i]];
       if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
         gbm.host_olist[p]=ilist[i];
         gbm.nbor.host_ij[p]=numj[ilist[i]];
         gbm.nbor.host_ij[p+inum]=acc;
         acc+=numj[ilist[i]];
         p++;
       }
     }
     gbm.nbor.ij_total=0;
     gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
     gbm.nbor.host_ij.copy_to_2Ddevice(gbm.nbor.dev_nbor.begin()+
                                       gbm.nbor.dev_nbor.row_size(),
                                       gbm.nbor.dev_nbor.row_size(),2,inum,
                                       gbm.pair_stream);
   } else {
     gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
     gbm.last_ellipse=inum;
   }
 
   gbm.nbor.time_nbor.stop();
   
   if (gbm.multiple_forms)
     return gbm.host_olist.begin();
   return ilist;
 }
 
 int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum, 
                          int *ilist, const int *numj, const int *type,
                          const int thread, bool &success) {
   return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
                              success);
 }
 
 // ---------------------------------------------------------------------------
 // Copy the next block of at most ij_size ij entries from the host neighbor
 // buffer to the device (forces, energies, and torques are computed later)
 // ---------------------------------------------------------------------------
 template <class gbmtyp>
 void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
   gbm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
   
   gbm.nbor.time_nbor.start();
   gbm.nbor.add(num_ij,gbm.pair_stream);
   gbm.nbor.time_nbor.stop();
 }
 
 void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
   _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies, forces, and torques for all ij interactions
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag, 
                       const bool rebuild) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=BLOCK_1D;
 
   int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
 
   if (gbm.multiple_forms) {
     gbm.time_kernel.start();
     if (gbm.last_ellipse>0) {
       // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
       GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
                                static_cast<double>(BX)));
       pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
       gbm.time_kernel.stop();
   
       gbm.time_gayberne.start();                                 
       kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
            (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), 
             gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(),
             gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
             eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
       gbm.time_gayberne.stop();
 
       if (gbm.last_ellipse==gbm.atom.inum()) {
         gbm.time_kernel2.start();
         gbm.time_kernel2.stop();
         gbm.time_gayberne2.start();
         gbm.time_gayberne2.stop();
         gbm.time_pair.start();
         gbm.time_pair.stop();
         return;
       }
                     
       // ------------ SPHERE_ELLIPSE ---------------
         
       gbm.time_kernel2.start();
       GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
                                gbm.last_ellipse)/BX));
       pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
                  ELLIPSE_SPHERE);
       gbm.time_kernel2.stop();
 
       gbm.time_gayberne2.start();
       kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
            (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), 
             gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), 
             gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
             eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
       gbm.time_gayberne2.stop();
    } else {
       gbm.atom.ans.zero();
       gbm.time_kernel.stop();
       gbm.time_gayberne.start();                                 
       gbm.time_gayberne.stop();
       gbm.time_kernel2.start();
       gbm.time_kernel2.stop();
       gbm.time_gayberne2.start();
       gbm.time_gayberne2.stop();
     }
     
     // ------------         LJ      ---------------
     gbm.time_pair.start();
     if (gbm.last_ellipse<gbm.atom.inum()) {
       if (gbm.shared_types)
         kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
            (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), 
             gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), 
             gbm.atom.ans.begin(), gbm.atom.ans.row_size(), 
             gbm.dev_error.begin(), eflag, vflag, gbm.last_ellipse, 
             gbm.atom.inum(), gbm.atom.nall());
       else
         kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
            (gbm.special_lj.begin(), gbm.nbor.dev_nbor.begin(), 
             gbm.nbor.ij.begin(), gbm.nbor.dev_nbor.row_size(), 
             gbm.atom.ans.begin(), gbm.atom.ans.row_size(),gbm.dev_error.begin(),
             eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
     }
     gbm.time_pair.stop();
   } else {
     gbm.time_kernel.start();
     pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
     gbm.time_kernel.stop();
   
     gbm.time_gayberne.start();                                 
     kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
          (gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(), 
           gbm.nbor.dev_nbor.begin(), gbm.nbor.dev_nbor.row_size(), 
           gbm.atom.ans.begin(), gbm.atom.ans.row_size(), gbm.dev_error.begin(), 
           eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
     gbm.time_gayberne.stop();
   }
 }
 
 void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild, 
                      const int thread) {
   _gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
 }
 
 // ---------------------------------------------------------------------------
 // Get energies, forces, and torques to host
 // ---------------------------------------------------------------------------
 template<class numtyp, class acctyp>
 double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
                       const bool eflag, const bool vflag, const bool eflag_atom,
                       const bool vflag_atom, double *eatom, double **vatom,
                       double *virial) {
   double evdw;
 
   gbm.atom.time_answer.start();
   gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
 
   gbm.atom.time_atom.add_to_total();
   gbm.nbor.time_nbor.add_to_total();
   gbm.time_kernel.add_to_total();
   gbm.time_gayberne.add_to_total();
   if (gbm.multiple_forms) {
     gbm.time_kernel2.add_to_total();
     gbm.time_gayberne2.add_to_total();
     gbm.time_pair.add_to_total();
   }      
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
   
   evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   gbm.atom.add_forces(ilist,f);
   gbm.atom.add_torques(ilist,tor,gbm.last_ellipse);
   gbm.atom.time_answer.stop();
   gbm.atom.time_answer.add_to_total();
   return evdw;
 }
 
 double gb_gpu_forces(double **f, double **tor, const int *ilist,
                      const bool eflag, const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial, const int thread) {
   return _gb_gpu_forces<PRECISION,ACC_PRECISION>
                        (GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
                         vflag_atom,eatom,vatom,virial);
 }
 
 void gb_gpu_time(const int i) {
   cout.precision(4);
   cout << "Atom copy:     " << GBMF[i].atom.time_atom.total_seconds() 
        << " s.\n"
        << "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds() 
        << " s.\n"
        << "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
                                GBMF[i].time_kernel2.total_seconds() << " s.\n"
        << "Force calc:    " << GBMF[i].time_gayberne.total_seconds()+
                                GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
   if (GBMF[i].multiple_forms)
     cout << "LJ calc:       " << GBMF[i].time_pair.total_seconds() << " s.\n";
   cout << "Answer copy:   " << GBMF[i].atom.time_answer.total_seconds() 
        << " s.\n";
 }
 
 int gb_gpu_num_devices() {
   return GBMF[0].gpu.num_devices();
 }
 
 double gb_gpu_bytes() {
   return GBMF[0].host_memory_usage();
 }
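For reference, the neighbor data walked by kernel_pack_nbor and the force kernels is a column-major matrix in dev_nbor: one column per atom ii, consecutive rows nbor_pitch ints apart, with row 0 holding the atom index, row 1 the original neighbor count, row 2 the offset of the unpacked neighbors in dev_ij, row 3 the packed count written by the pack kernel, and rows 4 onward the packed neighbor indices. A small host-side sketch of that strided indexing, with hypothetical names and toy data, just to illustrate the access pattern:

    #include <cstdio>
    #include <vector>

    // Hypothetical illustration of the packed dev_nbor layout
    // (column-major: one column per atom, rows nbor_pitch ints apart).
    static void print_packed_nbors(const std::vector<int> &nbor,
                                   int nbor_pitch, int ii) {
      const int *col = nbor.data() + ii;          // column for atom ii
      int i        = col[0 * nbor_pitch];         // row 0: atom index
      int packed_n = col[3 * nbor_pitch];         // row 3: packed neighbor count
      std::printf("atom %d: %d packed neighbors:", i, packed_n);
      for (int r = 0; r < packed_n; ++r)
        std::printf(" %d", col[(4 + r) * nbor_pitch]);  // rows 4..: neighbors
      std::printf("\n");
    }

    int main() {
      const int nbor_pitch = 2;            // toy case: 2 atoms, 2 ints per row
      std::vector<int> nbor = { 0, 1,      // row 0: atom indices
                                3, 2,      // row 1: original numj
                                0, 3,      // row 2: offsets into dev_ij
                                2, 1,      // row 3: packed counts
                                5, 9,      // row 4: first packed neighbor
                                7, 0 };    // row 5: second neighbor / padding
      print_packed_nbors(nbor, nbor_pitch, 0);   // atom 0: packed neighbors 5 7
      print_packed_nbors(nbor, nbor_pitch, 1);   // atom 1: packed neighbor 9
      return 0;
    }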
diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h
index c918e0707..87bcecb3c 100644
--- a/lib/gpu/gb_gpu_extra.h
+++ b/lib/gpu/gb_gpu_extra.h
@@ -1,337 +1,337 @@
 /***************************************************************************
                                gb_gpu_extra.h
                              -------------------
                                W. Michael Brown
 
   Inline GPU kernel routines, analogous to math_extra on the CPU.
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Jun 23 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #ifndef GB_GPU_EXTRA_H
 #define GB_GPU_EXTRA_H
 
 #include "math.h"
 #include "stdio.h"
 #include "string.h"
 
 /* ----------------------------------------------------------------------
    Atomic update of global memory
 ------------------------------------------------------------------------- */
 /*
 template <class numtyp> __device__ 
 inline void atomicAdd(numtyp *address, numtyp val);
 
 template <>
 __device__ inline void atomicAdd<float>(float *address, float val)
 {
   int i_val = __float_as_int(val);
   int tmp0 = 0;
   int tmp1;
 
   while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
     tmp0 = tmp1;
     i_val = __float_as_int(val + __int_as_float(tmp1));
   }
 }*/
 
 /* ----------------------------------------------------------------------
    dot product of 2 vectors
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
 {
   return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
 }
 
 /* ----------------------------------------------------------------------
    cross product of 2 vectors
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_cross3(const numtyp *v1, 
                                              const numtyp *v2, numtyp *ans)
 {
   ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
   ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
   ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
 }
 
 /* ----------------------------------------------------------------------
    determinant of a matrix
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
 {
   numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - 
     m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + 
     m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
   return ans;
 }
 
 /* ----------------------------------------------------------------------
    diagonal matrix times a full matrix
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
                                                   numtyp ans[9])
 {
   ans[0] = _well_<numtyp>(i,0)*m[0];
   ans[1] = _well_<numtyp>(i,0)*m[1];
   ans[2] = _well_<numtyp>(i,0)*m[2];
   ans[3] = _well_<numtyp>(i,1)*m[3];
   ans[4] = _well_<numtyp>(i,1)*m[4];
   ans[5] = _well_<numtyp>(i,1)*m[5];
   ans[6] = _well_<numtyp>(i,2)*m[6];
   ans[7] = _well_<numtyp>(i,2)*m[7];
   ans[8] = _well_<numtyp>(i,2)*m[8];
 }
 
 /* ----------------------------------------------------------------------
    diagonal matrix times a full matrix
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
                                                    numtyp ans[9])
 {
   ans[0] = _shape_<numtyp>(i,0)*m[0];
   ans[1] = _shape_<numtyp>(i,0)*m[1];
   ans[2] = _shape_<numtyp>(i,0)*m[2];
   ans[3] = _shape_<numtyp>(i,1)*m[3];
   ans[4] = _shape_<numtyp>(i,1)*m[4];
   ans[5] = _shape_<numtyp>(i,1)*m[5];
   ans[6] = _shape_<numtyp>(i,2)*m[6];
   ans[7] = _shape_<numtyp>(i,2)*m[7];
   ans[8] = _shape_<numtyp>(i,2)*m[8];
 }
 
 /* ----------------------------------------------------------------------
    add two matrices
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_plus3(const numtyp m[9], 
                                             const numtyp m2[9], numtyp ans[9])
 {
   ans[0] = m[0]+m2[0];
   ans[1] = m[1]+m2[1];
   ans[2] = m[2]+m2[2];
   ans[3] = m[3]+m2[3];
   ans[4] = m[4]+m2[4];
   ans[5] = m[5]+m2[5];
   ans[6] = m[6]+m2[6];
   ans[7] = m[7]+m2[7];
   ans[8] = m[8]+m2[8];
 }
 
 /* ----------------------------------------------------------------------
    multiply the transpose of mat1 times mat2
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9], 
                                                        const numtyp m2[9],
                                                        numtyp ans[9])
 {
   ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
   ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
   ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
   ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
   ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
   ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
   ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
   ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
   ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
 }
 
 /* ----------------------------------------------------------------------
    row vector times matrix
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_row_times3(const numtyp *v, 
                                                  const numtyp m[9], numtyp *ans)
 {
   ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
   ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
   ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
 }
 
 /* ----------------------------------------------------------------------
    solve Ax = b or M ans = v
    use Gaussian elimination & partial pivoting on the matrix
    error_flag set to 2 if bad matrix inversion attempted
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_mldivide3(const numtyp m[9], 
                                                 const numtyp *v, numtyp *ans,
                                                 int *error_flag)
 {
   // create augmented matrix for pivoting
 
   numtyp aug[12], t;
 
   aug[3] = v[0];
   aug[0] = m[0];
   aug[1] = m[1];
   aug[2] = m[2];
   aug[7] = v[1];
   aug[4] = m[3];
   aug[5] = m[4];
   aug[6] = m[5];
   aug[11] = v[2];
   aug[8] = m[6];
   aug[9] = m[7];
   aug[10] = m[8];
 
   if (fabs(aug[4]) > fabs(aug[0])) {
     numtyp swapt;
     swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
     swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
     swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
     swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
   }
   if (fabs(aug[8]) > fabs(aug[0])) {
     numtyp swapt;
     swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
     swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
     swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
     swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
   }
 
   if (aug[0] != (numtyp)0.0) {
     if (0!=0) {
       numtyp swapt;
       swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt;
       swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt;
       swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt;
       swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt;
     }
   } else if (aug[4] != (numtyp)0.0) {
     if (1!=0) {
       numtyp swapt;
       swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
       swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
       swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
       swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
     }
   } else if (aug[8] != (numtyp)0.0) {
     if (2!=0) {
       numtyp swapt;
       swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
       swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
       swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
       swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
     }
   } else
     *error_flag=2;
 
   t = aug[4]/aug[0];
   aug[5]-=t*aug[1];
   aug[6]-=t*aug[2];
   aug[7]-=t*aug[3];
   t = aug[8]/aug[0];
   aug[9]-=t*aug[1];
   aug[10]-=t*aug[2];
   aug[11]-=t*aug[3];
 
   if (fabs(aug[9]) > fabs(aug[5])) {
     numtyp swapt;
     swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
     swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
     swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
     swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
   }
 
   if (aug[5] != (numtyp)0.0) {
     if (1!=1) {
       numtyp swapt;
       swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt;
       swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt;
       swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt;
       swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt;
     }
   } else if (aug[9] != (numtyp)0.0) {
     if (2!=1) {
       numtyp swapt;
       swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
       swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
       swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
       swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
     }
   }
 
   t = aug[9]/aug[5];
   aug[10]-=t*aug[6];
   aug[11]-=t*aug[7];
   
   if (aug[10] == (numtyp)0.0)
     *error_flag=2;
 
   ans[2] = aug[11]/aug[10];
   t = (numtyp)0.0;
   t += aug[6]*ans[2];
   ans[1] = (aug[7]-t) / aug[5];
   t = (numtyp)0.0;
   t += aug[1]*ans[1];
   t += aug[2]*ans[2];
   ans[0] = (aug[3]-t) / aug[0];
 }
 
 /* ----------------------------------------------------------------------
    compute rotation matrix from quaternion conjugate
    quat = [w i j k]
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
 static __inline__ __device__ void gpu_quat_to_mat_trans(const int qi, 
                                                         numtyp mat[9])
 {
   numtyp qi3=_x_<numtyp>(qi,3);
   numtyp qi4=_x_<numtyp>(qi,4);
   numtyp qi5=_x_<numtyp>(qi,5);
   numtyp qi6=_x_<numtyp>(qi,6);
 
   numtyp w2 = qi3*qi3;
   numtyp i2 = qi4*qi4;
   numtyp j2 = qi5*qi5;
   numtyp k2 = qi6*qi6;
-  numtyp twoij = 2.0*qi4*qi5;
-  numtyp twoik = 2.0*qi4*qi6;
-  numtyp twojk = 2.0*qi5*qi6;
-  numtyp twoiw = 2.0*qi4*qi3;
-  numtyp twojw = 2.0*qi5*qi3;
-  numtyp twokw = 2.0*qi6*qi3;
+  numtyp twoij = (numtyp)2.0*qi4*qi5;
+  numtyp twoik = (numtyp)2.0*qi4*qi6;
+  numtyp twojk = (numtyp)2.0*qi5*qi6;
+  numtyp twoiw = (numtyp)2.0*qi4*qi3;
+  numtyp twojw = (numtyp)2.0*qi5*qi3;
+  numtyp twokw = (numtyp)2.0*qi6*qi3;
 
   mat[0] = w2+i2-j2-k2;
   mat[3] = twoij-twokw;
   mat[6] = twojw+twoik;
 
   mat[1] = twoij+twokw;
   mat[4] = w2-i2+j2-k2;
   mat[7] = twojk-twoiw;
 	
   mat[2] = twoik-twojw;
   mat[5] = twojk+twoiw;
   mat[8] = w2-i2-j2+k2;
 }
 
 #endif
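For reference, gpu_quat_to_mat_trans builds the transpose of the standard rotation matrix formed from the quaternion q = [w, i, j, k] stored in rows 3-6 of the atom data. Writing that matrix in LaTeX form:

    R(q) \;=\;
    \begin{pmatrix}
      w^2+i^2-j^2-k^2 & 2(ij-kw)        & 2(ik+jw)        \\
      2(ij+kw)        & w^2-i^2+j^2-k^2 & 2(jk-iw)        \\
      2(ik-jw)        & 2(jk+iw)        & w^2-i^2-j^2+k^2
    \end{pmatrix},
    \qquad \mathtt{mat} = R(q)^{\mathsf T}\ \text{(stored row-major)}.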
diff --git a/lib/gpu/gb_gpu_kernel.h b/lib/gpu/gb_gpu_kernel.h
index 8ca047e0e..3d74d916f 100644
--- a/lib/gpu/gb_gpu_kernel.h
+++ b/lib/gpu/gb_gpu_kernel.h
@@ -1,861 +1,861 @@
 /***************************************************************************
                               gb_gpu_kernel.h
                              -------------------
                                W. Michael Brown
 
   Routines that actually perform the force/torque computation
 
    *** Force Decomposition by Atom Version ***
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Jun 23 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #ifndef GB_GPU_KERNEL
 #define GB_GPU_KERNEL
 
 #include "gb_gpu_extra.h"
 
 template <class numtyp>
 static __inline__ __device__ void compute_eta_torque(numtyp m[9], 
                                                      numtyp m2[9],
                                                      const int i, 
                                                      numtyp ans[9])
 {
   numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
     m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
     m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
   den = (numtyp)1.0/den;
   
   numtyp shapex=_shape_<numtyp>(i,0);
   numtyp shapey=_shape_<numtyp>(i,1);
   numtyp shapez=_shape_<numtyp>(i,2);
   
   ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
 		    m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
 		    m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
 		    m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
 		    m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
   
   ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
 		    (numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
 		    (numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
 		    m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
 		    m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
   
   ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
 		    m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
 		    m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
 		    (numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
 		    m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
   
   ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
 		    m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
 		    m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
 		    m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
 		    m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
   
   ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
 		    (numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
 		    (numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
 		    m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
 		    m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
   
   ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
 		    m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
 		    (numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
 		    m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
 		    (numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
   
   ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
 		    (numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
 		    m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
 		    m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
 		    m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
   
   ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
 		     (numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
 		     (numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
 		     m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
 		     m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
   
   ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
 		    m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
 		    m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
 		    (numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
 		    m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
 }
 
 template<class numtyp, class acctyp>
 __global__ void kernel_gayberne(const numtyp *gum, const numtyp *special_lj,
                                 const int *dev_nbor, const int nbor_pitch, 
                                 acctyp *ans, size_t ans_pitch, int *err_flag, 
                                 const bool eflag, const bool vflag,
                                 const int inum, const int nall) {
                                 
   __shared__ numtyp sp_lj[4];
 
    // ii indexes the atom i handled by this thread
   int ii=threadIdx.x;
   if (ii<4)
     sp_lj[ii]=special_lj[ii];    
   ii+=INT_MUL(blockIdx.x,blockDim.x);                                  
 
   if (ii<inum) {
   
   acctyp energy=(numtyp)0;
   acctyp fx=(numtyp)0;
   acctyp fy=(numtyp)0;
   acctyp fz=(numtyp)0;
   acctyp torx=(numtyp)0;
   acctyp tory=(numtyp)0;
   acctyp torz=(numtyp)0;
   acctyp virial[6];
   for (int i=0; i<6; i++)
     virial[i]=(numtyp)0;
   
   const int *nbor=dev_nbor+ii;
   int i=*nbor;
   nbor+=nbor_pitch;
   nbor+=nbor_pitch;
   nbor+=nbor_pitch;
   int numj=*nbor;
   nbor+=nbor_pitch;
   const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
   
   numtyp ix=_x_<numtyp>(i,0);
   numtyp iy=_x_<numtyp>(i,1);
   numtyp iz=_x_<numtyp>(i,2);
   int itype=_x_<numtyp>(i,7);
   numtyp a1[9], b1[9], g1[9];
   {
     numtyp t[9];
     gpu_quat_to_mat_trans(i,a1);
     gpu_shape_times3(itype,a1,t);
     gpu_transpose_times3(a1,t,g1);
     gpu_well_times3(itype,a1,t);
     gpu_transpose_times3(a1,t,b1);
   }
 
   numtyp factor_lj;
   for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
   
   int j=*nbor;
   if (j < nall) 
-    factor_lj = 1.0;
+    factor_lj = (numtyp)1.0;
   else {
     factor_lj = sp_lj[j/nall];
     j %= nall;
   }
   int jtype=_x_<numtyp>(j,7);
 
   // Compute r12
   numtyp r12[3];
   r12[0] = _x_<numtyp>(j,0)-ix;
   r12[1] = _x_<numtyp>(j,1)-iy;
   r12[2] = _x_<numtyp>(j,2)-iz;
   numtyp ir = gpu_dot3(r12,r12);
 
   ir = rsqrt(ir);
   numtyp r = (numtyp)1.0/ir;
 
   numtyp a2[9];
   gpu_quat_to_mat_trans(j,a2);
   
   numtyp u_r, dUr[3], tUr[3], eta, teta[3];
   { // Compute U_r, dUr, eta, and teta
     // Compute g12
     numtyp g12[9];
     {
       numtyp g2[9];
       {
           gpu_shape_times3(jtype,a2,g12);
           gpu_transpose_times3(a2,g12,g2);
           gpu_plus3(g1,g2,g12);
       }
   
       { // Compute U_r and dUr
     
         // Compute kappa
         numtyp kappa[3];
         gpu_mldivide3(g12,r12,kappa,err_flag);
 
         // -- replace r12 with r12 hat
         r12[0]*=ir;
         r12[1]*=ir;
         r12[2]*=ir;
 
         // -- kappa is now / r
         kappa[0]*=ir;
         kappa[1]*=ir;
         kappa[2]*=ir;
   
         // energy
   
         // compute u_r and dUr
         numtyp uslj_rsq;
         {
           // Compute distance of closest approach
           numtyp h12, sigma12;
           sigma12 = gpu_dot3(r12,kappa);
           sigma12 = rsqrt((numtyp)0.5*sigma12);
           h12 = r-sigma12;
 
           // -- kappa is now ok
           kappa[0]*=r;
           kappa[1]*=r;
           kappa[2]*=r;
           
           numtyp sigma = _sigma_<numtyp>(itype,jtype);
           numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
           numtyp varrho = sigma/(h12+gum[0]*sigma);
           numtyp varrho6 = varrho*varrho*varrho;
           varrho6*=varrho6;
           numtyp varrho12 = varrho6*varrho6;
           u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
 
           numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
           temp1 = temp1*(numtyp)24.0*epsilon;
           uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
           numtyp temp2 = gpu_dot3(kappa,r12);
           uslj_rsq = uslj_rsq*ir*ir;
 
           dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
           dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
           dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
         }
 
         // torque for particle 1
         {
           numtyp tempv[3], tempv2[3];
           tempv[0] = -uslj_rsq*kappa[0];
           tempv[1] = -uslj_rsq*kappa[1];
           tempv[2] = -uslj_rsq*kappa[2];
           gpu_row_times3(kappa,g1,tempv2);
           gpu_cross3(tempv,tempv2,tUr);
         }
       }
     }
      
     // Compute eta
     {
       eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
       numtyp det_g12 = gpu_det3(g12);
       eta = pow(eta/det_g12,gum[1]);
     }
     
     // Compute teta
     numtyp temp[9], tempv[3], tempv2[3];
     compute_eta_torque(g12,a1,itype,temp);
     numtyp temp1 = -eta*gum[1];
 
     tempv[0] = temp1*temp[0];
     tempv[1] = temp1*temp[1];
     tempv[2] = temp1*temp[2];
     gpu_cross3(a1,tempv,tempv2);
     teta[0] = tempv2[0];
     teta[1] = tempv2[1];
     teta[2] = tempv2[2];
   
     tempv[0] = temp1*temp[3];
     tempv[1] = temp1*temp[4];
     tempv[2] = temp1*temp[5];
     gpu_cross3(a1+3,tempv,tempv2);
     teta[0] += tempv2[0];
     teta[1] += tempv2[1];
     teta[2] += tempv2[2];
 
     tempv[0] = temp1*temp[6];
     tempv[1] = temp1*temp[7];
     tempv[2] = temp1*temp[8];
     gpu_cross3(a1+6,tempv,tempv2);
     teta[0] += tempv2[0];
     teta[1] += tempv2[1];
     teta[2] += tempv2[2];
   }
   
   numtyp chi, dchi[3], tchi[3];
   { // Compute chi and dchi
 
     // Compute b12
     numtyp b2[9], b12[9];
     {
       gpu_well_times3(jtype,a2,b12);
       gpu_transpose_times3(a2,b12,b2);
       gpu_plus3(b1,b2,b12);
     }
 
     // compute chi_12
     r12[0]*=r;
     r12[1]*=r;
     r12[2]*=r;
     numtyp iota[3];
     gpu_mldivide3(b12,r12,iota,err_flag);
     // -- iota is now iota/r
     iota[0]*=ir;
     iota[1]*=ir;
     iota[2]*=ir;
     r12[0]*=ir;
     r12[1]*=ir;
     r12[2]*=ir;
     chi = gpu_dot3(r12,iota);
     chi = pow(chi*(numtyp)2.0,gum[2]);
 
     // -- iota is now ok
     iota[0]*=r;
     iota[1]*=r;
     iota[2]*=r;
 
     numtyp temp1 = gpu_dot3(iota,r12);
     numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
     dchi[0] = temp2*(iota[0]-temp1*r12[0]);
     dchi[1] = temp2*(iota[1]-temp1*r12[1]);
     dchi[2] = temp2*(iota[2]-temp1*r12[2]);
 
     // compute t_chi
     numtyp tempv[3];
     gpu_row_times3(iota,b1,tempv);
     gpu_cross3(tempv,iota,tchi);
     temp1 = (numtyp)-4.0*ir*ir;
     tchi[0] *= temp1;
     tchi[1] *= temp1;
     tchi[2] *= temp1;
   }
 
   numtyp temp2 = factor_lj*eta*chi;
   if (eflag)
     energy+=u_r*temp2;
   numtyp temp1 = -eta*u_r*factor_lj;
   if (vflag) {
     r12[0]*=-r;
     r12[1]*=-r;
     r12[2]*=-r;
     numtyp ft=temp1*dchi[0]-temp2*dUr[0];
     fx+=ft;
     virial[0]+=r12[0]*ft;
     ft=temp1*dchi[1]-temp2*dUr[1];
     fy+=ft;
     virial[1]+=r12[1]*ft;
     virial[3]+=r12[0]*ft;
     ft=temp1*dchi[2]-temp2*dUr[2];
     fz+=ft;
     virial[2]+=r12[2]*ft;
     virial[4]+=r12[0]*ft;
     virial[5]+=r12[1]*ft;
   } else {
     fx+=temp1*dchi[0]-temp2*dUr[0];
     fy+=temp1*dchi[1]-temp2*dUr[1];
     fz+=temp1*dchi[2]-temp2*dUr[2];
   }
 
   // Torque on 1
   temp1 = -u_r*eta*factor_lj;
   temp2 = -u_r*chi*factor_lj;
   numtyp temp3 = -chi*eta*factor_lj;
   torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
   tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
   torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
 
   } // for nbor
 
   // Store answers
   acctyp *ap1=ans+ii;
   if (eflag) {
     *ap1=energy;
     ap1+=ans_pitch;
   }
   if (vflag) {
     for (int i=0; i<6; i++) {
       *ap1=virial[i];
       ap1+=ans_pitch;
     }
   }
   *ap1=fx;
   ap1+=ans_pitch;
   *ap1=fy;
   ap1+=ans_pitch;
   *ap1=fz;
   ap1+=ans_pitch;
   *ap1=torx;
   ap1+=ans_pitch;
   *ap1=tory;
   ap1+=ans_pitch;
   *ap1=torz;
 
   } // if ii
 }
 
 template<class numtyp, class acctyp>
 __global__ void kernel_sphere_gb(const numtyp *gum, const numtyp *special_lj,
                                  const int *dev_nbor, const int nbor_pitch, 
                                  acctyp *ans, size_t ans_pitch, int *err_flag, 
                                  const bool eflag, const bool vflag,
                                  const int start, const int inum, 
                                  const int nall) {
   __shared__ numtyp sp_lj[4];
 
   // ii indexes the atom i handled by this thread
   int ii=threadIdx.x;
   if (ii<4)
     sp_lj[ii]=special_lj[ii];    
   ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
 
   if (ii<inum) {
   
     acctyp energy=(numtyp)0;
     acctyp fx=(numtyp)0;
     acctyp fy=(numtyp)0;
     acctyp fz=(numtyp)0;
     acctyp virial[6];
     for (int i=0; i<6; i++)
       virial[i]=(numtyp)0;
   
     const int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     nbor+=nbor_pitch;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
     const int *nbor_end=nbor+INT_MUL(nbor_pitch,numj);
   
     numtyp ix=_x_<numtyp>(i,0);
     numtyp iy=_x_<numtyp>(i,1);
     numtyp iz=_x_<numtyp>(i,2);
     int itype=_x_<numtyp>(i,7);
       
     numtyp oner=_shape_<numtyp>(itype,0);
     numtyp one_well=_well_<numtyp>(itype,0);
   
     numtyp factor_lj;
     for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
   
       int j=*nbor;
       if (j < nall) 
-        factor_lj = 1.0;
+        factor_lj = (numtyp)1.0;
       else {
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
       int jtype=_x_<numtyp>(j,7);
 
       // Compute r12
       numtyp r12[3];
       r12[0] = _x_<numtyp>(j,0)-ix;
       r12[1] = _x_<numtyp>(j,1)-iy;
       r12[2] = _x_<numtyp>(j,2)-iz;
       numtyp ir = gpu_dot3(r12,r12);
 
       ir = rsqrt(ir);
       numtyp r = (numtyp)1.0/ir;
       
       numtyp r12hat[3];
       r12hat[0]=r12[0]*ir;
       r12hat[1]=r12[1]*ir;
       r12hat[2]=r12[2]*ir;
 
       numtyp a2[9];
       gpu_quat_to_mat_trans(j,a2);
   
       numtyp u_r, dUr[3], eta;
      { // Compute U_r, dUr, and eta
         // Compute g12
         numtyp g12[9];
         {
           {
             numtyp g2[9];
             gpu_shape_times3(jtype,a2,g12);
             gpu_transpose_times3(a2,g12,g2);
             g12[0]=g2[0]+oner;
             g12[4]=g2[4]+oner;
             g12[8]=g2[8]+oner;
             g12[1]=g2[1];
             g12[2]=g2[2];
             g12[3]=g2[3];
             g12[5]=g2[5];
             g12[6]=g2[6];
             g12[7]=g2[7];    
           }
   
           { // Compute U_r and dUr
     
             // Compute kappa
             numtyp kappa[3];
             gpu_mldivide3(g12,r12,kappa,err_flag);
 
             // -- kappa is now / r
             kappa[0]*=ir;
             kappa[1]*=ir;
             kappa[2]*=ir;
   
             // energy
   
             // compute u_r and dUr
             numtyp uslj_rsq;
             {
               // Compute distance of closest approach
               numtyp h12, sigma12;
               sigma12 = gpu_dot3(r12hat,kappa);
               sigma12 = rsqrt((numtyp)0.5*sigma12);
               h12 = r-sigma12;
 
               // -- kappa is now ok
               kappa[0]*=r;
               kappa[1]*=r;
               kappa[2]*=r;
           
               numtyp sigma = _sigma_<numtyp>(itype,jtype);
               numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
               numtyp varrho = sigma/(h12+gum[0]*sigma);
               numtyp varrho6 = varrho*varrho*varrho;
               varrho6*=varrho6;
               numtyp varrho12 = varrho6*varrho6;
               u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
 
               numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
               temp1 = temp1*(numtyp)24.0*epsilon;
               uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
               numtyp temp2 = gpu_dot3(kappa,r12hat);
               uslj_rsq = uslj_rsq*ir*ir;
 
               dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
               dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
               dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
             }
           }
         }
      
         // Compute eta
         {
           eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
           numtyp det_g12 = gpu_det3(g12);
           eta = pow(eta/det_g12,gum[1]);
         }
       }
   
       numtyp chi, dchi[3];
       { // Compute chi and dchi
 
         // Compute b12
         numtyp b12[9];
         {
           numtyp b2[9];
           gpu_well_times3(jtype,a2,b12);
           gpu_transpose_times3(a2,b12,b2);
           b12[0]=b2[0]+one_well;
           b12[4]=b2[4]+one_well;
           b12[8]=b2[8]+one_well;
           b12[1]=b2[1];
           b12[2]=b2[2];
           b12[3]=b2[3];
           b12[5]=b2[5];
           b12[6]=b2[6];
           b12[7]=b2[7];    
         }
 
         // compute chi_12
         numtyp iota[3];
         gpu_mldivide3(b12,r12,iota,err_flag);
         // -- iota is now iota/r
         iota[0]*=ir;
         iota[1]*=ir;
         iota[2]*=ir;
         chi = gpu_dot3(r12hat,iota);
         chi = pow(chi*(numtyp)2.0,gum[2]);
 
         // -- iota is now ok
         iota[0]*=r;
         iota[1]*=r;
         iota[2]*=r;
 
         numtyp temp1 = gpu_dot3(iota,r12hat);
         numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
         dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
         dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
         dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
       }
 
       numtyp temp2 = factor_lj*eta*chi;
       if (eflag)
         energy+=u_r*temp2;
       numtyp temp1 = -eta*u_r*factor_lj;
       if (vflag) {
         r12[0]*=-1;
         r12[1]*=-1;
         r12[2]*=-1;
         numtyp ft=temp1*dchi[0]-temp2*dUr[0];
         fx+=ft;
         virial[0]+=r12[0]*ft;
         ft=temp1*dchi[1]-temp2*dUr[1];
         fy+=ft;
         virial[1]+=r12[1]*ft;
         virial[3]+=r12[0]*ft;
         ft=temp1*dchi[2]-temp2*dUr[2];
         fz+=ft;
         virial[2]+=r12[2]*ft;
         virial[4]+=r12[0]*ft;
         virial[5]+=r12[1]*ft;
       } else {
         fx+=temp1*dchi[0]-temp2*dUr[0];
         fy+=temp1*dchi[1]-temp2*dUr[1];
         fz+=temp1*dchi[2]-temp2*dUr[2];
       }
     } // for nbor
 
     // Store answers
     acctyp *ap1=ans+ii;
     if (eflag) {
       *ap1=energy;
       ap1+=ans_pitch;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1=virial[i];
         ap1+=ans_pitch;
       }
     }
     *ap1=fx;
     ap1+=ans_pitch;
     *ap1=fy;
     ap1+=ans_pitch;
     *ap1=fz;
   } // if ii
 }
 
 template<class numtyp, class acctyp>
 __global__ void kernel_lj(const numtyp *special_lj, const int *dev_nbor, 
                           const int *dev_ij, const int nbor_pitch, acctyp *ans, 
                           size_t ans_pitch, int *err_flag, const bool eflag, 
                           const bool vflag, const int start, const int inum,
                           const int nall) {
   __shared__ numtyp sp_lj[4];                              
   
   // ii indexes the atom i handled by this thread
   int ii=threadIdx.x;
   if (ii<4)
     sp_lj[ii]=special_lj[ii];    
   ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
 
   if (ii<inum) {
   
     acctyp energy=(numtyp)0;
     acctyp fx=(numtyp)0;
     acctyp fy=(numtyp)0;
     acctyp fz=(numtyp)0;
     acctyp virial[6];
     for (int i=0; i<6; i++)
       virial[i]=(numtyp)0;
   
     const int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
   
     numtyp ix=_x_<numtyp>(i,0);
     numtyp iy=_x_<numtyp>(i,1);
     numtyp iz=_x_<numtyp>(i,2);
     int itype=_x_<numtyp>(i,7);
 
     numtyp factor_lj;
     for ( ; list<list_end; list++) {
   
       int j=*list;
       if (j < nall) 
-        factor_lj = 1.0;
+        factor_lj = (numtyp)1.0;
       else {
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
       int jtype=_x_<numtyp>(j,7);
 
       // Compute r12
       numtyp delx = ix-_x_<numtyp>(j,0);
       numtyp dely = iy-_x_<numtyp>(j,1);
       numtyp delz = iz-_x_<numtyp>(j,2);
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
         
       if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
           _form_(itype,jtype)==SPHERE_SPHERE) {
         r2inv=(numtyp)1.0/r2inv;
         numtyp r6inv = r2inv*r2inv*r2inv;
         numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
                                     _lj1_<numtyp>(itype,jtype).y);
         force*=factor_lj;
       
         fx+=delx*force;
         fy+=dely*force;
         fz+=delz*force;
 
         if (eflag) {
           numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
                           _lj3_<numtyp>(itype,jtype).y);
           energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
         }
         if (vflag) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
 
     // Store answers
     acctyp *ap1=ans+ii;
     if (eflag) {
       *ap1+=energy;
       ap1+=ans_pitch;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1+=virial[i];
         ap1+=ans_pitch;
       }
     }
     *ap1+=fx;
     ap1+=ans_pitch;
     *ap1+=fy;
     ap1+=ans_pitch;
     *ap1+=fz;
 
   } // if ii
 }
 
 template<class numtyp, class acctyp>
 __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor, 
                                const int *dev_ij, const int nbor_pitch, 
                                acctyp *ans, size_t ans_pitch,int *err_flag,
                                const bool eflag, const bool vflag, 
                                const int start, const int inum, const int nall){
   // ii indexes the two interacting particles in gi
   int ii=threadIdx.x;
   __shared__ numtyp sp_lj[4];                              
   __shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   if (ii<4)
     sp_lj[ii]=special_lj[ii];    
   if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     int itype=ii/MAX_SHARED_TYPES;
     int jtype=ii%MAX_SHARED_TYPES;
     cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
     form[ii]=_form_(itype,jtype);
     lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
     lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
     if (eflag) {
       lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
       lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
       offset[ii]=_offset_<numtyp>(itype,jtype);
     }
   }
   ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
   
   if (ii<inum) {
   
     acctyp energy=(numtyp)0;
     acctyp fx=(numtyp)0;
     acctyp fy=(numtyp)0;
     acctyp fz=(numtyp)0;
     acctyp virial[6];
     for (int i=0; i<6; i++)
       virial[i]=(numtyp)0;
   
     const int *nbor=dev_nbor+ii;
     int i=*nbor;
     nbor+=nbor_pitch;
     int numj=*nbor;
     nbor+=nbor_pitch;
     const int *list=dev_ij+*nbor;
     const int *list_end=list+numj;
   
     numtyp ix=_x_<numtyp>(i,0);
     numtyp iy=_x_<numtyp>(i,1);
     numtyp iz=_x_<numtyp>(i,2);
     int itype=INT_MUL(MAX_SHARED_TYPES,_x_<numtyp>(i,7));
 
     numtyp factor_lj;
     for ( ; list<list_end; list++) {
   
       int j=*list;
       if (j < nall) 
-        factor_lj = 1.0;
+        factor_lj = (numtyp)1.0;
       else {
         factor_lj = sp_lj[j/nall];
         j %= nall;
       }
       int mtype=itype+_x_<numtyp>(j,7);
 
       // Compute r12
       numtyp delx = ix-_x_<numtyp>(j,0);
       numtyp dely = iy-_x_<numtyp>(j,1);
       numtyp delz = iz-_x_<numtyp>(j,2);
       numtyp r2inv = delx*delx+dely*dely+delz*delz;
         
       if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
         r2inv=(numtyp)1.0/r2inv;
         numtyp r6inv = r2inv*r2inv*r2inv;
         numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
       
         fx+=delx*force;
         fy+=dely*force;
         fz+=delz*force;
 
         if (eflag) {
           numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
           energy+=factor_lj*(e-offset[mtype]); 
         }
         if (vflag) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
           virial[2] += delz*delz*force;
           virial[3] += delx*dely*force;
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
       }
 
     } // for nbor
 
     // Store answers
     acctyp *ap1=ans+ii;
     if (eflag) {
       *ap1+=energy;
       ap1+=ans_pitch;
     }
     if (vflag) {
       for (int i=0; i<6; i++) {
         *ap1+=virial[i];
         ap1+=ans_pitch;
       }
     }
     *ap1+=fx;
     ap1+=ans_pitch;
     *ap1+=fy;
     ap1+=ans_pitch;
     *ap1+=fz;
 
   } // if ii
 }
 
 #endif
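
Both LJ kernels above decode special-bond information that the neighbor build packs into the index itself: a neighbor index j >= nall carries the special-bond slot in j/nall, so sp_lj[j/nall] gives the scale factor and j %= nall restores the real atom index. A minimal host-side sketch of the same arithmetic, using made-up values for nall, the sp_lj table, and the encoded index:

#include <cstdio>

int main() {
  const int nall = 1000;                      // local + ghost atoms (illustrative)
  float sp_lj[4] = {1.0f, 0.0f, 0.0f, 0.5f};  // sp_lj[1..3]: 1-2 / 1-3 / 1-4 factors
  int j = 2*nall + 37;                        // encoded neighbor: slot 2, atom 37

  float factor_lj;
  if (j < nall)
    factor_lj = 1.0f;                         // ordinary neighbor, full interaction
  else {
    factor_lj = sp_lj[j/nall];                // special-bond scale factor
    j %= nall;                                // recover the real atom index
  }
  printf("atom %d, factor %g\n", j, factor_lj);  // prints: atom 37, factor 0
  return 0;
}
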
diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu
index 021cd85e9..aab5ea55a 100644
--- a/lib/gpu/gb_gpu_memory.cu
+++ b/lib/gpu/gb_gpu_memory.cu
@@ -1,146 +1,138 @@
 /***************************************************************************
                               gb_gpu_memory.cu
                              -------------------
                                W. Michael Brown
 
   Global variables for GPU Gayberne Library
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Thu Jun 25 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #include "gb_gpu_memory.h"
 #define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
   this->atom.atom_fields(8); 
   this->atom.ans_fields(13); 
   this->nbor.packing(true);
 }
 
 template <class numtyp, class acctyp>
 GB_GPU_MemoryT::~GB_GPU_Memory() { 
   clear(); 
 }
  
 template <class numtyp, class acctyp>
 int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes, 
                           const double gamma, const double upsilon, 
                           const double mu, double **host_shape,
                           double **host_well, double **host_cutsq, 
                           double **host_sigma, double **host_epsilon, 
                           double *host_lshape, int **h_form, double **host_lj1, 
                           double **host_lj2, double **host_lj3, 
                           double **host_lj4, double **host_offset, 
                           double *host_special_lj, const int max_nbors, 
                           const bool force_d, const int me) {
   if (this->allocated)
     clear();
     
   LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
                        host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
                        host_special_lj, max_nbors, me);
     
   host_form=h_form;
     
   // Initialize timers for the selected GPU
   time_kernel.init();
   time_gayberne.init();
   time_kernel2.init();
   time_gayberne2.init();
     
   // Use the write buffer from atom for data initialization
   NVC_HostT &host_write=this->atom.host_write;
   assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
 
   // Allocate, cast and asynchronous memcpy of constant data
   gamma_upsilon_mu.safe_alloc(3);
   host_write[0]=static_cast<numtyp>(gamma); 
   host_write[1]=static_cast<numtyp>(upsilon);
   host_write[2]=static_cast<numtyp>(mu);
   gamma_upsilon_mu.copy_from_host(host_write.begin());
 
-  lshape.safe_alloc(ntypes);
+  lshape.safe_alloc(ntypes,lshape_get_texture<numtyp>());
   lshape.cast_copy(host_lshape,host_write);
   lshape.copy_from_host(host_write.begin());
     
   // Copy shape, well, sigma, epsilon, and cutsq onto GPU
-  shape.safe_alloc(ntypes,3);
+  shape.safe_alloc(ntypes,3,shape_get_texture<numtyp>());
   shape.cast_copy(host_shape[0],host_write);
-  well.safe_alloc(ntypes,3);
+  well.safe_alloc(ntypes,3,well_get_texture<numtyp>());
   well.cast_copy(host_well[0],host_write);
 
   // Copy LJ data onto GPU
   int lj_types=ntypes;
   if (lj_types<=MAX_SHARED_TYPES)
     lj_types=MAX_SHARED_TYPES;
-  form.safe_alloc(lj_types,lj_types);
+  form.safe_alloc(lj_types,lj_types,form_get_texture());
   form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);
 
   // See if we want fast GB-sphere or sphere-sphere calculations
   multiple_forms=false;
   for (int i=1; i<ntypes; i++)
     for (int j=i; j<ntypes; j++) 
       if (host_form[i][j]!=ELLIPSE_ELLIPSE)
         multiple_forms=true;
         
   // Memory for ilist ordered by particle type
   host_olist.safe_alloc_rw(this->max_atoms);
 
-  // Bind constant data to textures
-  lshape_bind_texture<numtyp>(lshape);
-  shape_bind_texture<numtyp>(shape);
-  well_bind_texture<numtyp>(well);
-  form_bind_texture(form);
-
   return this->nbor.host_ij.begin();
 }
   
 template <class numtyp, class acctyp>
 void GB_GPU_MemoryT::clear() {
   if (!this->allocated)
     return;
 
   int err_flag;
   this->dev_error.copy_to_host(&err_flag);
   if (err_flag == 1)
     std::cerr << "COLLISION BUFFER OVERFLOW OCCURRED. INCREASE COLLISION_N "
               << "and RECOMPILE.\n";
   else if (err_flag == 2)
     std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";  
 
   LJ_GPU_MemoryT::clear();      
   
-  shape_unbind_texture<numtyp>();
-  well_unbind_texture<numtyp>();
-  form_unbind_texture();
+  lshape.unbind();
 
   shape.clear();
   well.clear();
   form.clear();
   lshape.clear();
   gamma_upsilon_mu.clear();
   host_olist.clear();
 }  
  
 template <class numtyp, class acctyp>
 double GB_GPU_MemoryT::host_memory_usage() {
   return this->atom.host_memory_usage(this->max_atoms)+
          this->nbor.host_memory_usage()+4*sizeof(numtyp)+
          sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
 }
 
 template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;
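
The init() above now hands each constant array a texture reference at allocation time via the *_get_texture<numtyp>() helpers declared in pair_gpu_texture.h, and clear() only unbinds lshape explicitly because NVC_ConstMat::clear() (see the nvc_memory.h hunk below) unbinds its own texture while NVC_Vec::clear() does not. pair_gpu_texture.h itself is not part of this patch, so the following is only a hedged sketch of what such a getter could look like; the texture object name is an assumption:

// Hypothetical getter in the style of lshape_get_texture<numtyp>(); the
// texture name below is illustrative only.
static texture<float, 1, cudaReadModeElementType> lshape_float_tex;

template <class numtyp> inline textureReference * lshape_get_texture();
template <> inline textureReference * lshape_get_texture<float>() {
  // In host code texture<> publicly derives from textureReference, so its
  // address can be stored by assign_texture() and bound later by bind().
  return &lshape_float_tex;
}
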
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index 8886931c1..4d607ed14 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -1,74 +1,73 @@
 /***************************************************************************
                                gb_gpu_memory.h
                              -------------------
                                W. Michael Brown
 
   Global variables for GPU Gayberne Library
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Thu Jun 25 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #ifndef GB_GPU_MEMORY_H
 #define GB_GPU_MEMORY_H
 
 #define MAX_GPU_THREADS 4
 #include "lj_gpu_memory.h"
-#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp,acctyp>
 
 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
 
 template <class numtyp, class acctyp>
-class GB_GPU_Memory : public LJ_GPU_MemoryT {
+class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
  public:
   GB_GPU_Memory();
   ~GB_GPU_Memory(); 
  
   int* init(const int ij_size, const int ntypes, const double gamma,
             const double upsilon, const double mu, double **host_shape,
             double **host_well, double **host_cutsq, double **host_sigma, 
             double **host_epsilon, double *host_lshape, int **h_form,
             double **host_lj1, double **host_lj2, double **host_lj3, 
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int max_nbors, const bool force_d, const int me);
 
   void clear();
  
   double host_memory_usage();
   
   // ----------------------------  DATA  ----------------------------
 
   // ilist with particles sorted by type
   NVC_HostI host_olist;
 
   // --------------- Const Data for Atoms
   NVC_ConstMatT shape, well;
   NVC_ConstMatI form;
   NVC_VecT lshape, gamma_upsilon_mu;
   
 
   // --------------- Timing Stuff
   NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;
   
   // True if we want to use fast GB-sphere or sphere-sphere calculations 
   bool multiple_forms;
   int **host_form;
   int last_ellipse;
    
  private:
 };
 
 #endif
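
Because GB_GPU_Memory now derives from the dependent base LJ_GPU_Memory<numtyp,acctyp> spelled out in full (the local LJ_GPU_MemoryT macro is gone from this header), inherited members must be reached through this-> in template code, as gb_gpu_memory.cu does with this->atom and this->nbor. A small sketch of that lookup rule, compiled with nvcc and using a hypothetical class name:

#include "lj_gpu_memory.h"

// Illustrative only; GB_Sketch is not part of the library.
template <class numtyp, class acctyp>
class GB_Sketch : public LJ_GPU_Memory<numtyp,acctyp> {
 public:
  void configure() {
    // Unqualified names are not looked up in a dependent base during template
    // parsing, so plain "atom.atom_fields(8);" would fail on a conforming
    // compiler; this-> makes the name dependent and defers the lookup.
    this->atom.atom_fields(8);
    this->nbor.packing(true);
  }
};
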
 
diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index ea5ba440c..94d826e14 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -1,234 +1,235 @@
 /***************************************************************************
                                   lj_gpu.cu
                              -------------------
                                W. Michael Brown
 
   Lennard-Jones potential GPU calculation
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Aug 4 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #include <iostream>
 #include <cassert>
 #include "nvc_macros.h"
 #include "nvc_timer.h"
 #include "nvc_device.h"
-#include "lj_gpu_memory.h"
+#include "pair_gpu_texture.h"
+#include "lj_gpu_memory.cu"
 #include "lj_gpu_kernel.h"
 
 using namespace std;
 
 static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
 #define LJMT LJ_GPU_Memory<numtyp,acctyp>
 
 // ---------------------------------------------------------------------------
 // Convert something to a string
 // ---------------------------------------------------------------------------
 #include <sstream>
 template <class t>
 inline string lj_gpu_toa(const t& in) {
   ostringstream o;
   o.precision(2);
   o << in;
   return o.str();
 }
 
 // ---------------------------------------------------------------------------
 // Return string with GPU info
 // ---------------------------------------------------------------------------
 string lj_gpu_name(const int id, const int max_nbors) {
   string name=LJMF.gpu.name(id)+", "+
               lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
               lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
               lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ, "+
               lj_gpu_toa(LJMF.get_max_atoms(LJMF.gpu.bytes(id),
                                             max_nbors))+" Atoms";
   return name;
 }
 
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
 int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, double **sigma,
                   double **epsilon, double **host_lj1, double **host_lj2, 
                   double **host_lj3, double **host_lj4, double **offset, 
                   double *special_lj, const int max_nbors, const int gpu_id) {
   if (LJMF.gpu.num_devices()==0)
     return 0;                   
 
   ij_size=IJ_SIZE;
   return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2, 
                    host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
 }
 
 // ---------------------------------------------------------------------------
 // Clear memory on host and device
 // ---------------------------------------------------------------------------
 void lj_gpu_clear() {
   LJMF.clear();
 }
 
 // ---------------------------------------------------------------------------
 // copy atom positions and optionally types to device
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 inline void _lj_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
                          const int *host_type, const bool rebuild,
                          cudaStream_t &stream) {
   atom.time_atom.start();
   atom.reset_write_buffer();
   
   // First row of dev_x is x position, second is y, third is z
   atom.add_atom_data(host_x[0],3);
   atom.add_atom_data(host_x[0]+1,3);
   atom.add_atom_data(host_x[0]+2,3);
 
   int csize=3;
 
   // If a rebuild occurred, copy type data
   if (rebuild) {
     atom.add_atom_data(host_type);
     csize++;
   }
   
   atom.copy_atom_data(csize,stream);
   atom.time_atom.stop();
 }
 
 void lj_gpu_atom(double **host_x, const int *host_type, const bool rebuild) {
   _lj_gpu_atom(LJMF.atom, host_x, host_type, rebuild, LJMF.pair_stream);
 }
 
 // ---------------------------------------------------------------------------
 // Signal that we need to transfer a new neighbor list
 // ---------------------------------------------------------------------------
 template <class LJMTyp>
 bool _lj_gpu_reset_nbors(LJMTyp &ljm, const int nall, const int inum, 
                          int *ilist, const int *numj) {
   if (nall>ljm.max_atoms)
     return false;
   
   ljm.nbor.time_nbor.start();
 
   ljm.atom.nall(nall);
   ljm.atom.inum(inum);
   ljm.nbor.reset(inum,ilist,numj,ljm.pair_stream);
 
   ljm.nbor.time_nbor.stop();
   return true;
 }
 
 bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist, 
                         const int *numj) {
   return _lj_gpu_reset_nbors(LJMF,nall,inum,ilist,numj);
 }
 
 // ---------------------------------------------------------------------------
 // Copy a set of ij_size ij entries of the neighbor list to the device
 // ---------------------------------------------------------------------------
 template <class LJMTyp>
 void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
   ljm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
   
   ljm.nbor.time_nbor.start();
   ljm.nbor.add(num_ij,ljm.pair_stream);
   ljm.nbor.time_nbor.stop();
 }
 
 void lj_gpu_nbors(const int num_ij) {
   _lj_gpu_nbors(LJMF,num_ij);
 }
 
 // ---------------------------------------------------------------------------
 // Calculate energies and forces for all ij interactions
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 void _lj_gpu(LJMT &ljm, const bool eflag, const bool vflag, const bool rebuild){
   // Compute the block size and grid size to keep all cores busy
   const int BX=BLOCK_1D;
 
   int GX=static_cast<int>(ceil(static_cast<double>(ljm.atom.inum())/BX));
 
   ljm.time_pair.start();
   if (ljm.shared_types)
     kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
            (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), 
             ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), 
             ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag,
             vflag, ljm.atom.inum(), ljm.atom.nall());
   else
     kernel_lj<numtyp,acctyp><<<GX,BX,0,ljm.pair_stream>>>
            (ljm.special_lj.begin(), ljm.nbor.dev_nbor.begin(), 
             ljm.nbor.ij.begin(), ljm.nbor.dev_nbor.row_size(), 
             ljm.atom.ans.begin(), ljm.atom.ans.row_size(), eflag, 
             vflag, ljm.atom.inum(), ljm.atom.nall());
   ljm.time_pair.stop();
 }
 
 void lj_gpu(const bool eflag, const bool vflag, const bool rebuild) {
   _lj_gpu<PRECISION,ACC_PRECISION>(LJMF,eflag,vflag,rebuild);
 }
 
 // ---------------------------------------------------------------------------
 // Get energies and forces to host
 // ---------------------------------------------------------------------------
 template<class numtyp, class acctyp>
 double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
                       const bool eflag, const bool vflag, const bool eflag_atom,
                       const bool vflag_atom, double *eatom, double **vatom,
                       double *virial) {
   double evdw;
   
   ljm.atom.time_answer.start();
   ljm.atom.copy_answers(eflag,vflag,ljm.pair_stream);
 
   ljm.atom.time_atom.add_to_total();
   ljm.nbor.time_nbor.add_to_total();
   ljm.time_pair.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
 
   evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   ljm.atom.add_forces(ilist,f);
   ljm.atom.time_answer.stop();
   ljm.atom.time_answer.add_to_total();
   return evdw;
 }
 
 double lj_gpu_forces(double **f, const int *ilist, const bool eflag, 
                      const bool vflag, const bool eflag_atom,
                      const bool vflag_atom, double *eatom, double **vatom,
                      double *virial) {
   return _lj_gpu_forces<PRECISION,ACC_PRECISION> 
     (LJMF,f,ilist,eflag,vflag,eflag_atom,vflag_atom,eatom,vatom,virial);
 }
 
 void lj_gpu_time() {
   cout.precision(4);
   cout << "Atom copy:     " << LJMF.atom.time_atom.total_seconds() << " s.\n";
   cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
   cout << "LJ calc:       " << LJMF.time_pair.total_seconds() << " s.\n";
   cout << "Answer copy:   " << LJMF.atom.time_answer.total_seconds() << " s.\n";
 }
 
 int lj_gpu_num_devices() {
   return LJMF.gpu.num_devices();
 }
 
 double lj_gpu_bytes() {
   return LJMF.host_memory_usage();
 }
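
Taken together, the exported lj_gpu_* functions in this file form the per-step driver sequence a host-side pair style is expected to call. The sketch below only illustrates that order, assuming the library's public declarations are in scope; the buffer sizes, flags, and the chunked lj_gpu_nbors loop are handled by the real caller, which is not part of this patch:

// Illustrative call order only; all arguments come from the caller.
void sketch_one_step(double **x, const int *type, bool rebuild,
                     int nall, int inum, int *ilist, const int *numj,
                     int num_ij, double **f, double *eatom, double **vatom,
                     double *virial) {
  lj_gpu_atom(x, type, rebuild);                   // positions (and types) to GPU
  if (rebuild) {
    lj_gpu_reset_nbors(nall, inum, ilist, numj);   // start a fresh neighbor list
    lj_gpu_nbors(num_ij);                          // repeat once per ij_size chunk
  }
  lj_gpu(true, true, rebuild);                     // launch kernel (eflag, vflag)
  double evdwl = lj_gpu_forces(f, ilist, true, true, false, false,
                               eatom, vatom, virial); // answers back to the host
  (void)evdwl;
}
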
diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu
index 484e28e86..5e8bf4cdc 100644
--- a/lib/gpu/lj_gpu_memory.cu
+++ b/lib/gpu/lj_gpu_memory.cu
@@ -1,167 +1,152 @@
 /***************************************************************************
                                lj_gpu_memory.cu
                              -------------------
                                W. Michael Brown
 
   Global variables for GPU Lennard-Jones Library
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Aug 4 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #include "lj_gpu_memory.h"
 #define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
 int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
   return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors); 
 }
 
 template <class numtyp, class acctyp>
 int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
   int matoms=static_cast<int>(PERCENT_GPU_MEMORY*gpu_bytes/
                               bytes_per_atom(max_nbors));
   if (matoms>MAX_ATOMS)
     matoms=MAX_ATOMS;
   return matoms;
 }
   
 template <class numtyp, class acctyp>
 int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes, 
                           double **host_cutsq, double **host_sigma, 
                           double **host_epsilon, double **host_lj1, 
                           double **host_lj2, double **host_lj3, 
                           double **host_lj4, double **host_offset, 
                           double *host_special_lj, const int max_nbors, 
                           const int me) {
   if (allocated)
     clear();
     
   if (me>=gpu.num_devices())
     return 0;
   gpu.set(me);
   if (gpu.revision()<1.0)
     return 0;  
     
   // Initialize timers for the selected GPU
   time_pair.init();
 
   // Initialize atom and nbor data
   max_atoms=get_max_atoms(gpu.bytes(),max_nbors);
   atom.init(max_atoms);
   nbor.init(ij_size,max_atoms,max_nbors);
   
   // Get a stream for computing pair potentials
   CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
     
   // Use the write buffer from atom for data initialization
   NVC_HostT &host_write=atom.host_write;
   assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
 
   // Copy data for bonded interactions
   special_lj.safe_alloc(4);
   special_lj.cast_copy(host_special_lj,host_write);
 
   // Copy sigma, epsilon, and cutsq onto GPU
-  sigma.safe_alloc(ntypes,ntypes);
+  sigma.safe_alloc(ntypes,ntypes,sigma_get_texture<numtyp>());
   sigma.cast_copy(host_sigma[0],host_write);
-  epsilon.safe_alloc(ntypes,ntypes);
+  epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture<numtyp>());
   epsilon.cast_copy(host_epsilon[0],host_write);
-  cutsq.safe_alloc(ntypes,ntypes);
+  cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture<numtyp>());
   cutsq.cast_copy(host_cutsq[0],host_write);
 
   // If atom type constants fit in shared memory use fast kernel
   int lj_types=ntypes;
   shared_types=false;
   if (lj_types<=MAX_SHARED_TYPES) {
     lj_types=MAX_SHARED_TYPES;
     shared_types=true;
   }
-  offset.safe_alloc(lj_types,lj_types);
+  offset.safe_alloc(lj_types,lj_types,offset_get_texture<numtyp>());
   offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
   double *t1=host_lj1[0];
   double *t2=host_lj2[0];
-  for (int i=0; i<lj_types*lj_types; i++) {
+  for (int i=0; i<ntypes*ntypes; i++) {
     host_write[i*2]=t1[i];
     host_write[i*2+1]=t2[i];
   }
-  lj1.safe_alloc(lj_types,lj_types);
-  lj1.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
+  lj1.safe_alloc(lj_types,lj_types,lj1_get_texture<numtyp>());
+  lj1.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
                        ntypes,ntypes);
   t1=host_lj3[0];
   t2=host_lj4[0];
-  for (int i=0; i<lj_types*lj_types; i++) {
+  for (int i=0; i<ntypes*ntypes; i++) {
     host_write[i*2]=t1[i];
     host_write[i*2+1]=t2[i];
   }
-  lj3.safe_alloc(lj_types,lj_types);
-  lj3.copy_2Dfrom_host(reinterpret_cast<typename cu_vec_traits<numtyp>::vec2 *> (host_write.begin()),
+  lj3.safe_alloc(lj_types,lj_types,lj3_get_texture<numtyp>());
+  lj3.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
                        ntypes,ntypes);
         
-  // Bind constant data to textures
-  sigma_bind_texture<numtyp>(sigma);
-  epsilon_bind_texture<numtyp>(epsilon);
-  cutsq_bind_texture<numtyp>(cutsq);
-  offset_bind_texture<numtyp>(offset);
-  lj1_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj1);
-  lj3_bind_texture<typename cu_vec_traits<numtyp>::vec2>(lj3);
-
   dev_error.safe_alloc(1);
   dev_error.zero();
     
   allocated=true;
 
   return nbor.host_ij.begin();
 }
   
 template <class numtyp, class acctyp>
 void LJ_GPU_MemoryT::clear() {
   if (!allocated)
     return;
   allocated=false;
       
   // Check for any pair style specific errors here
   int err_flag;
   dev_error.copy_to_host(&err_flag);
  
   atom.clear();
   nbor.clear();
     
-  sigma_unbind_texture<numtyp>();
-  epsilon_unbind_texture<numtyp>();
-  cutsq_unbind_texture<numtyp>();
-  offset_unbind_texture<numtyp>();
-  lj1_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();
-  lj3_unbind_texture<typename cu_vec_traits<numtyp>::vec2>();
-
   CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));
 
   dev_error.clear();
   sigma.clear();
   epsilon.clear();
   special_lj.clear();
   cutsq.clear();
   offset.clear();
   lj1.clear();
   lj3.clear();
 }  
  
 template <class numtyp, class acctyp>
 double LJ_GPU_MemoryT::host_memory_usage() const {
   return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
          sizeof(LJ_GPU_Memory<numtyp,acctyp>);
 }
 
 template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;
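
Two details of the coefficient setup above are worth noting: the interleave loops now run over ntypes*ntypes, the actual size of the host tables, instead of lj_types*lj_types, which can be larger whenever ntypes < MAX_SHARED_TYPES and would then read past the host arrays; and the lj1/lj2 and lj3/lj4 pairs are packed into the vec2 type chosen by nvc_vec_traits so a single texture fetch returns both coefficients. A minimal sketch of that packing step on its own (function and array names are illustrative; the sketch assumes an nvcc translation unit so float2/double2 are available):

#include "nvc_traits.h"

// Interleave two ntypes*ntypes coefficient tables into one vec2 buffer, as is
// done for lj1/lj2 and lj3/lj4 before the 2D copy to the device.
template <class numtyp>
void pack_vec2(const double *a, const double *b, const int ntypes,
               typename nvc_vec_traits<numtyp>::vec2 *out) {
  for (int i = 0; i < ntypes*ntypes; i++) {
    out[i].x = static_cast<numtyp>(a[i]);   // e.g. lj1
    out[i].y = static_cast<numtyp>(b[i]);   // e.g. lj2
  }
}
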
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index d6f3e1cf1..82a87ea6a 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -1,88 +1,89 @@
 /***************************************************************************
                                lj_gpu_memory.h
                              -------------------
                                W. Michael Brown
 
   Global variables for GPU Lennard-Jones Library
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Aug 4 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #ifndef LJ_GPU_MEMORY_H
 #define LJ_GPU_MEMORY_H
 
 #include "nvc_device.h"
+#include "nvc_traits.h"
 #include "pair_gpu_atom.h"
 #include "pair_gpu_nbor.h"
 
 #define BLOCK_1D 64
 #define MAX_SHARED_TYPES 8
 #define PERCENT_GPU_MEMORY 0.7
 
 template <class numtyp, class acctyp>
 class LJ_GPU_Memory {
  public:
   LJ_GPU_Memory() : allocated(false) {}
   ~LJ_GPU_Memory() { clear(); }
  
   /// Allocate memory on host and device
   int* init(const int ij_size, const int ntypes, double **host_cutsq, 
             double **host_sigma, double **host_epsilon, 
             double **host_lj1, double **host_lj2, double **host_lj3, 
             double **host_lj4, double **host_offset, double *host_special_lj,
             const int max_nbors, const int me);
   /// Free any memory on host and device
   void clear();
 
   /// Returns memory usage on GPU per atom
   int bytes_per_atom(const int max_nbors) const;
   /// Maximum number of atoms that can be stored on GPU
   int get_max_atoms(const size_t gpu_bytes, const int max_nbors);
   /// Total host memory used by library
   double host_memory_usage() const;
   
   // -------------------------   DATA   -----------------------------
 
   // Device Properties
   NVCDevice gpu;
   // Device Error Flag
   NVC_VecI dev_error;
   // Stream for asynchronous work
   cudaStream_t pair_stream;
   
   // Atom Data
   PairGPUAtom<numtyp,acctyp> atom;
   // Neighbor Data
   PairGPUNbor nbor;
   
   // --------------- Const Data for Atoms
   NVC_ConstMatT sigma, epsilon, cutsq, offset;
-  NVC_ConstMat< typename cu_vec_traits<numtyp>::vec2 > lj1, lj3;
+  NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
   NVC_VecT special_lj;
   
   size_t max_atoms;
   
   // Timing for pair calculation
   NVCTimer time_pair;
   
   // If atom type constants fit in shared memory, use fast kernels
   bool shared_types;
    
  protected:
   bool allocated;
 };
 
 #endif
diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h
index a83178985..0ff5229c0 100644
--- a/lib/gpu/nvc_memory.h
+++ b/lib/gpu/nvc_memory.h
@@ -1,447 +1,475 @@
 /***************************************************************************
                                 nvc_memory.h
                              -------------------
                                W. Michael Brown
 
   Routines for memory management on CUDA devices
 
  __________________________________________________________________________
     This file is part of the NVC Library
  __________________________________________________________________________
 
     begin                : Thu Jun 25 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #ifndef NVC_MEMORY_H
 #define NVC_MEMORY_H
 
 #include <iostream>
+#include "nvc_macros.h"
 
 #define NVC_HostT NVC_Host<numtyp>
 #define NVC_HostD NVC_Host<double>
 #define NVC_HostS NVC_Host<float>
 #define NVC_HostI NVC_Host<int>
 
 #define NVC_VecT NVC_Vec<numtyp>
 #define NVC_VecD NVC_Vec<double>
 #define NVC_VecS NVC_Vec<float>
 #define NVC_VecI NVC_Vec<int>
 #define NVC_VecI2 NVC_Vec<int2>
 #define NVC_VecU2 NVC_Vec<uint2>
 
 #define NVC_MatT NVC_Mat<numtyp>
 #define NVC_MatD NVC_Mat<double>
 #define NVC_MatS NVC_Mat<float>
 #define NVC_MatI NVC_Mat<int>
 
 #define NVC_ConstMatT NVC_ConstMat<numtyp>
 #define NVC_ConstMatD NVC_ConstMat<double>
 #define NVC_ConstMatS NVC_ConstMat<float>
 #define NVC_ConstMatI NVC_ConstMat<int>
 #define NVC_ConstMatD2 NVC_ConstMat<double2>
 
 namespace NVC {
 
 // Get a channel for float array
 template <class numtyp>
 inline void cuda_gb_get_channel(cudaChannelFormatDesc &channel) {
   channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 }
 
 // Get a channel for float2 array
 template <>
 inline void cuda_gb_get_channel<float2>(cudaChannelFormatDesc &channel) {
   channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
 }
 
 // Get a channel for double array
 template <>
 inline void cuda_gb_get_channel<double>(cudaChannelFormatDesc &channel) {
   channel = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
 }
 
 // Get a channel for double2 array
 template <>
 inline void cuda_gb_get_channel<double2>(cudaChannelFormatDesc &channel) {
   channel = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindSigned);
 }
 
 // Get a channel for int array
 template <>
 inline void cuda_gb_get_channel<int>(cudaChannelFormatDesc &channel) {
   channel = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 
 }
 
 /// Page-locked Row Vector on Host
 template <class numtyp>
 class NVC_Host {
  public:
   NVC_Host() { _cols=0; }
   ~NVC_Host() { if (_cols>0) CUDA_SAFE_CALL(cudaFreeHost(_array)); }
   
   // Allocate page-locked memory with fast write/slow read on host
   inline void safe_alloc_w(const size_t cols) {
     _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     CUDA_SAFE_CALL(cudaHostAlloc((void **)&_array,_row_bytes,
                    cudaHostAllocWriteCombined));
     _end=_array+cols;
   }
 
   // Allocate page-locked memory with fast read/write on host
   inline void safe_alloc_rw(const size_t cols) {
     _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     CUDA_SAFE_CALL(cudaMallocHost((void **)&_array,_row_bytes));
     _end=_array+cols;
   }
   
   /// Free any memory associated with device
   inline void clear() 
     { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFreeHost(_array)); } }
 
   /// Set each element to zero
   inline void zero() { memset(_array,0,row_bytes()); }
   
   /// Set first n elements to zero
   inline void zero(const int n) { memset(_array,0,n*sizeof(numtyp)); }
 
   inline numtyp * begin() { return _array; }
   inline const numtyp * begin() const { return _array; }
   inline numtyp * end() { return _end; }
   inline const numtyp * end() const { return _end; }
 
   inline size_t numel() const { return _cols; }
   inline size_t rows() const { return 1; }
   inline size_t cols() const { return _cols; }
   inline size_t row_size() const { return _cols; }
   inline size_t row_bytes() const { return _row_bytes; }
     
   inline numtyp & operator[](const int i) { return _array[i]; }
   inline const numtyp & operator[](const int i) const { return _array[i]; }
   
   /// Copy from device (numel is not bytes)
   inline void copy_from_device(const numtyp *device_p, size_t numel) {
     CUDA_SAFE_CALL(cudaMemcpy(_array,device_p,numel*sizeof(numtyp),
                               cudaMemcpyDeviceToHost));
   }
   
   /// Copy to device (numel is not bytes)
   inline void copy_to_device(numtyp *device_p, size_t numel) {
     CUDA_SAFE_CALL(cudaMemcpy(device_p,_array,numel*sizeof(numtyp),
                               cudaMemcpyHostToDevice));
   }
   
   /// Copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols) {
     CUDA_SAFE_CALL(cudaMemcpy2D(device_p,dev_row_size*sizeof(numtyp),
                                 _array,cols*sizeof(numtyp),
                                 cols*sizeof(numtyp),rows,
                                 cudaMemcpyHostToDevice));
   }
 
   /// Asynchronous copy from device (numel is not bytes)
   inline void copy_from_device(const numtyp *device_p, size_t numel,
                                cudaStream_t &stream) {
     CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
                                    cudaMemcpyDeviceToHost,stream));
   }
   
   /// Asynchronous copy to device (numel is not bytes)
   inline void copy_to_device(numtyp *device_p, size_t numel,
                              cudaStream_t &stream) {
     CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
                                    cudaMemcpyHostToDevice,stream));
   }
   
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
                                cudaStream_t &stream) {
     CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
                                      _array,cols*sizeof(numtyp),
                                      cols*sizeof(numtyp),rows,
                                      cudaMemcpyHostToDevice,stream));
   }
   
  private:
   numtyp *_array, *_end;
   size_t _row_bytes, _row_size, _rows, _cols;
 };
 
 /// Row vector on device 
 template <class numtyp>
 class NVC_Vec {
  public:
   NVC_Vec() { _cols=0; }
   ~NVC_Vec() { if (_cols>0) CUDA_SAFE_CALL(cudaFree(_array)); }
   
   // Row vector on device
   inline void safe_alloc(const size_t cols) {
     _cols=cols;
     _row_bytes=cols*sizeof(numtyp);
     CUDA_SAFE_CALL(cudaMalloc((void **)&_array,_row_bytes));
     _end=_array+cols;
   }
   
+  // Row vector on device (allocate and assign texture and bind)
+  inline void safe_alloc(const size_t cols, textureReference *t) 
+    { safe_alloc(cols); assign_texture(t); bind(); }
+
   /// Free any memory associated with device
   inline void clear() 
     { if (_cols>0) { _cols=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
 
   /// Set each element to zero
   inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0,row_bytes())); }
 
   inline numtyp * begin() { return _array; }
   inline const numtyp * begin() const { return _array; }
   inline numtyp * end() { return _end; }
   inline const numtyp * end() const { return _end; }
 
   inline size_t numel() const { return _cols; }
   inline size_t rows() const { return 1; }
   inline size_t cols() const { return _cols; }
   inline size_t row_size() const { return _cols; }
   inline size_t row_bytes() const { return _row_bytes; }
     
   /// Copy from host
   inline void copy_from_host(const numtyp *host_p)
     { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,row_bytes(), 
                                 cudaMemcpyHostToDevice)); }
 
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
     { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(), 
                      cudaMemcpyHostToDevice, stream)); }
 
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
     { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,row_bytes(),
                                 cudaMemcpyDeviceToHost)); }
 
   /// Copy n elements to host
   inline void copy_to_host(numtyp *host_p, const int n)
     { CUDA_SAFE_CALL(cudaMemcpy(host_p,_array,n*sizeof(numtyp),
                                 cudaMemcpyDeviceToHost)); }
 
   /// Cast and then copy to device
   template <class numtyp2>
   inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
     for (int i=0; i<numel(); i++)
       host_write[i]=static_cast<numtyp>(buffer[i]);
     copy_from_host(host_write.begin());
   }
   
+  /// Assign a texture to matrix
+  inline void assign_texture(textureReference *t) { _tex_ptr=t; }  
+
   /// Bind to texture
-  template <class texture>
-  inline void bind_texture(texture &texi, cudaChannelFormatDesc &channel) {
-    NVC::cuda_gb_get_channel<numtyp>(channel);
-    texi.addressMode[0] = cudaAddressModeClamp;
-    texi.addressMode[1] = cudaAddressModeClamp;
-    texi.filterMode = cudaFilterModePoint;
-    texi.normalized = false;
-    CUDA_SAFE_CALL(cudaBindTexture(NULL,&texi,_array,&channel));
+  inline void bind() {
+    NVC::cuda_gb_get_channel<numtyp>(_channel);
+    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
+    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
+    (*_tex_ptr).filterMode = cudaFilterModePoint;
+    (*_tex_ptr).normalized = false;
+    CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel));
   }
 
+  /// Unbind texture
+  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
+
   /// Output the vector (debugging)
   inline void print(std::ostream &out) { print (out, numel()); }    
 
   // Output first n elements of vector
   inline void print(std::ostream &out, const int n) {
     numtyp *t=new numtyp[n];
     copy_to_host(t,n);
     for (int i=0; i<n; i++)
       out << t[i] << " ";
     delete []t;
   }
     
  private:
   numtyp *_array, *_end;
   size_t _row_bytes, _row_size, _rows, _cols;
+  cudaChannelFormatDesc _channel;
+  textureReference *_tex_ptr;
 };
 
 /// 2D Matrix on device (can have extra column storage to get correct alignment)
 template <class numtyp>
 class NVC_Mat {
  public:
   NVC_Mat() { _rows=0; }
   ~NVC_Mat() { if (_rows>0) CUDA_SAFE_CALL(cudaFree(_array)); }
   
   // Row major matrix on device
   // - Coalesced access using adjacent cols on same row
   // - NVC_Mat(row,col) given by array[row*row_size()+col]
   inline void safe_alloc(const size_t rows, const size_t cols) {
     _rows=rows;
     _cols=cols;
     CUDA_SAFE_CALL(cudaMallocPitch((void **)&_array,&_pitch, 
                    cols*sizeof(numtyp),rows));
     _row_size=_pitch/sizeof(numtyp);
     _end=_array+_row_size*rows;
   }
   
   /// Free any memory associated with device
   inline void clear() 
     { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFree(_array)); } }
 
   /// Set each element to zero
   inline void zero() { CUDA_SAFE_CALL(cudaMemset(_array,0, _pitch*_rows)); }
 
   inline numtyp * begin() { return _array; }
   inline const numtyp * begin() const { return _array; }
   inline numtyp * end() { return _end; }
   inline const numtyp * end() const { return _end; }
 
 
   inline size_t numel() const { return _cols*_rows; }
   inline size_t rows() const { return _rows; }
   inline size_t cols() const { return _cols; }
   inline size_t row_size() const { return _row_size; }
   inline size_t row_bytes() const { return _pitch; }
     
   /// Copy from host (elements not bytes)
   inline void copy_from_host(const numtyp *host_p, const size_t numel)
     { CUDA_SAFE_CALL(cudaMemcpy(_array,host_p,numel*sizeof(numtyp), 
                      cudaMemcpyHostToDevice)); }
 
   /// Asynchronous copy from host (elements not bytes)
   inline void copy_from_host(const numtyp *host_p, const size_t numel,
                              cudaStream_t &stream)
     { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp), 
                      cudaMemcpyHostToDevice, stream)); }
 
   /// Asynchronous Copy from Host
   /** \note Used when the number of columns/rows allocated on the host is
     *       smaller than on the device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
     CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp), 
                                      cols*sizeof(numtyp), rows,
                                      cudaMemcpyHostToDevice,stream));
   }
 
  private:
   numtyp *_array, *_end;
   size_t _pitch, _row_size, _rows, _cols;
 };
 
 /// Const 2D Matrix on device (requires texture binding)
 template <class numtyp>
 class NVC_ConstMat {
  public:
   NVC_ConstMat() { _rows=0; }
   ~NVC_ConstMat() { if (_rows>0) CUDA_SAFE_CALL(cudaFreeArray(_array)); }
-  
+
+  /// Assign a texture to matrix
+  inline void assign_texture(textureReference *t) { _tex_ptr=t; }  
+      
   /// Row major matrix on device
   inline void safe_alloc(const size_t rows, const size_t cols) {
     _rows=rows;
     _cols=cols;
 
     NVC::cuda_gb_get_channel<numtyp>(_channel);
     CUDA_SAFE_CALL(cudaMallocArray(&_array, &_channel, cols, rows));
   }
   
+  /// Row major matrix on device (Allocate and bind texture)
+  inline void safe_alloc(const size_t rows, const size_t cols, 
+                         textureReference *t) 
+    { safe_alloc(rows,cols); assign_texture(t); bind(); }
+
   /// Bind to texture
-  template <class texture>
-  inline void bind_texture(texture &texi) {
-    texi.addressMode[0] = cudaAddressModeClamp;
-    texi.addressMode[1] = cudaAddressModeClamp;
-    texi.filterMode = cudaFilterModePoint;
-    texi.normalized = false;
-    CUDA_SAFE_CALL(cudaBindTextureToArray(&texi,_array,&_channel));
+  inline void bind() {
+    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
+    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
+    (*_tex_ptr).filterMode = cudaFilterModePoint;
+    (*_tex_ptr).normalized = false;
+    CUDA_SAFE_CALL(cudaBindTextureToArray(_tex_ptr,_array,&_channel));
   }
   
-  /// Free any memory associated with device
-  inline void clear() 
-    { if (_rows>0) { _rows=0; CUDA_SAFE_CALL(cudaFreeArray(_array)); } }
+  /// Unbind texture
+  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
+  
+  /// Free any memory associated with device and unbind
+  inline void clear() {
+    if (_rows>0) { 
+      _rows=0; 
+      CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); 
+      CUDA_SAFE_CALL(cudaFreeArray(_array)); 
+    } 
+  }
 
   inline size_t numel() const { return _cols*_rows; }
   inline size_t rows() const { return _rows; }
   inline size_t cols() const { return _cols; }
   inline size_t row_size() const { return _cols; }
   inline size_t row_bytes() const { return _cols*sizeof(numtyp); }
 
   /// Copy from Host
   inline void copy_from_host(const numtyp *host_p) {
     CUDA_SAFE_CALL(cudaMemcpyToArray(_array, 0, 0, host_p,
                                      numel()*sizeof(numtyp),
                                      cudaMemcpyHostToDevice));
   }
 
   /// Copy from Host
   /** \note Used when the number of columns/rows allocated on the host is
     *       smaller than on the device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols) {
     CUDA_SAFE_CALL(cudaMemcpy2DToArray(_array, 0, 0, host_p, 
                          cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
                          cudaMemcpyHostToDevice));
   }                               
 
   /// Asynchronous Copy from Host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
     CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p, 
                                           numel()*sizeof(numtyp),
                                           cudaMemcpyHostToDevice,stream));
   }
       
   /// Asynchronous Copy from Host
   /** \note Used when the number of columns/rows allocated on the host is
     *       smaller than on the device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
     CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p, 
                          cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
                          cudaMemcpyHostToDevice,stream));
   }                               
 
   /// Cast buffer to numtyp in host_write and copy to array
   template <class numtyp2>
   inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write) {
     int n=numel();                        
     for (int i=0; i<n; i++) {
       host_write[i]=static_cast<numtyp>(*buffer); buffer++;
     }
     copy_from_host(host_write.begin());
   }
 
   /// Cast buffer to numtyp in host_write and copy to array
   /** \note Used when the number of columns/rows allocated on the host is
     *       smaller than on the device **/
   template <class numtyp2>
   inline void cast_copy2D(const numtyp2 *buffer, NVC_HostT &host_write,
                           const size_t rows, const size_t cols) {
     int n=rows*cols;                        
     for (int i=0; i<n; i++) {
       host_write[i]=static_cast<numtyp>(*buffer); buffer++;
     }
     copy_2Dfrom_host(host_write.begin(),rows,cols);
   }
 
   /// Cast buffer to numtyp in host_write and copy to array asynchronously
   template <class numtyp2>
   inline void cast_copy(const numtyp2 *buffer, NVC_HostT &host_write,
                         cudaStream_t &stream) {
     int n=numel();                        
     for (int i=0; i<n; i++) {
       host_write[i]=static_cast<numtyp>(*buffer); buffer++;
     }
     copy_from_host(host_write.begin(),stream);
   }
 
  private:
   size_t _rows, _cols;
   cudaArray *_array;
   cudaChannelFormatDesc _channel;
+  textureReference *_tex_ptr;
 };
 
 #endif
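
The new assign_texture()/bind()/unbind() members wrap the legacy CUDA texture-reference API: build a channel descriptor, configure clamped addressing and point filtering on the textureReference, then bind it to the device allocation. A standalone sketch of that same sequence outside the library, with illustrative names (my_tex, kernel_fetch) and no error checking beyond the bare calls:

#include <cstdio>
#include <cuda_runtime.h>

texture<float, 1, cudaReadModeElementType> my_tex;   // file-scope texture reference

__global__ void kernel_fetch(float *out, const int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = tex1Dfetch(my_tex, i);                  // cached read through the texture
}

int main() {
  const int n = 64;
  float host[n], *dev_in, *dev_out;
  for (int i = 0; i < n; i++) host[i] = static_cast<float>(i);
  cudaMalloc((void **)&dev_in, n*sizeof(float));
  cudaMalloc((void **)&dev_out, n*sizeof(float));
  cudaMemcpy(dev_in, host, n*sizeof(float), cudaMemcpyHostToDevice);

  // Same steps as NVC_Vec::bind(): channel descriptor, clamped addressing,
  // point filtering, then bind the linear allocation to the texture reference.
  cudaChannelFormatDesc channel = cudaCreateChannelDesc<float>();
  textureReference *tref = &my_tex;        // texture<> derives from textureReference
  tref->addressMode[0] = cudaAddressModeClamp;
  tref->filterMode = cudaFilterModePoint;
  tref->normalized = false;
  cudaBindTexture(NULL, tref, dev_in, &channel);

  kernel_fetch<<<1,n>>>(dev_out, n);
  cudaMemcpy(host, dev_out, n*sizeof(float), cudaMemcpyDeviceToHost);
  printf("out[5] = %g\n", host[5]);        // expect 5

  cudaUnbindTexture(tref);                 // mirrors NVC_Vec::unbind()
  cudaFree(dev_in);
  cudaFree(dev_out);
  return 0;
}
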
diff --git a/lib/gpu/nvc_traits.h b/lib/gpu/nvc_traits.h
new file mode 100644
index 000000000..eee92b702
--- /dev/null
+++ b/lib/gpu/nvc_traits.h
@@ -0,0 +1,31 @@
+/***************************************************************************
+                                nvc_traits.h
+                             -------------------
+                               W. Michael Brown
+
+  Tricks for templating textures
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS GPU Library
+ __________________________________________________________________________
+
+    begin                : Tue Jun 23 2009
+    copyright            : (C) 2009 by W. Michael Brown
+    email                : wmbrown@sandia.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   ----------------------------------------------------------------------- */
+
+#ifndef NVC_TEXTURE_TRAITS_H
+#define NVC_TEXTURE_TRAITS_H
+
+template <class numtyp> class nvc_vec_traits;
+template <> class nvc_vec_traits<float> { public: typedef float2 vec2; };
+template <> class nvc_vec_traits<double> { public: typedef double2 vec2; };
+
+#endif
diff --git a/lib/gpu/pair_gpu_atom.cu b/lib/gpu/pair_gpu_atom.cu
index 94617a5c1..88b24e741 100644
--- a/lib/gpu/pair_gpu_atom.cu
+++ b/lib/gpu/pair_gpu_atom.cu
@@ -1,173 +1,174 @@
 /***************************************************************************
                                pair_gpu_atom.cu
                              -------------------
                                W. Michael Brown
 
   Memory routines for moving atom and force data between host and gpu
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Aug 4 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
+#include "pair_gpu_texture.h"
 #include "pair_gpu_atom.h"
+
 #define PairGPUAtomT PairGPUAtom<numtyp,acctyp>
 
 template <class numtyp, class acctyp>
 int PairGPUAtomT::bytes_per_atom() const { 
   return atom_fields()*sizeof(numtyp)+ans_fields()*sizeof(acctyp); 
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::init(const int max_atoms) {
   if (allocated)
     clear();
     
   _max_atoms=max_atoms;
 
   // Initialize timers for the selected GPU
   time_atom.init();
   time_answer.init();
 
   // Device matrices for atom and force data
-  dev_x.safe_alloc(atom_fields(),max_atoms);
-  x_bind_texture<numtyp>(dev_x);
+  dev_x.safe_alloc(atom_fields(),max_atoms,x_get_texture<numtyp>());
   ans.safe_alloc(ans_fields(),max_atoms);
 
   // Get a host write only buffer
   host_write.safe_alloc_w(max_atoms*4);
   // Get a host read/write buffer
   host_read.safe_alloc_rw(ans.row_size()*ans_fields());
     
   allocated=true;
 }
   
 template <class numtyp, class acctyp>
 void PairGPUAtomT::clear() {
   if (!allocated)
       return;
   allocated=false;
       
-  x_unbind_texture<numtyp>();
+  dev_x.unbind();
   ans.clear();                               
   host_write.clear();
   host_read.clear();
   dev_x.clear();
 }  
  
 template <class numtyp, class acctyp>
 double PairGPUAtomT::host_memory_usage(const int max_atoms) const {
   return max_atoms*atom_fields()*sizeof(numtyp)+
          ans_fields()*(max_atoms)*sizeof(acctyp)+
          sizeof(PairGPUAtom<numtyp,acctyp>);
 }
   
 template <class numtyp, class acctyp>
 void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, 
                                 cudaStream_t &s) {
   _eflag=eflag;
   _vflag=vflag;
     
   int csize=ans_fields();    
   if (!eflag)
     csize--;
   if (!vflag)
     csize-=6;
       
   host_read.copy_from_device(ans.begin(),ans.row_size()*csize,s);
 }
   
 template <class numtyp, class acctyp>
 double PairGPUAtomT::energy_virial(const int *ilist, const bool eflag_atom,
                                    const bool vflag_atom, double *eatom, 
                                    double **vatom, double *virial) {
   double evdwl=0.0;
   int gap=ans.row_size()-_inum;
 
   acctyp *ap=host_read.begin();
   if (_eflag) {
     if (eflag_atom) {
       for (int i=0; i<_inum; i++) {
         evdwl+=*ap;
         eatom[ilist[i]]+=*ap*0.5;
         ap++;
       }
     } else
       for (int i=0; i<_inum; i++) {
         evdwl+=*ap;
         ap++;
       }
     ap+=gap;
     evdwl*=0.5;
   }
   _read_loc=ap;
   gap=ans.row_size();
   if (_vflag) {
     if (vflag_atom) {
       for (int ii=0; ii<_inum; ii++) {
         int i=ilist[ii];
         ap=_read_loc+ii;
         for (int j=0; j<6; j++) {
           vatom[i][j]+=*ap*0.5;
           virial[j]+=*ap;
           ap+=gap;
         }
       }
     } else {
       for (int ii=0; ii<_inum; ii++) {
         ap=_read_loc+ii;
         for (int j=0; j<6; j++) {
           virial[j]+=*ap;
           ap+=gap;
         }
       }
     }
     for (int j=0; j<6; j++)
       virial[j]*=0.5;
     _read_loc+=gap*6;
   }
   
   return evdwl;
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::add_forces(const int *ilist, double **f) {
   int gap=ans.row_size();
   for (int ii=0; ii<_inum; ii++) {
     acctyp *ap=_read_loc+ii;
     int i=ilist[ii];
     f[i][0]+=*ap;
     ap+=gap;
     f[i][1]+=*ap;
     ap+=gap;
     f[i][2]+=*ap;
   }
 }
 
 template <class numtyp, class acctyp>
 void PairGPUAtomT::add_torques(const int *ilist, double **tor, const int n) {
   int gap=ans.row_size();
   _read_loc+=gap*3;
   for (int ii=0; ii<n; ii++) {
     acctyp *ap=_read_loc+ii;
     int i=ilist[ii];
     tor[i][0]+=*ap;
     ap+=gap;
     tor[i][1]+=*ap;
     ap+=gap;
     tor[i][2]+=*ap;
   }
 }
 
 template class PairGPUAtom<PRECISION,ACC_PRECISION>;
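
copy_answers(), energy_virial(), and add_forces() above all walk the same device answer buffer: one column per local atom, with consecutive answer fields ans.row_size() entries apart, which is exactly the stride the kernels use when they step ap1 by ans_pitch. A short comment sketch of that layout for the LJ default of ans_fields()=10:

// Device "ans" matrix (one column per local atom i, row stride = row_size()):
//   energy_i                      -- written only when eflag is set
//   virial_i[0] .. virial_i[5]    -- written only when vflag is set
//   fx_i, fy_i, fz_i
// copy_answers() pulls the first csize rows into host_read; energy_virial()
// consumes the energy and virial rows and leaves _read_loc at the force rows,
// which add_forces() (and add_torques() for Gay-Berne answers) then walk.
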
diff --git a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h
index 18acafea3..01f450ffc 100644
--- a/lib/gpu/pair_gpu_atom.h
+++ b/lib/gpu/pair_gpu_atom.h
@@ -1,153 +1,153 @@
 /***************************************************************************
                                pair_gpu_atom.h
                              -------------------
                                W. Michael Brown
 
   Memory routines for moving atom and force data between host and GPU
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Aug 4 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
 #ifndef PAIR_GPU_ATOM_H
 #define PAIR_GPU_ATOM_H
 
 // PRECISION - Precision for rsq, energy, force, and torque calculation
 // ACC_PRECISION - Precision for accumulation of energies, forces, and torques
 #ifdef _SINGLE_DOUBLE
 #define PRECISION float
 #define ACC_PRECISION double
 #define MAX_ATOMS 65536
 #endif
 
 #ifdef _DOUBLE_DOUBLE
 #define PRECISION double
 #define ACC_PRECISION double
 #define MAX_ATOMS 32768
 #endif
 
 #ifndef PRECISION
 #define PRECISION float
-#define ACC_PRECISION double
+#define ACC_PRECISION float
 #define MAX_ATOMS 65536
 #endif
 
 #include "nvc_timer.h"
-#include "pair_gpu_texture.h"
+#include "nvc_memory.h"
 
 template <class numtyp, class acctyp>
 class PairGPUAtom {
  public:
   PairGPUAtom() : _atom_fields(4), _ans_fields(10), allocated(false) {}
   ~PairGPUAtom() { clear(); }
 
   // Accessors
   inline int atom_fields() const { return _atom_fields; }
   inline int ans_fields() const { return _ans_fields; }
   inline int max_atoms() const { return _max_atoms; }
   inline int nall() const { return _nall; }
   inline int inum() const { return _inum; }
 
   /// Set number of atoms for future copy operations
   inline void nall(const int n) { _nall=n; }
   /// Set inum (the number of atoms answers are computed for) for future copy operations
   inline void inum(const int n) { _inum=n; }
   /// Set the number of atom fields (x, y, z, type, etc)
   inline void atom_fields(const int n) { _atom_fields=n; }
   /// Set the number of answer fields (energy, virial, force, etc.)
   inline void ans_fields(const int n) { _ans_fields=n; }
   
   /// Memory usage per atom in this class
   /** \note atom_fields and ans_fields should be set first for a correct answer **/
   int bytes_per_atom() const; 
 
   /// Must be called once to allocate host and device memory
   /** \note atom_fields and ans_fields should be set first if not default **/
   void init(const int max_atoms);
   
   /// Free all memory on host and device
   void clear();
  
   /// Return the total amount of host memory used by class
   double host_memory_usage(const int max_atoms) const;
 
   
   // -------------------------COPY TO GPU ----------------------------------
 
   /// Reset the write buffer pointer (Start copying new atom data)
   inline void reset_write_buffer() { _write_loc=host_write.begin(); }
   
   /// Add a row to write buffer with unit stride
   /** Copies nall() elements **/
   template<class cpytyp>
   inline void add_atom_data(const cpytyp *host_ptr)
     { for (int i=0; i<_nall; i++) { *_write_loc=host_ptr[i]; _write_loc++; } }
   
   /// Add a row to write buffer with non-unit stride
   /** Copies nall() elements **/
   template<class cpytyp>
   inline void add_atom_data(const cpytyp *hostptr, const int stride) {
     int t=_nall*stride; 
     for (int i=0; i<t; i+=stride) { *_write_loc=hostptr[i]; _write_loc++; }
   }
   
   /// Copy num_rows x nall() write buffer to x in GPU
   /** num_rows<=atom_fields() **/
   inline void copy_atom_data(const int num_rows, cudaStream_t &stream) 
     { dev_x.copy_2Dfrom_host(host_write.begin(),num_rows,nall(),stream); }
 
     
   // -------------------------COPY FROM GPU -------------------------------
 
   /// Copy answers from GPU into read buffer
   void copy_answers(const bool eflag, const bool vflag, cudaStream_t &s);
   
   /// Copy energy and virial data into LAMMPS memory
   double energy_virial(const int *ilist, const bool eflag_atom,
                        const bool vflag_atom, double *eatom, double **vatom,
                        double *virial);
 
   /// Add forces from the GPU into a LAMMPS pointer
   void add_forces(const int *ilist, double **f);
 
   /// Add torques from the GPU into a LAMMPS pointer
   void add_torques(const int *ilist, double **tor, const int n);
     
   // ------------------------------ DATA ----------------------------------
 
   // atom_fields() x n (example: rows 1-3 position, 4 is type)
   NVC_ConstMatT dev_x;
   // ans_fields() x n storage for forces, energies, virial, etc.
   // example: if (eflag and vflag) 1 is energy, 2-7 are the virial, 8-10 are force
   // example: if (!eflag) 1-3 are force
   NVC_Mat<acctyp> ans;                               
 
   // Buffer for moving floating point data to GPU
   NVC_HostT host_write;
   // Buffer for moving floating point data to CPU
   NVC_Host<acctyp> host_read;
   
   // Timing Stuff
   NVCTimer time_atom, time_answer;
   
  private:
   bool allocated, _eflag, _vflag;
   int _atom_fields, _ans_fields;
   int _max_atoms, _nall, _inum;
   numtyp * _write_loc;
   acctyp * _read_loc;
 };
 
 #endif
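
Taken together, the accessors and copy routines declared above define a simple per-timestep cycle: set the counts, pack one row per atom field into the write buffer, push the buffer to the GPU, and after the kernel pull energies, virial, and forces back into LAMMPS arrays. The sketch below only illustrates that cycle; the instance, stream, and array names are placeholders, and the kernel launch in the middle stands in for whatever pair style drives the class.

// Hypothetical driver for one timestep; 'atom', 'stream', 'x', 'type', etc.
// are placeholders, not identifiers from this library.
template <class numtyp, class acctyp>
double sketch_timestep(PairGPUAtom<numtyp,acctyp> &atom, cudaStream_t &stream,
                       double **x, int *type, int nall, int inum,
                       const int *ilist, double *eatom, double **vatom,
                       double *virial, double **f) {
  // ---- copy to GPU: pack one row per atom field into the write buffer
  atom.nall(nall);
  atom.inum(inum);
  atom.reset_write_buffer();
  atom.add_atom_data(x[0],3);          // x coordinates (LAMMPS x[][3] is contiguous)
  atom.add_atom_data(x[0]+1,3);        // y coordinates
  atom.add_atom_data(x[0]+2,3);        // z coordinates
  atom.add_atom_data(type);            // per-atom type
  atom.copy_atom_data(atom.atom_fields(),stream);

  // ... pair kernel launches here, writing into atom.ans ...

  // ---- copy from GPU and accumulate into LAMMPS arrays
  atom.copy_answers(true,true,stream);
  cudaStreamSynchronize(stream);
  double evdwl=atom.energy_virial(ilist,false,false,eatom,vatom,virial);
  atom.add_forces(ilist,f);
  return evdwl;
}
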
diff --git a/lib/gpu/pair_gpu_texture.h b/lib/gpu/pair_gpu_texture.h
index 76bac7136..e647dda8b 100644
--- a/lib/gpu/pair_gpu_texture.h
+++ b/lib/gpu/pair_gpu_texture.h
@@ -1,300 +1,316 @@
 /***************************************************************************
                               pair_gpu_texture.h
                              -------------------
                                W. Michael Brown
 
   Tricks for templating textures
 
  __________________________________________________________________________
     This file is part of the LAMMPS GPU Library
  __________________________________________________________________________
 
     begin                : Tue Jun 23 2009
     copyright            : (C) 2009 by W. Michael Brown
     email                : wmbrown@sandia.gov
  ***************************************************************************/
 
 /* -----------------------------------------------------------------------
    Copyright (2009) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under 
    the GNU General Public License.
    ----------------------------------------------------------------------- */
 
+#include "nvc_traits.h"
 #include "nvc_memory.h"
 
 #ifndef PAIR_GPU_TEXTURE_H
 #define PAIR_GPU_TEXTURE_H
 
 #ifdef _SINGLE_DOUBLE
 #define GB_GPU_DOUBLE
 #endif
 
 #ifdef _DOUBLE_DOUBLE
 #define GB_GPU_DOUBLE
 #endif
 
-template <class numtyp> class cu_vec_traits;
-template <> class cu_vec_traits<float> { public: typedef float2 vec2; };
-template <> class cu_vec_traits<double> { public: typedef double2 vec2; };
-
 // ------------------------------- x ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> x_float_tex;
 static texture<int2, 2, cudaReadModeElementType> x_double_tex;
-template <class numtyp> inline void x_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(x_float_tex); }
-
-template <> inline void x_bind_texture<double>(NVC_ConstMatD &mat)
-  { mat.bind_texture(x_double_tex); } 
-template <class numtyp> inline void x_unbind_texture()
-  { cudaUnbindTexture(x_float_tex); }
-template <> inline void x_unbind_texture<double>() 
-  { cudaUnbindTexture(x_double_tex); }
+template <class numtyp> inline textureReference * x_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"x_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * x_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"x_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ numtyp _x_(const int i, const int j) {
   return tex2D(x_float_tex,i,j);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _x_<double>(const int i,const int j) {
   int2 t=tex2D(x_double_tex,i,j);
   return __hiloint2double(t.y, t.x);
 }
 #endif
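// (Note on the double-precision fetches above and throughout this header:
//  texture fetches cannot return 64-bit floating-point channels, so the double
//  path stores each value as an int2 and reassembles it on the device with
//  __hiloint2double(hi, lo); every *_double_tex below uses the same trick.)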
 
 // ------------------------------- form ------------------------------------
 
 static texture<int, 2, cudaReadModeElementType> form_tex;
-inline void form_bind_texture(NVC_ConstMatI &mat)
-  { mat.bind_texture(form_tex); }
-inline void form_unbind_texture()
-  { cudaUnbindTexture(form_tex); }
+inline textureReference * form_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"form_tex");
+  return const_cast<textureReference *>(ptr);
+}
 static __inline__ __device__ int _form_(const int i, const int j) {
   return tex2D(form_tex,i,j);
 }
 
 // ------------------------------- lshape ------------------------------------
 
 static texture<float, 1, cudaReadModeElementType> lshape_float_tex;
 static texture<int2, 1, cudaReadModeElementType> lshape_double_tex;
-static cudaChannelFormatDesc channel_lshape;
-template <class numtyp> inline void lshape_bind_texture(NVC_VecT &vec) 
- { vec.bind_texture(lshape_float_tex,channel_lshape); }
-template <> inline void lshape_bind_texture<double>(NVC_VecD &vec)
- { vec.bind_texture(lshape_double_tex,channel_lshape); }
-template <class numtyp> inline void lshape_unbind_texture() 
-  { cudaUnbindTexture(lshape_float_tex); }
-template <> inline void lshape_unbind_texture<double>() 
-  { cudaUnbindTexture(lshape_double_tex); }
+template <class numtyp> inline textureReference * lshape_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"lshape_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * lshape_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"lshape_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ numtyp _lshape_(const int i) 
   { return tex1Dfetch(lshape_float_tex,i); }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _lshape_<double>(const int i) {
   int2 t=tex1Dfetch(lshape_double_tex,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 // ------------------------------- shape ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> shape_float_tex;
 static texture<int2, 2, cudaReadModeElementType> shape_double_tex;
-template <class numtyp> inline void shape_bind_texture(NVC_ConstMatT &mat) 
- { mat.bind_texture(shape_float_tex); }
-template <> inline void shape_bind_texture<double>(NVC_ConstMatD &mat) 
- { mat.bind_texture(shape_double_tex); } 
-template <class numtyp> inline void shape_unbind_texture() 
-  { cudaUnbindTexture(shape_float_tex); }
-template <> inline void shape_unbind_texture<double>() 
-  { cudaUnbindTexture(shape_double_tex); }
+template <class numtyp> inline textureReference * shape_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"shape_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * shape_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"shape_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp> 
 static __inline__ __device__ numtyp _shape_(const int i, const int j) {
   return tex2D(shape_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _shape_<double>(const int i, const int j) {
   int2 t=tex2D(shape_double_tex,j,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 // ------------------------------- well ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> well_float_tex;
 static texture<int2, 2, cudaReadModeElementType> well_double_tex;
-template <class numtyp> inline void well_bind_texture(NVC_ConstMatT &mat) 
-  { mat.bind_texture(well_float_tex); }
-template <> inline void well_bind_texture<double>(NVC_ConstMatD &mat)
-  { mat.bind_texture(well_double_tex); }
-template <class numtyp> inline void well_unbind_texture() 
-  { cudaUnbindTexture(well_float_tex); }
-template <> inline void well_unbind_texture<double>() 
-  { cudaUnbindTexture(well_double_tex); }
+template <class numtyp> inline textureReference * well_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"well_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * well_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"well_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp> 
 static __inline__ __device__ numtyp _well_(const int i, const int j) 
   { return tex2D(well_float_tex,j,i); }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _well_<double>(const int i,const int j) {
   int2 t=tex2D(well_double_tex,j,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 // ------------------------------- sigma ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> sigma_float_tex;
 static texture<int2, 2, cudaReadModeElementType> sigma_double_tex;
-template <class numtyp> inline void sigma_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(sigma_float_tex); }
-template <> inline void sigma_bind_texture<double>(NVC_ConstMatD &mat)
-  { mat.bind_texture(sigma_double_tex); }
-template <class numtyp> inline void sigma_unbind_texture() 
-  { cudaUnbindTexture(sigma_float_tex); }
-template <> inline void sigma_unbind_texture<double>() 
-  { cudaUnbindTexture(sigma_double_tex); }
+template <class numtyp> inline textureReference * sigma_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"sigma_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * sigma_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"sigma_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ numtyp _sigma_(const int i, const int j) {
   return tex2D(sigma_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _sigma_<double>(const int i,const int j) {
   int2 t=tex2D(sigma_double_tex,j,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 // ------------------------------- epsilon ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> epsilon_float_tex;
 static texture<int2, 2, cudaReadModeElementType> epsilon_double_tex;
-template <class numtyp> inline void epsilon_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(epsilon_float_tex); }
-template <> inline void epsilon_bind_texture<double>(NVC_ConstMatD &mat)
-  { mat.bind_texture(epsilon_double_tex); }
-template <class numtyp> inline void epsilon_unbind_texture() 
-  { cudaUnbindTexture(epsilon_float_tex); }
-template <> inline void epsilon_unbind_texture<double>() 
-  { cudaUnbindTexture(epsilon_double_tex); }
+template <class numtyp> inline textureReference * epsilon_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"epsilon_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * epsilon_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"epsilon_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ numtyp _epsilon_(const int i, const int j) {
   return tex2D(epsilon_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _epsilon_<double>(const int i,const int j) {
   int2 t=tex2D(epsilon_double_tex,j,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 // ------------------------------- cutsq ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> cutsq_float_tex;
 static texture<int2, 2, cudaReadModeElementType> cutsq_double_tex;
-template <class numtyp> inline void cutsq_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(cutsq_float_tex); }
-template <> inline void cutsq_bind_texture<double>(NVC_ConstMatD &mat)
-  { mat.bind_texture(cutsq_double_tex); }
-template <class numtyp> inline void cutsq_unbind_texture() 
- { cudaUnbindTexture(cutsq_float_tex); }
-template <> inline void cutsq_unbind_texture<double>() 
- { cudaUnbindTexture(cutsq_double_tex); }
+template <class numtyp> inline textureReference * cutsq_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"cutsq_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * cutsq_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"cutsq_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ numtyp _cutsq_(const int i, const int j) {
   return tex2D(cutsq_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _cutsq_<double>(const int i,const int j) {
   int2 t=tex2D(cutsq_double_tex,j,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 // ------------------------------- lj1 ------------------------------------
 
 static texture<float2, 2, cudaReadModeElementType> lj1_float_tex;
 static texture<int4, 2, cudaReadModeElementType> lj1_double_tex;
-template <class numtyp> inline void lj1_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(lj1_float_tex); }
-template <> inline void lj1_bind_texture<double2>(NVC_ConstMatD2 &mat)
-  { mat.bind_texture(lj1_double_tex); }
-template <class numtyp> inline void lj1_unbind_texture() 
- { cudaUnbindTexture(lj1_float_tex); }
-template <> inline void lj1_unbind_texture<double2>() 
- { cudaUnbindTexture(lj1_double_tex); }
+template <class numtyp> inline textureReference * lj1_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"lj1_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * lj1_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"lj1_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ 
-typename cu_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) {
+typename nvc_vec_traits<numtyp>::vec2 _lj1_(const int i, const int j) {
   return tex2D(lj1_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double2 _lj1_<double>(const int i,const int j) {
   int4 t=tex2D(lj1_double_tex,j,i);
   double2 ans;
   ans.x=__hiloint2double(t.y, t.x);
   ans.y=__hiloint2double(t.w, t.z);
   return ans;
 }
 #endif
 
 // ------------------------------- lj3 ------------------------------------
 
 static texture<float2, 2, cudaReadModeElementType> lj3_float_tex;
 static texture<int4, 2, cudaReadModeElementType> lj3_double_tex;
-template <class numtyp> inline void lj3_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(lj3_float_tex); }
-template <> inline void lj3_bind_texture<double2>(NVC_ConstMatD2 &mat)
-  { mat.bind_texture(lj3_double_tex); }
-template <class numtyp> inline void lj3_unbind_texture() 
- { cudaUnbindTexture(lj3_float_tex); }
-template <> inline void lj3_unbind_texture<double2>() 
- { cudaUnbindTexture(lj3_double_tex); }
+template <class numtyp> inline textureReference * lj3_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"lj3_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * lj3_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"lj3_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ 
-typename cu_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) {
+typename nvc_vec_traits<numtyp>::vec2 _lj3_(const int i, const int j) {
   return tex2D(lj3_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double2 _lj3_<double>(const int i,const int j) {
   int4 t=tex2D(lj3_double_tex,j,i);
   double2 ans;
   ans.x=__hiloint2double(t.y, t.x);
   ans.y=__hiloint2double(t.w, t.z);
   return ans;
 }
 #endif
 
 // ------------------------------- offset ------------------------------------
 
 static texture<float, 2, cudaReadModeElementType> offset_float_tex;
 static texture<int2, 2, cudaReadModeElementType> offset_double_tex;
-template <class numtyp> inline void offset_bind_texture(NVC_ConstMatT &mat)
-  { mat.bind_texture(offset_float_tex); }
-template <> inline void offset_bind_texture<double>(NVC_ConstMatD &mat)
-  { mat.bind_texture(offset_double_tex); }
-template <class numtyp> inline void offset_unbind_texture() 
- { cudaUnbindTexture(offset_float_tex); }
-template <> inline void offset_unbind_texture<double>() 
- { cudaUnbindTexture(offset_double_tex); }
+template <class numtyp> inline textureReference * offset_get_texture() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"offset_float_tex");
+  return const_cast<textureReference *>(ptr);
+}
+template <> inline textureReference * offset_get_texture<double>() {
+  const textureReference *ptr;
+  cudaGetTextureReference(&ptr,"offset_double_tex");
+  return const_cast<textureReference *>(ptr);
+}
 template <class numtyp>
 static __inline__ __device__ numtyp _offset_(const int i, const int j) {
   return tex2D(offset_float_tex,j,i);
 }
 #ifdef GB_GPU_DOUBLE
 template <>
 static __inline__ __device__ double _offset_<double>(const int i,const int j) {
   int2 t=tex2D(offset_double_tex,j,i);
   return __hiloint2double(t.y, t.x);
 }
 #endif
 
 #endif
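
The other substantive change in this header is that the local cu_vec_traits helper (removed above) is now taken from nvc_traits.h. Its presumed content, inferred from the removed lines and from the nvc_vec_traits<numtyp>::vec2 uses in the lj1/lj3 fetches, is just the scalar-to-vector mapping below; treat this as an assumption, since nvc_traits.h itself is not shown in this patch.

// Presumed shape of the trait moved into nvc_traits.h: map a scalar type to
// the 2-component CUDA vector type used for the packed lj1/lj3 coefficients.
template <class numtyp> class nvc_vec_traits;
template <> class nvc_vec_traits<float>  { public: typedef float2  vec2; };
template <> class nvc_vec_traits<double> { public: typedef double2 vec2; };
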
diff --git a/lib/gpu/pair_tex_tar.cu b/lib/gpu/pair_tex_tar.cu
new file mode 100644
index 000000000..d0b07177e
--- /dev/null
+++ b/lib/gpu/pair_tex_tar.cu
@@ -0,0 +1,28 @@
+/***************************************************************************
+                               pair_tex_tar.cu
+                             -------------------
+                               W. Michael Brown
+
+  "Tar" of header and source files that need texture reference definitions
+  within file scope.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS GPU Library
+ __________________________________________________________________________
+
+    begin                : Tue Jun 23 2009
+    copyright            : (C) 2009 by W. Michael Brown
+    email                : wmbrown@sandia.gov
+ ***************************************************************************/
+
+/* -----------------------------------------------------------------------
+   Copyright (2009) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+   ----------------------------------------------------------------------- */
+
+#include "pair_gpu_atom.cu"
+#include "lj_gpu.cu"
+#include "gb_gpu.cu"
+
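
This "tar" file exists (and the texture accessors above look their symbols up by name rather than binding them directly) because CUDA texture references have translation-unit scope: a static texture declared in a header becomes a separate object in every .cu file that includes it, so a bind performed while compiling one source file is invisible to kernels compiled in another. Including pair_gpu_atom.cu, lj_gpu.cu, and gb_gpu.cu into this single file keeps every bind and every tex2D fetch in the same translation unit. The fragment below only illustrates the pitfall with hypothetical file names; it is not code from the library.

// Hypothetical illustration of why textures force a single translation unit:
//
//   // textures_demo.h
//   static texture<float,2,cudaReadModeElementType> pos_tex;
//
//   // bind_demo.cu   (includes textures_demo.h): binds its own copy of pos_tex
//   // kernel_demo.cu (includes textures_demo.h): tex2D(pos_tex,...) reads a
//   //                copy that was never bound
//
// Compiling both through one .cu file, as pair_tex_tar.cu does for the GPU
// pair styles, makes the bind and the fetch refer to the same texture.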