lal_preprocessor.h
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Jul 31, 00:44

lal_preprocessor.h
View Options

	// **************************************************************************
	// lal_preprocessor.h
	// -------------------
	// W. Michael Brown (ORNL)
	//
	// Device code for CUDA-specific preprocessor definitions
	//
	// __________________________________________________________________________
	// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
	// __________________________________________________________________________
	//
	// begin :
	// email : brownw@ornl.gov
	// ***************************************************************************/

	//*************************************************************************
	// Preprocessor Definitions
	//
	// Note: It is assumed that constants with the same names are defined with
	// the same values in all files.
	//
	// ARCH
	// Definition: Architecture number for accelerator
	// MEM_THREADS
	// Definition: Number of threads with sequential ids accessing memory
	// simultaneously on multiprocessor
	// WARP_SIZE:
	// Definition: Number of threads guaranteed to be on the same instruction
	// THREADS_PER_ATOM
	// Definition: Default number of threads assigned per atom for pair styles
	// Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
	// THREADS_PER_CHARGE
	// Definition: Default number of threads assigned per atom for pair styles
	// with charge
	// Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE
	// PPPM_MAX_SPLINE
	// Definition: Maximum order for splines in PPPM
	// PPPM_BLOCK_1D
	// Definition: Thread block size for PPPM kernels
	// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
	// PPPM_BLOCK_1D%32==0
	// BLOCK_PAIR
	// Definition: Default thread block size for pair styles
	// Restrictions:
	// MAX_SHARED_TYPES
	// Definition: Max # of atom type params can be stored in shared memory
	// Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
	// BLOCK_CELL_2D
	// Definition: Default block size in each dimension for cell list builds
	// and matrix transpose
	// BLOCK_CELL_ID
	// Definition: Default block size for binning atoms in cell list builds
	// BLOCK_NBOR_BUILD
	// Definition: Default block size for neighbor list builds
	// BLOCK_BIO_PAIR
	// Definition: Default thread block size for "bio" pair styles
	// MAX_BIO_SHARED_TYPES
	// Definition: Max # of atom type params can be stored in shared memory
	// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2
	//
	//*************************************************************************/

	// -------------------------------------------------------------------------
	// CUDA DEFINITIONS
	// -------------------------------------------------------------------------

	#ifdef NV_KERNEL

	// Thread/block index helpers built on the CUDA built-in index variables.
	// Every compound expansion is wrapped in parentheses so it can be embedded
	// in a larger expression (e.g. GLOBAL_ID_X*2) without changing its meaning,
	// and none of them carries a trailing semicolon, matching the OpenCL
	// definitions of the same names later in this file.
	#define GLOBAL_ID_X (threadIdx.x+mul24(blockIdx.x,blockDim.x))
	#define GLOBAL_ID_Y (threadIdx.y+mul24(blockIdx.y,blockDim.y))
	#define GLOBAL_SIZE_X (mul24(gridDim.x,blockDim.x))
	#define GLOBAL_SIZE_Y (mul24(gridDim.y,blockDim.y))
	#define THREAD_ID_X threadIdx.x
	#define THREAD_ID_Y threadIdx.y
	#define BLOCK_ID_X blockIdx.x
	#define BLOCK_ID_Y blockIdx.y
	#define BLOCK_SIZE_X blockDim.x
	#define BLOCK_SIZE_Y blockDim.y

	// OpenCL-style qualifiers mapped onto CUDA spellings (__global expands to
	// nothing); presumably this lets kernel source written with the OpenCL
	// keywords compile unchanged as CUDA.
	#define __kernel extern "C" __global__
	#define __local __shared__
	#define __global
	#define restrict __restrict__
	#define atom_add atomicAdd
	#define ucl_inline static __inline__ __device__

	// ARCH: architecture number for the accelerator. Mirrors __CUDA_ARCH__
	// when the compiler defines it and falls back to 100 otherwise.
	#ifndef __CUDA_ARCH__
	#define ARCH 100
	#else
	#define ARCH __CUDA_ARCH__
	#endif

	// Per-architecture launch defaults, selected on ARCH:
	//   ARCH < 200        : first group
	//   200 <= ARCH < 300 : second group
	//   ARCH >= 300       : third group, which additionally defines
	//                       BLOCK_ELLIPSE and the shfl_xor helper
	#if (ARCH < 200)

	#define THREADS_PER_ATOM 1
	#define THREADS_PER_CHARGE 16
	#define BLOCK_NBOR_BUILD 64
	#define BLOCK_PAIR 64
	#define BLOCK_BIO_PAIR 64
	#define MAX_SHARED_TYPES 8

	#elif (ARCH < 300)

	#define THREADS_PER_ATOM 4
	#define THREADS_PER_CHARGE 8
	#define BLOCK_NBOR_BUILD 128
	#define BLOCK_PAIR 128
	#define BLOCK_BIO_PAIR 128
	#define MAX_SHARED_TYPES 8

	#else

	#define THREADS_PER_ATOM 4
	#define THREADS_PER_CHARGE 8
	#define BLOCK_NBOR_BUILD 128
	#define BLOCK_PAIR 256
	#define BLOCK_BIO_PAIR 256
	#define BLOCK_ELLIPSE 128
	#define MAX_SHARED_TYPES 11

	// shfl_xor(var, laneMask, width): warp shuffle that returns var from the
	// lane selected by XOR-ing the caller's lane id with laneMask, within
	// groups of width lanes.
	//
	// The mask-less __shfl_xor intrinsic is not available when targeting newer
	// architectures, so the *_sync form is used whenever the toolkit provides
	// it (CUDA 9 and later). The full-warp mask means every lane of the warp
	// must reach the call.
	// NOTE(review): confirm all call sites invoke shfl_xor from warp-uniform
	// control flow.
	#ifdef _SINGLE_SINGLE
	#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
	#define shfl_xor(var,laneMask,width) __shfl_xor_sync(0xffffffffu,(var),(laneMask),(width))
	#else
	#define shfl_xor __shfl_xor
	#endif
	#else
	// Double-precision variant: the value is split into its high and low
	// 32-bit halves, each half is shuffled separately, and the halves are
	// recombined into a double.
	ucl_inline double shfl_xor(double var, int laneMask, int width) {
	  int2 tmp;
	  tmp.x = __double2hiint(var);
	  tmp.y = __double2loint(var);
	#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
	  tmp.x = __shfl_xor_sync(0xffffffffu,tmp.x,laneMask,width);
	  tmp.y = __shfl_xor_sync(0xffffffffu,tmp.y,laneMask,width);
	#else
	  tmp.x = __shfl_xor(tmp.x,laneMask,width);
	  tmp.y = __shfl_xor(tmp.y,laneMask,width);
	#endif
	  return __hiloint2double(tmp.x,tmp.y);
	}
	#endif

	#endif

	// Values shared by every CUDA architecture branch above.
	// Warp width: threads guaranteed to be on the same instruction.
	#define WARP_SIZE 32
	// Cell-list block sizes: per-dimension 2D size, and atom-binning size.
	#define BLOCK_CELL_2D 8
	#define BLOCK_CELL_ID 128
	// Thread block size for PPPM kernels.
	#define PPPM_BLOCK_1D 64
	// Max # of atom type params stored in shared memory for "bio" pair styles.
	#define MAX_BIO_SHARED_TYPES 128

	// Texture fetch helpers.
	//   fetch4(ans,i,pos_tex): load the 4-component element i of pos_tex into ans
	//   fetch(ans,i,q_tex):    load the scalar element i of q_tex into ans
	//
	// With _DOUBLE_DOUBLE each double is rebuilt from two 32-bit integers via
	// __hiloint2double, so a 4-component element takes two int4 fetches at
	// indices 2*i and 2*i+1. The index argument is parenthesized so that an
	// expression such as j+1 is scaled as a whole.
	#ifdef _DOUBLE_DOUBLE
	#define fetch4(ans,i,pos_tex) { \
	int4 xy = tex1Dfetch(pos_tex,(i)*2); \
	int4 zt = tex1Dfetch(pos_tex,(i)*2+1); \
	ans.x=__hiloint2double(xy.y, xy.x); \
	ans.y=__hiloint2double(xy.w, xy.z); \
	ans.z=__hiloint2double(zt.y, zt.x); \
	ans.w=__hiloint2double(zt.w, zt.z); \
	}
	#define fetch(ans,i,q_tex) { \
	int2 qt = tex1Dfetch(q_tex,(i)); \
	ans=__hiloint2double(qt.y, qt.x); \
	}
	#else
	// Otherwise the fetched texel is assigned to ans directly.
	#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
	#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
	#endif

	// fast_mul: integer multiply used for index arithmetic.
	// MEM_THREADS: number of threads with sequential ids accessing memory
	// simultaneously on a multiprocessor.
	// An undefined __CUDA_ARCH__ evaluates as 0 here and selects the first
	// branch.
	#if (__CUDA_ARCH__ < 200)
	#define fast_mul __mul24
	#define MEM_THREADS 16
	#else
	// Whole expansion parenthesized so that e.g. a/fast_mul(b,c) divides by
	// the product rather than expanding to a/(b)*(c).
	#define fast_mul(X,Y) ((X)*(Y))
	#define MEM_THREADS 32
	#endif

	// 16-byte aligned four-component double vector, declared here only when
	// CUDA_PRE_THREE is defined (presumably for toolkits that do not supply
	// double4 themselves -- confirm).
	#if defined(CUDA_PRE_THREE)
	typedef struct __builtin_align__(16) _double4
	{
	  double x;
	  double y;
	  double z;
	  double w;
	} double4;
	#endif

	// Math function aliases for CUDA builds. ucl_recip(x) is always an
	// explicit division by x in numtyp precision.
	#if defined(_DOUBLE_DOUBLE)

	// Double precision: unsuffixed math functions.
	#define ucl_abs fabs
	#define ucl_atan atan
	#define ucl_cbrt cbrt
	#define ucl_ceil ceil
	#define ucl_exp exp
	#define ucl_powr pow
	#define ucl_recip(x) ((numtyp)1.0/(x))
	#define ucl_rsqrt rsqrt
	#define ucl_sqrt sqrt

	#else

	// Otherwise: f-suffixed single-precision functions.
	#define ucl_abs fabsf
	#define ucl_atan atanf
	#define ucl_cbrt cbrtf
	#define ucl_ceil ceilf
	#define ucl_recip(x) ((numtyp)1.0/(x))
	#define ucl_rsqrt rsqrtf
	#define ucl_sqrt sqrtf

	// exp/pow use the __expf/__powf intrinsics unless
	// NO_HARDWARE_TRANSCENDENTALS is defined.
	#ifndef NO_HARDWARE_TRANSCENDENTALS

	#define ucl_exp __expf
	#define ucl_powr __powf

	#else

	#define ucl_exp expf
	#define ucl_powr powf

	#endif

	#endif

	#endif

	// -------------------------------------------------------------------------
	// NVIDIA GENERIC OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#if defined(NV_GENERIC_OCL)

	// Select the OpenCL code path; integer multiplies use mul24.
	#define USE_OPENCL
	#define fast_mul mul24

	// Thread layout.
	#define WARP_SIZE 32
	#define MEM_THREADS 16
	#define THREADS_PER_ATOM 1
	#define THREADS_PER_CHARGE 1

	// Block sizes.
	#define BLOCK_PAIR 64
	#define BLOCK_BIO_PAIR 64
	#define BLOCK_NBOR_BUILD 64
	#define BLOCK_CELL_2D 8
	#define BLOCK_CELL_ID 128
	#define PPPM_BLOCK_1D 64

	// Limits on atom type params held in shared memory.
	#define MAX_SHARED_TYPES 8
	#define MAX_BIO_SHARED_TYPES 128

	#endif

	// -------------------------------------------------------------------------
	// NVIDIA FERMI OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#if defined(FERMI_OCL)

	// Select the OpenCL code path.
	#define USE_OPENCL

	// Thread layout.
	#define WARP_SIZE 32
	#define MEM_THREADS 32
	#define THREADS_PER_ATOM 4
	#define THREADS_PER_CHARGE 8

	// Block sizes.
	#define BLOCK_PAIR 128
	#define BLOCK_BIO_PAIR 128
	#define BLOCK_NBOR_BUILD 128
	#define BLOCK_CELL_2D 8
	#define BLOCK_CELL_ID 128
	#define PPPM_BLOCK_1D 64

	// Limits on atom type params held in shared memory.
	#define MAX_SHARED_TYPES 11
	#define MAX_BIO_SHARED_TYPES 128

	#endif

	// -------------------------------------------------------------------------
	// NVIDIA KEPLER OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#ifdef KEPLER_OCL

	// Select the OpenCL code path and set the tuning defaults for this target.
	#define USE_OPENCL
	#define MEM_THREADS 32
	#define THREADS_PER_ATOM 4
	#define THREADS_PER_CHARGE 8
	#define BLOCK_PAIR 256
	#define MAX_SHARED_TYPES 11
	#define BLOCK_NBOR_BUILD 128
	#define BLOCK_BIO_PAIR 256
	#define BLOCK_ELLIPSE 128

	#define WARP_SIZE 32
	#define PPPM_BLOCK_1D 64
	#define BLOCK_CELL_2D 8
	#define BLOCK_CELL_ID 128
	#define MAX_BIO_SHARED_TYPES 128

	// Unless NO_OCL_PTX is defined, report ARCH 300 and provide shfl_xor
	// through inline PTX (shfl.bfly.b32).
	#ifndef NO_OCL_PTX
	#define ARCH 300
	#ifdef _SINGLE_SINGLE
	// Butterfly shuffle of a float. The last operand c packs
	// (WARP_SIZE-width) shifted left by 8 bits together with 0x1f.
	inline float shfl_xor(float var, int laneMask, int width) {
	  float ret;
	  int c;
	  c = ((WARP_SIZE-width) << 8) | 0x1f;
	  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
	  return ret;
	}
	#else
	#pragma OPENCL EXTENSION cl_khr_fp64 : enable
	// Butterfly shuffle of a double: mov.b64 unpacks var into two 32-bit
	// registers, each is shuffled on its own, and a second mov.b64 packs the
	// two results back into a double.
	inline double shfl_xor(double var, int laneMask, int width) {
	  int c = ((WARP_SIZE-width) << 8) | 0x1f;
	  int x,y,x2,y2;
	  double ans;
	  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
	  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
	  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
	  asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
	  return ans;
	}
	#endif
	#endif

	#endif

	// -------------------------------------------------------------------------
	// AMD CYPRESS OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#if defined(CYPRESS_OCL)

	// Select the OpenCL code path.
	#define USE_OPENCL

	// Thread layout.
	#define WARP_SIZE 64
	#define MEM_THREADS 32
	#define THREADS_PER_ATOM 4
	#define THREADS_PER_CHARGE 8

	// Block sizes.
	#define BLOCK_PAIR 128
	#define BLOCK_BIO_PAIR 64
	#define BLOCK_NBOR_BUILD 64
	#define BLOCK_CELL_2D 8
	#define BLOCK_CELL_ID 128
	#define PPPM_BLOCK_1D 64

	// Limits on atom type params held in shared memory.
	#define MAX_SHARED_TYPES 8
	#define MAX_BIO_SHARED_TYPES 128

	#endif

	// -------------------------------------------------------------------------
	// INTEL CPU OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#if defined(INTEL_OCL)

	// Select the OpenCL code path.
	#define USE_OPENCL

	// Thread layout.
	#define WARP_SIZE 1
	#define MEM_THREADS 16
	#define THREADS_PER_ATOM 1
	#define THREADS_PER_CHARGE 1

	// Block sizes.
	// NOTE(review): the header comment of this file requires
	// PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE, which is 64 for the
	// default PPPM_MAX_SPLINE of 8 -- confirm that 32 is intended here.
	#define BLOCK_PAIR 1
	#define BLOCK_BIO_PAIR 2
	#define BLOCK_ELLIPSE 2
	#define BLOCK_NBOR_BUILD 4
	#define BLOCK_CELL_2D 1
	#define BLOCK_CELL_ID 2
	#define PPPM_BLOCK_1D 32

	// No atom type params are held in shared memory.
	#define MAX_SHARED_TYPES 0
	#define MAX_BIO_SHARED_TYPES 0

	#endif

	// -------------------------------------------------------------------------
	// INTEL PHI OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#if defined(PHI_OCL)

	// Select the OpenCL code path.
	#define USE_OPENCL

	// Thread layout.
	#define WARP_SIZE 1
	#define MEM_THREADS 16
	#define THREADS_PER_ATOM 1
	#define THREADS_PER_CHARGE 1

	// Block sizes.
	// NOTE(review): the header comment of this file requires
	// PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE, which is 64 for the
	// default PPPM_MAX_SPLINE of 8 -- confirm that 32 is intended here.
	#define BLOCK_PAIR 16
	#define BLOCK_BIO_PAIR 16
	#define BLOCK_ELLIPSE 16
	#define BLOCK_NBOR_BUILD 16
	#define BLOCK_CELL_2D 4
	#define BLOCK_CELL_ID 16
	#define PPPM_BLOCK_1D 32

	// No atom type params are held in shared memory.
	#define MAX_SHARED_TYPES 0
	#define MAX_BIO_SHARED_TYPES 0

	#endif

	// -------------------------------------------------------------------------
	// GENERIC OPENCL DEFINITIONS
	// -------------------------------------------------------------------------

	#if defined(GENERIC_OCL)

	// Select the OpenCL code path.
	#define USE_OPENCL

	// Thread layout.
	#define WARP_SIZE 1
	#define MEM_THREADS 16
	#define THREADS_PER_ATOM 1
	#define THREADS_PER_CHARGE 1

	// Block sizes.
	#define BLOCK_PAIR 64
	#define BLOCK_BIO_PAIR 64
	#define BLOCK_NBOR_BUILD 64
	#define BLOCK_CELL_2D 8
	#define BLOCK_CELL_ID 128
	#define PPPM_BLOCK_1D 64

	// Limits on atom type params held in shared memory.
	#define MAX_SHARED_TYPES 8
	#define MAX_BIO_SHARED_TYPES 128

	#endif

	// -------------------------------------------------------------------------
	// OPENCL Stuff for All Hardware
	// -------------------------------------------------------------------------
	#ifdef USE_OPENCL

	// Unless the build is pure single precision, enable a double-precision
	// extension. cl_amd_fp64 is requested only when it is the sole fp64 macro
	// defined; in every other case (cl_khr_fp64 defined, or neither macro
	// defined) cl_khr_fp64 is requested.
	#if !defined(_SINGLE_SINGLE)

	#if defined(cl_amd_fp64) && !defined(cl_khr_fp64)
	#pragma OPENCL EXTENSION cl_amd_fp64 : enable
	#else
	#pragma OPENCL EXTENSION cl_khr_fp64 : enable
	#endif

	#endif

	// Fallbacks for names a device-specific section may not have defined.

	// Plain multiply; the whole expansion is parenthesized so that e.g.
	// a/fast_mul(b,c) divides by the product rather than expanding to
	// a/(b)*(c).
	#ifndef fast_mul
	#define fast_mul(X,Y) ((X)*(Y))
	#endif

	// Architecture number defaults to 0 when not set above.
	#ifndef ARCH
	#define ARCH 0
	#endif

	// DRIVER defaults to 0 when not supplied by the build.
	#ifndef DRIVER
	#define DRIVER 0
	#endif

	// Work-item index helpers, dimension 0.
	#define THREAD_ID_X get_local_id(0)
	#define BLOCK_ID_X get_group_id(0)
	#define BLOCK_SIZE_X get_local_size(0)
	#define GLOBAL_ID_X get_global_id(0)
	#define GLOBAL_SIZE_X get_global_size(0)
	// Work-item index helpers, dimension 1.
	#define THREAD_ID_Y get_local_id(1)
	#define BLOCK_ID_Y get_group_id(1)

	// CUDA-style spellings expressed with OpenCL constructs.
	#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
	#define ucl_inline inline

	// Element fetches are plain array reads in the OpenCL path.
	#define fetch4(ans,i,x) ans=x[i]
	#define fetch(ans,i,q) ans=q[i]

	// Math aliases that do not depend on the transcendental setting.
	#define ucl_abs fabs
	#define ucl_atan atan
	#define ucl_cbrt cbrt
	#define ucl_ceil ceil

	// Full double precision builds always use the non-native functions.
	#if defined(_DOUBLE_DOUBLE)
	#define NO_HARDWARE_TRANSCENDENTALS
	#endif

	// exp/powr/rsqrt/sqrt/recip map to the native_* built-ins unless
	// NO_HARDWARE_TRANSCENDENTALS is defined, in which case the regular
	// functions and an explicit numtyp division are used.
	#ifndef NO_HARDWARE_TRANSCENDENTALS

	#define ucl_exp native_exp
	#define ucl_powr native_powr
	#define ucl_recip native_recip
	#define ucl_rsqrt native_rsqrt
	#define ucl_sqrt native_sqrt

	#else

	#define ucl_exp exp
	#define ucl_powr powr
	#define ucl_recip(x) ((numtyp)1.0/(x))
	#define ucl_rsqrt rsqrt
	#define ucl_sqrt sqrt

	#endif

	#endif

	// -------------------------------------------------------------------------
	// ARCHITECTURE INDEPENDENT DEFINITIONS
	// -------------------------------------------------------------------------

	// Maximum order for splines in PPPM; 8 unless already defined.
	#if !defined(PPPM_MAX_SPLINE)
	#define PPPM_MAX_SPLINE 8
	#endif

	// Precision selection. numtyp/numtyp2/numtyp4 and acctyp/acctyp4 are set
	// from the precision flag; when no earlier definition of numtyp exists,
	// everything defaults to single precision.

	// _DOUBLE_DOUBLE: both type families are double.
	#if defined(_DOUBLE_DOUBLE)
	#define acctyp double
	#define acctyp4 double4
	#define numtyp double
	#define numtyp2 double2
	#define numtyp4 double4
	#endif

	// _SINGLE_DOUBLE: float numtyp family, double acctyp family.
	#if defined(_SINGLE_DOUBLE)
	#define acctyp double
	#define acctyp4 double4
	#define numtyp float
	#define numtyp2 float2
	#define numtyp4 float4
	#endif

	// Default: both type families are float.
	#if !defined(numtyp)
	#define acctyp float
	#define acctyp4 float4
	#define numtyp float
	#define numtyp2 float2
	#define numtyp4 float4
	#endif

	// Numeric constants cast to numtyp.
	// A1..A5 together with EWALD_P match the coefficients of the Abramowitz &
	// Stegun 7.1.26 polynomial approximation of erfc; presumably they are used
	// that way by the kernels (confirm at the call sites).
	#define A1 (numtyp)0.254829592
	#define A2 (numtyp)-0.284496736
	#define A3 (numtyp)1.421413741
	#define A4 (numtyp)-1.453152027
	#define A5 (numtyp)1.061405429
	#define EWALD_P (numtyp)0.3275911
	// 1.12837917 is 2/sqrt(pi) to the digits given.
	#define EWALD_F (numtyp)1.12837917

	// SBBITS: bit position where the two-bit field read by sbmask starts.
	// NEIGHMASK: mask selecting the low 30 bits (everything below SBBITS).
	#define SBBITS 30
	#define NEIGHMASK 0x3FFFFFFF
	// Returns the two-bit value held in bits 30-31 of j.
	// Presumably this is a special-bond flag packed into the top bits of a
	// neighbor index -- confirm against the callers.
	ucl_inline int sbmask(int j) { return (j >> SBBITS) & 3; }

	// Targets that did not set BLOCK_ELLIPSE reuse the BLOCK_PAIR value.
	#if !defined(BLOCK_ELLIPSE)
	#define BLOCK_ELLIPSE BLOCK_PAIR
	#endif

	// When none of the integer-size options is chosen, select LAMMPS_SMALLBIG:
	// 32-bit smallint and other ints, 64-bit bigint (same default as
	// src/lmptype.h).
	#if !(defined(LAMMPS_SMALLSMALL) || defined(LAMMPS_BIGBIG) || defined(LAMMPS_SMALLBIG))
	#define LAMMPS_SMALLBIG
	#endif

lal_preprocessor.hNo OneTemporaryActions

File Metadata

lal_preprocessor.hView Options

Event Timeline

lal_preprocessor.h
No OneTemporary
Actions

lal_preprocessor.h
View Options