diff --git a/doc/package.txt b/doc/package.txt index a74bded19..54f534313 100644 --- a/doc/package.txt +++ b/doc/package.txt @@ -1,286 +1,287 @@ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c :link(lws,http://lammps.sandia.gov) :link(ld,Manual.html) :link(lc,Section_commands.html#comm) :line package command :h3 [Syntax:] package style args :pre style = {gpu} or {cuda} or {omp} :ulb,l args = arguments specific to the style :l {gpu} args = mode first last split keyword value ... mode = force or force/neigh first = ID of first GPU to be used on each node last = ID of last GPU to be used on each node split = fraction of particles assigned to the GPU zero or more keyword/value pairs may be appended keywords = {threads_per_atom} or {cellsize} or {device} {threads_per_atom} value = Nthreads Nthreads = # of GPU threads used per atom {cellsize} value = dist dist = length (distance units) in each dimension for neighbor bins {device} value = device_type - device_type = {kepler} or {fermi} or {cypress} or {generic} + device_type = {kepler} or {fermi} or {cypress} or {phi} or {intel} or {generic} {cuda} args = keyword value ... one or more keyword/value pairs may be appended keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa} {gpu/node} value = N N = number of GPUs to be used per node {gpu/node/special} values = N gpu1 .. gpuN N = number of GPUs to be used per node gpu1 .. gpuN = N IDs of the GPUs to use {timing} values = none {test} values = id id = atom-ID of a test particle {override/bpa} values = flag flag = 0 for TpA algorithm, 1 for BpA algorithm {omp} args = Nthreads mode Nthreads = # of OpenMP threads to associate with each MPI process mode = force or force/neigh (optional) :pre :ule [Examples:] package gpu force 0 0 1.0 package gpu force 0 0 0.75 package gpu force/neigh 0 0 1.0 package gpu force/neigh 0 1 -1.0 package cuda gpu/node/special 2 0 2 package cuda test 3948 package omp * force/neigh package omp 4 force :pre [Description:] This command invokes package-specific settings. Currently the following packages use it: GPU, USER-CUDA, and USER-OMP. To use the accelerated GPU and USER-OMP styles, the use of the package command is required. However, as described in the "Defaults" section below, if you use the "-sf gpu" or "-sf omp" "command-line options"_Section_start.html#start_7 to enable use of these styles, then default package settings are enabled. In that case you only need to use the package command if you want to change the defaults. To use the accelerated USER-CUDA styles, the package command is not required as defaults are assigned internally. You only need to use the package command if you want to change the defaults. See "Section_accelerate"_Section_accelerate.html of the manual for more details about using these various packages for accelerating LAMMPS calculations. :line The {gpu} style invokes options associated with the use of the GPU package. The {mode} setting specifies where neighbor list calculations will be performed. If {mode} is force, neighbor list calculation is performed on the CPU. If {mode} is force/neigh, neighbor list calculation is performed on the GPU. GPU neighbor list calculation currently cannot be used with a triclinic box. GPU neighbor list calculation currently cannot be used with "hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines.
In these cases, it will typically be more efficient to only use CPU neighbor list builds. The {first} and {last} settings specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from {first} to {last} will be used. The {split} setting can be used for load balancing force calculation work between CPU and GPU cores in GPU-enabled pair styles. If 0 < {split} < 1.0, a fixed fraction of particles is offloaded to the GPU while force calculation for the other particles occurs simultaneously on the CPU. If {split}<0, the optimal fraction (based on CPU and GPU timings) is calculated every 25 timesteps. If {split} = 1.0, all force calculations for GPU-accelerated pair styles are performed on the GPU. In this case, "hybrid"_pair_hybrid.html, "bond"_bond_style.html, "angle"_angle_style.html, "dihedral"_dihedral_style.html, "improper"_improper_style.html, and "long-range"_kspace_style.html calculations can be performed on the CPU while the GPU is performing force calculations for the GPU-enabled pair style. If all CPU force computations complete before the GPU, LAMMPS will block until the GPU has finished before continuing the timestep. As an example, if you have two GPUs per node and 8 CPU cores per node, and would like to run on 4 nodes (32 cores) with dynamic balancing of force calculation across CPU and GPU cores, you could specify package gpu force/neigh 0 1 -1 :pre In this case, all CPU cores and GPU devices on the nodes would be utilized. Each GPU device would be shared by 4 CPU cores. The CPU cores would perform force calculations for some fraction of the particles at the same time the GPUs perform force calculations for the other particles. The {threads_per_atom} keyword allows control of the number of GPU threads used per atom to perform the short range force calculation. By default, the value will be chosen based on the pair style; however, the value can be set with this keyword to fine-tune performance. For large cutoffs or with a small number of particles per GPU, increasing the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32. The {cellsize} keyword can be used to control the size of the cells used for binning atoms in neighbor list calculations. Setting this value is normally not needed; the optimal value is close to the default (equal to the cutoff distance for the short range interactions plus the neighbor skin). GPUs can perform efficiently with much larger cutoffs than CPUs and this can be used to reduce the time required for long-range calculations or in some cases to eliminate them with models such as "coul/wolf"_pair_coul.html or "coul/dsf"_pair_coul.html. For very large cutoffs, it can be more efficient to use smaller values for cellsize in parallel simulations. For example, with a cutoff of 20*sigma and a neighbor skin of sigma, a cellsize of 5.25*sigma can be efficient for parallel simulations. The {device} keyword can be used to tune parameters to optimize for a specific accelerator when using OpenCL. For CUDA, the {device} keyword is ignored. Currently, the device type is limited to NVIDIA Kepler, NVIDIA Fermi, -AMD Cypress, or a generic device. More devices will be added soon. The default -device type can be specified when building LAMMPS with the GPU library. +AMD Cypress, Intel CPU, Intel Phi, or a generic device. More devices will be +added soon. The default device type can be specified when building LAMMPS with +the GPU library.
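As an illustration only (this combined command does not appear in the Examples above, and the particular values are arbitrary choices for the sketch), the keywords discussed in this section can be appended to a single package command:

package gpu force/neigh 0 1 1.0 threads_per_atom 4 device fermi :pre

This would use GPUs 0 and 1 on each node, build neighbor lists and compute all pair forces for GPU-enabled styles on the GPU, use 4 GPU threads per atom, and tune OpenCL parameters for a Fermi device.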
:line The {cuda} style invokes options associated with the use of the USER-CUDA package. The {gpu/node} keyword specifies the number {N} of GPUs to be used on each node. An MPI process with rank {K} will use the GPU (K mod N). This implies that processes should be assigned with successive ranks on each node, which is the default with most (or even all) MPI implementations. The default value for {N} is 2. The {gpu/node/special} keyword also specifies the number (N) of GPUs to be used on each node, but allows more control over their specification. An MPI process with rank {K} will use the GPU {gpuI} with I = (K mod N) + 1. This implies that processes should be assigned with successive ranks on each node, which is the default with most (or even all) MPI implementations. For example, if you have three GPUs on a machine, one of which is used for the X-Server (the GPU with the ID 1) while the others (with IDs 0 and 2) are used for computations, you would specify: package cuda gpu/node/special 2 0 2 :pre A main purpose of the {gpu/node/special} option is to allow two (or more) simulations to be run on one workstation. In that case one would set the first simulation to use GPU 0 and the second to use GPU 1. This is not necessary, though, if the GPUs are in what is called {compute exclusive} mode. Using that setting, every process will get its own GPU automatically. This {compute exclusive} mode can be set as root using the {nvidia-smi} tool which is part of the CUDA installation. Note that if the {gpu/node/special} keyword is not used, the USER-CUDA package sorts existing GPUs on each node according to their number of multiprocessors. This way, compute GPUs will be prioritized over X-Server GPUs. Use of the {timing} keyword will output detailed timing information for various subroutines. The {test} keyword will output info for the specified atom at several points during each time step. This is mainly useful for debugging purposes. Note that the simulation will be severely slowed down if this option is used. The {override/bpa} keyword can be used to specify which mode is used for pair-force evaluation. TpA = one thread per atom; BpA = one block per atom. If this keyword is not used, a short test at the beginning of each run will determine which method is more effective (the result of this test is part of the LAMMPS output). Therefore it is usually not necessary to use this keyword. :line The {omp} style invokes options associated with the use of the USER-OMP package. The first argument allows you to explicitly set the number of OpenMP threads to be allocated for each MPI process. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could run MPI on 2 cores on each node (e.g. using options for the mpirun command), and set the {Nthreads} setting to 4. This would effectively use all 8 cores on each node, since each MPI process would spawn 4 threads (one of which runs as part of the MPI process itself). For performance reasons, you should not set {Nthreads} to more threads than there are physical cores (per MPI task), but LAMMPS cannot check for this. An {Nthreads} value of '*' instructs LAMMPS to use whatever is the default for the given OpenMP environment. This is usually determined via the {OMP_NUM_THREADS} environment variable or the compiler runtime. Please note that in most cases the default for OpenMP-capable compilers is to use one thread for each available CPU core when {OMP_NUM_THREADS} is not set, which can lead to extremely bad performance.
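As a sketch of the dual quad-core example above (the executable name lmp_machine and the script name in.script are placeholders, and the exact mpirun options for placing MPI tasks on nodes vary between MPI implementations), a single such node could be driven with

mpirun -np 2 lmp_machine -sf omp -in in.script :pre

and an input script that contains, near the top,

package omp 4 force/neigh :pre

so that each of the 2 MPI tasks spawns 4 OpenMP threads, occupying all 8 cores of the node.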
Which combination of threads and MPI tasks gives the best performance is difficult to predict and can depend on many components of your input. Not all features of LAMMPS support OpenMP, and the parallel efficiency can be very different, too. The {mode} setting specifies where neighbor list calculations will be multi-threaded as well. If {mode} is force, neighbor list calculation is performed in serial. If {mode} is force/neigh, a multi-threaded neighbor list build is used. Using the force/neigh setting is almost always faster and should produce identical neighbor lists at the expense of using somewhat more memory (neighbor list pages are always allocated for all threads at the same time and each thread works on its own pages). :line [Restrictions:] This command cannot be used after the simulation box is defined by a "read_data"_read_data.html or "create_box"_create_box.html command. The cuda style of this command can only be invoked if LAMMPS was built with the USER-CUDA package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. The gpu style of this command can only be invoked if LAMMPS was built with the GPU package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. When using the "r-RESPA run style"_run_style.html, GPU-accelerated styles can only be used on the outermost RESPA level. The omp style of this command can only be invoked if LAMMPS was built with the USER-OMP package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. [Related commands:] "suffix"_suffix.html [Default:] If the "-sf gpu" "command-line switch"_Section_start.html#start_7 is used, then it is as if the command "package gpu force/neigh 0 0 1" were invoked, to specify default settings for the GPU package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. The default settings for the USER-CUDA package are "package cuda gpu/node 2". This is the case whether the "-sf cuda" "command-line switch"_Section_start.html#start_7 is used or not. If the "-sf omp" "command-line switch"_Section_start.html#start_7 is used, then it is as if the command "package omp *" were invoked, to specify default settings for the USER-OMP package. If the command-line switch is not used, then no defaults are set, and you must specify the appropriate package command in your input script. diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 5fffe77c8..12a18ae87 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -1,457 +1,463 @@ /*************************************************************************** nvd_device.h ------------------- W. Michael Brown Utilities for dealing with cuda devices __________________________________________________________________________ This file is part of the Geryon Unified Coprocessor Library (UCL) __________________________________________________________________________ begin : Thu Jan 21 2010 copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the Simplified BSD License.
----------------------------------------------------------------------- */ #ifndef NVD_DEVICE #define NVD_DEVICE #include <string> #include <vector> #include <iostream> #include "nvd_macros.h" #include "ucl_types.h" namespace ucl_cudadr { // -------------------------------------------------------------------------- // - COMMAND QUEUE STUFF // -------------------------------------------------------------------------- typedef CUstream command_queue; inline void ucl_sync(CUstream &stream) { CU_SAFE_CALL(cuStreamSynchronize(stream)); } struct NVDProperties { int device_id; std::string name; int major; int minor; CUDA_INT_TYPE totalGlobalMem; int multiProcessorCount; CUdevprop_st p; int kernelExecTimeoutEnabled; int integrated; int canMapHostMemory; int concurrentKernels; int ECCEnabled; int computeMode; }; /// Class for looking at device properties /** \note Calls to change the device outside of the class results in incorrect * behavior * \note There is no error checking for indexing past the number of devices **/ class UCL_Device { public: /// Collect properties for every GPU on the node /** \note You must set the active GPU with set() before using the device **/ inline UCL_Device(); inline ~UCL_Device(); /// Returns 1 (For compatibility with OpenCL) inline int num_platforms() { return 1; } /// Return a string with name and info of the current platform inline std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; } /// Delete any contexts/data and set the platform number to be used inline int set_platform(const int pid); /// Return the number of devices that support CUDA inline int num_devices() { return _properties.size(); } /// Set the CUDA device to the specified device number /** A context and default command queue will be created for the device * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * be allocated for use. clear() is called to delete any contexts and * associated data from previous calls to set(). 
**/ inline int set(int num); /// Delete any context and associated data stored from a call to set() inline void clear(); /// Get the current device number inline int device_num() { return _device; } /// Returns the default stream for the current device inline command_queue & cq() { return cq(0); } /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } /// Block until all commands in the default stream have completed inline void sync() { sync(0); } /// Block until all commands in the specified stream have completed inline void sync(const int i) { ucl_sync(cq(i)); } /// Get the number of command queues currently available on device inline int num_queues() { return _cq.size(); } /// Add a stream for device computations inline void push_command_queue() { _cq.push_back(CUstream()); CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0)); } /// Remove a stream for device computations /** \note You cannot delete the default stream **/ inline void pop_command_queue() { if (_cq.size()<2) return; CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back())); _cq.pop_back(); } /// Set the default command queue (by default this is the null stream) /** \param i index of the command queue (as added by push_command_queue()) If i is 0, the default command queue is set to the null stream **/ inline void set_command_queue(const int i) { if (i==0) _cq[0]=0; else _cq[0]=_cq[i]; } /// Get the current CUDA device name inline std::string name() { return name(_device); } /// Get the CUDA device name inline std::string name(const int i) { return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } /// Get a string telling the type of the device inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device inline bool double_precision(const int i) {return arch(i)>=1.3;} + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].multiProcessorCount; } + /// Get the number of cores in the current device inline unsigned cores() { return cores(_device); } /// Get the number of cores inline unsigned cores(const int i) { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; else if (arch(i)<2.1) return _properties[i].multiProcessorCount*32; else if (arch(i)<3.0) return _properties[i].multiProcessorCount*48; else return _properties[i].multiProcessorCount*192; } /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory inline double gigabytes(const int i) { return 
static_cast<double>(_properties[i].totalGlobalMem)/1073741824; } /// Get the bytes of global memory in the current device inline size_t bytes() { return bytes(_device); } /// Get the bytes of global memory inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; } // Get the gigabytes of free memory in the current device inline double free_gigabytes() { return free_gigabytes(_device); } // Get the gigabytes of free memory inline double free_gigabytes(const int i) { return static_cast<double>(free_bytes(i))/1073741824; } // Get the bytes of free memory in the current device inline size_t free_bytes() { return free_bytes(_device); } // Get the bytes of free memory inline size_t free_bytes(const int i) { CUDA_INT_TYPE dfree, dtotal; CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal)); return static_cast<size_t>(dfree); } /// Return the GPGPU compute capability for current device inline double arch() { return arch(_device); } /// Return the GPGPU compute capability inline double arch(const int i) { return static_cast<double>(_properties[i].minor)/10+_properties[i].major;} /// Clock rate in GHz for current device inline double clock_rate() { return clock_rate(_device); } /// Clock rate in GHz inline double clock_rate(const int i) { return _properties[i].p.clockRate*1e-6;} /// Get the maximum number of threads per block inline size_t group_size() { return group_size(_device); } /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].p.maxThreadsPerBlock; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported(const int i) { return (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT); } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } /// True if splitting device into equal subdevices supported inline bool fission_equal(const int i) { return false; } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts() { return fission_by_counts(_device); } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts(const int i) { return false; } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity() { return fission_by_affinity(_device); } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return false; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() { return max_sub_devices(_device); } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return 0; } /// List all devices along with all properties inline void print_all(std::ostream &out); private: int _device, _num_devices; std::vector<NVDProperties> _properties; std::vector<CUstream> _cq; CUdevice _cu_device; CUcontext _context; }; // Grabs the properties for all devices 
UCL_Device::UCL_Device() { CU_SAFE_CALL_NS(cuInit(0)); CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices)); for (int dev=0; dev<_num_devices; ++dev) { CUdevice m; CU_SAFE_CALL_NS(cuDeviceGet(&m,dev)); int major, minor; CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m)); if (major==9999) continue; _properties.push_back(NVDProperties()); _properties.back().device_id=dev; _properties.back().major=major; _properties.back().minor=minor; char namecstr[1024]; CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m)); _properties.back().name=namecstr; CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m)); CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, m)); CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m)); #if CUDA_VERSION >= 2020 CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,dev)); #endif #if CUDA_VERSION >= 3010 CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev)); CU_SAFE_CALL_NS(cuDeviceGetAttribute( &_properties.back().ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); #endif } _device=-1; _cq.push_back(CUstream()); _cq.back()=0; } UCL_Device::~UCL_Device() { clear(); } int UCL_Device::set_platform(const int pid) { clear(); #ifdef UCL_DEBUG assert(pid<num_platforms()); #endif return UCL_SUCCESS; } // Set the CUDA device to the specified device number int UCL_Device::set(int num) { clear(); _device=_properties[num].device_id; CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device)); CUresult err=cuCtxCreate(&_context,0,_cu_device); if (err!=CUDA_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not access accelerator number " << num << " for use.\n"; UCL_GERYON_EXIT; #endif return UCL_ERROR; } return UCL_SUCCESS; } void UCL_Device::clear() { if (_device>-1) { for (int i=1; i<num_queues(); i++) pop_command_queue(); cuCtxDestroy(_context); } _device=-1; } // List all devices along with all properties void UCL_Device::print_all(std::ostream &out) { #if CUDA_VERSION >= 2020 int driver_version; cuDriverGetVersion(&driver_version); out << "CUDA Driver Version: " << driver_version/1000 << "." 
<< driver_version%100 << std::endl; #endif if (num_devices() == 0) out << "There is no device supporting CUDA\n"; for (int i=0; i<num_devices(); ++i) { out << "\nDevice " << i << ": \"" << name(i) << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; out << " Compute capability: " << arch(i) << std::endl; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; else out << "No\n"; out << " Total amount of global memory: " << gigabytes(i) << " GB\n"; #if CUDA_VERSION >= 2000 out << " Number of compute units/multiprocessors: " << _properties[i].multiProcessorCount << std::endl; out << " Number of cores: " << cores(i) << std::endl; #endif out << " Total amount of constant memory: " << _properties[i].p.totalConstantMemory << " bytes\n"; out << " Total amount of local/shared memory per block: " << _properties[i].p.sharedMemPerBlock << " bytes\n"; out << " Total number of registers available per block: " << _properties[i].p.regsPerBlock << std::endl; out << " Warp size: " << _properties[i].p.SIMDWidth << std::endl; out << " Maximum number of threads per block: " << _properties[i].p.maxThreadsPerBlock << std::endl; out << " Maximum group size (# of threads per block) " << _properties[i].p.maxThreadsDim[0] << " x " << _properties[i].p.maxThreadsDim[1] << " x " << _properties[i].p.maxThreadsDim[2] << std::endl; out << " Maximum item sizes (# threads for each dim) " << _properties[i].p.maxGridSize[0] << " x " << _properties[i].p.maxGridSize[1] << " x " << _properties[i].p.maxGridSize[2] << std::endl; out << " Maximum memory pitch: " << max_pitch(i) << " bytes\n"; out << " Texture alignment: " << _properties[i].p.textureAlign << " bytes\n"; out << " Clock rate: " << clock_rate(i) << " GHz\n"; #if CUDA_VERSION >= 2020 out << " Run time limit on kernels: "; if (_properties[i].kernelExecTimeoutEnabled) out << "Yes\n"; else out << "No\n"; out << " Integrated: "; if (_properties[i].integrated) out << "Yes\n"; else out << "No\n"; out << " Support host page-locked memory mapping: "; if (_properties[i].canMapHostMemory) out << "Yes\n"; else out << "No\n"; out << " Compute mode: "; if (_properties[i].computeMode == CU_COMPUTEMODE_DEFAULT) out << "Default\n"; // multiple threads can use device else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE) out << "Exclusive\n"; // only thread can use device else if (_properties[i].computeMode == CU_COMPUTEMODE_PROHIBITED) out << "Prohibited\n"; // no thread can use device #if CUDART_VERSION >= 4000 else if (_properties[i].computeMode == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) out << "Exclusive Process\n"; // multiple threads 1 process #endif else out << "Unknown\n"; #endif #if CUDA_VERSION >= 3010 out << " Concurrent kernel execution: "; if (_properties[i].concurrentKernels) out << "Yes\n"; else out << "No\n"; out << " Device has ECC support enabled: "; if (_properties[i].ECCEnabled) out << "Yes\n"; else out << "No\n"; #endif } } } #endif diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 79fa53d55..8dadcf2ef 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -1,598 +1,597 @@ /*************************************************************************** ocl_device.h ------------------- W. 
Michael Brown Utilities for dealing with OpenCL devices __________________________________________________________________________ This file is part of the Geryon Unified Coprocessor Library (UCL) __________________________________________________________________________ begin : Mon Dec 23 2009 copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the Simplified BSD License. ----------------------------------------------------------------------- */ #ifndef OCL_DEVICE #define OCL_DEVICE #include <string> #include <vector> #include <iostream> #ifdef __APPLE__ #include <OpenCL/cl.h> #include <OpenCL/cl_platform.h> #else #include <CL/cl.h> #include <CL/cl_platform.h> #endif #include "ocl_macros.h" #include "ucl_types.h" namespace ucl_opencl { // -------------------------------------------------------------------------- // - COMMAND QUEUE STUFF // -------------------------------------------------------------------------- typedef cl_command_queue command_queue; typedef cl_context context_type; inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } inline bool _shared_mem_device(cl_device_type &device_type) { return (device_type==CL_DEVICE_TYPE_CPU); } struct OCLProperties { std::string name; cl_device_type device_type; cl_ulong global_mem; cl_ulong shared_mem; cl_ulong const_mem; cl_uint compute_units; cl_uint clock; size_t work_group_size; size_t work_item_size[3]; bool double_precision; int alignment; size_t timer_resolution; bool ecc_support; std::string c_version; bool partition_equal, partition_counts, partition_affinity; cl_uint max_sub_devices; }; /// Class for looking at data parallel device properties /** \note Calls to change the device outside of the class results in incorrect * behavior * \note There is no error checking for indexing past the number of devices **/ class UCL_Device { public: /// Collect properties for every device on the node /** \note You must set the active GPU with set() before using the device **/ inline UCL_Device(); inline ~UCL_Device(); /// Return the number of platforms (0 if error or no platforms) inline int num_platforms() { return _num_platforms; } /// Return a string with name and info of the current platform inline std::string platform_name(); /// Delete any contexts/data and set the platform number to be used inline int set_platform(const int pid); /// Return the number of devices that support OpenCL inline int num_devices() { return _num_devices; } /// Set the OpenCL device to the specified device number /** A context and default command queue will be created for the device * * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not * be allocated for use. clear() is called to delete any contexts and * associated data from previous calls to set(). 
**/ inline int set(int num); /// Delete any context and associated data stored from a call to set() inline void clear(); /// Get the current device number inline int device_num() { return _device; } /// Returns the context for the current device inline cl_context & context() { return _context; } /// Returns the default stream for the current device inline command_queue & cq() { return cq(_default_cq); } /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } /// Set the default command queue /** \param i index of the command queue (as added by push_command_queue()) If i is 0, the command queue created with device initialization is used **/ inline void set_command_queue(const int i) { _default_cq=i; } /// Block until all commands in the default stream have completed inline void sync() { sync(_default_cq); } /// Block until all commands in the specified stream have completed inline void sync(const int i) { ucl_sync(cq(i)); } /// Get the number of command queues currently available on device inline int num_queues() { return _cq.size(); } /// Add a command queue for device computations (with profiling enabled) inline void push_command_queue() { cl_int errorv; _cq.push_back(cl_command_queue()); _cq.back()=clCreateCommandQueue(_context,_cl_device, CL_QUEUE_PROFILING_ENABLE,&errorv); if (errorv!=CL_SUCCESS) { std::cerr << "Could not create command queue on device: " << name() << std::endl; UCL_GERYON_EXIT; } } /// Remove a stream for device computations /** \note You cannot delete the default stream **/ inline void pop_command_queue() { if (_cq.size()<2) return; CL_SAFE_CALL(clReleaseCommandQueue(_cq.back())); _cq.pop_back(); } /// Get the current OpenCL device name inline std::string name() { return name(_device); } /// Get the OpenCL device name inline std::string name(const int i) { return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } /// Get a string telling the type of the device inline std::string device_type_name(const int i); /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) inline int device_type(const int i); /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return _shared_mem_device(_properties[i].device_type); } /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device inline bool double_precision(const int i) {return _properties[i].double_precision;} - /// Get the number of cores in the current device - inline unsigned cores() { return cores(_device); } - /// Get the number of cores - inline unsigned cores(const int i) - { if (device_type(i)==UCL_CPU) return _properties[i].compute_units; - else return _properties[i].compute_units*8; } - + /// Get the number of compute units on the current device + inline unsigned cus() { return cus(_device); } + /// Get the number of compute units + inline unsigned cus(const int i) + { return _properties[i].compute_units; } + /// Get the gigabytes of global memory in the current device inline double 
gigabytes() { return gigabytes(_device); } /// Get the gigabytes of global memory inline double gigabytes(const int i) { return static_cast<double>(_properties[i].global_mem)/1073741824; } /// Get the bytes of global memory in the current device inline size_t bytes() { return bytes(_device); } /// Get the bytes of global memory inline size_t bytes(const int i) { return _properties[i].global_mem; } /// Return the GPGPU revision number for current device //inline double revision() { return revision(_device); } /// Return the GPGPU revision number //inline double revision(const int i) // { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;} /// Clock rate in GHz for current device inline double clock_rate() { return clock_rate(_device); } /// Clock rate in GHz inline double clock_rate(const int i) { return _properties[i].clock*1e-3;} /// Return the address alignment in bytes inline int alignment() { return alignment(_device); } /// Return the address alignment in bytes inline int alignment(const int i) { return _properties[i].alignment; } /// Return the timer resolution inline size_t timer_resolution() { return timer_resolution(_device); } /// Return the timer resolution inline size_t timer_resolution(const int i) { return _properties[i].timer_resolution; } /// Get the maximum number of threads per block inline size_t group_size() { return group_size(_device); } /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].work_group_size; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes inline size_t max_pitch(const int i) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported(const int i) { return true; } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } /// True if splitting device into equal subdevices supported inline bool fission_equal(const int i) { return _properties[i].partition_equal; } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts() { return fission_by_counts(_device); } /// True if splitting device into subdevices by specified counts supported inline bool fission_by_counts(const int i) { return _properties[i].partition_counts; } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity() { return fission_by_affinity(_device); } /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return _properties[i].partition_affinity; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() { return max_sub_devices(_device); } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return _properties[i].max_sub_devices; } /// List all devices along with all properties inline void print_all(std::ostream &out); /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } private: int _num_platforms; // Number of platforms int _platform; // 
UCL_Device ID for current platform cl_platform_id _cl_platform; // OpenCL ID for current platform cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms cl_context _context; // Context used for accessing the device std::vector<cl_command_queue> _cq;// The default command queue for this device int _device; // UCL_Device ID for current device cl_device_id _cl_device; // OpenCL ID for current device std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices int _num_devices; // Number of devices std::vector<OCLProperties> _properties; // Properties for each device inline void add_properties(cl_device_id); inline int create_context(); int _default_cq; }; // Grabs the properties for all devices UCL_Device::UCL_Device() { _device=-1; // --- Get Number of Platforms cl_uint nplatforms; cl_int errorv=clGetPlatformIDs(20,_cl_platforms,&nplatforms); if (errorv!=CL_SUCCESS) { _num_platforms=0; return; } else _num_platforms=static_cast<int>(nplatforms); set_platform(0); } UCL_Device::~UCL_Device() { clear(); } void UCL_Device::clear() { if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); _cq.pop_back(); } CL_DESTRUCT_CALL(clReleaseContext(_context)); } _device=-1; } int UCL_Device::set_platform(int pid) { clear(); cl_int errorv; _cl_device=0; _device=-1; _num_devices=0; _default_cq=0; #ifdef UCL_DEBUG assert(pid<num_platforms()); #endif _platform=pid; _cl_platform=_cl_platforms[_platform]; // --- Get Number of Devices cl_uint n; errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n); _num_devices=n; if (errorv!=CL_SUCCESS || _num_devices==0) { _num_devices=0; return UCL_ERROR; } cl_device_id device_list[_num_devices]; CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, &n)); // --- Store properties for each device for (int i=0; i<_num_devices; i++) { _cl_devices.push_back(device_list[i]); add_properties(device_list[i]); } return UCL_SUCCESS; } int UCL_Device::create_context() { cl_int errorv; cl_context_properties props[3]; props[0]=CL_CONTEXT_PLATFORM; props[1]=_platform; props[2]=0; _context=clCreateContext(0,1,&_cl_device,NULL,NULL,&errorv); if (errorv!=CL_SUCCESS) { #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not access accelerator number " << _device << " for use.\n"; UCL_GERYON_EXIT; #endif return UCL_ERROR; } push_command_queue(); _default_cq=0; return UCL_SUCCESS; } void UCL_Device::add_properties(cl_device_id device_list) { OCLProperties op; char buffer[1024]; cl_bool ans_bool; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL)); op.name=buffer; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(op.global_mem),&op.global_mem,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE, sizeof(op.shared_mem),&op.shared_mem,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(op.const_mem),&op.const_mem,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE, sizeof(op.device_type),&op.device_type,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(op.compute_units),&op.compute_units, NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(op.clock),&op.clock,NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(op.work_group_size),&op.work_group_size, NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(op.work_item_size[0]),op.work_item_size, 
NULL)); CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint),&op.alignment,NULL)); op.alignment/=8; // Determine if double precision is supported cl_uint double_width; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(double_width),&double_width,NULL)); if (double_width==0) op.double_precision=false; else op.double_precision=true; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t),&op.timer_resolution,NULL)); op.ecc_support=false; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(ans_bool),&ans_bool,NULL)); if (ans_bool==CL_TRUE) op.ecc_support=true; op.c_version=""; op.partition_equal=false; op.partition_counts=false; op.partition_affinity=false; #ifdef CL_VERSION_1_2 size_t return_bytes; CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024, buffer,NULL)); op.c_version=buffer; cl_device_partition_property pinfo[4]; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_PROPERTIES, 4*sizeof(cl_device_partition_property), pinfo,&return_bytes)); int nprops=return_bytes/sizeof(cl_device_partition_property); for (int i=0; i<nprops; i++) { if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY) op.partition_equal=true; else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS) op.partition_counts=true; else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) op.partition_affinity=true; } CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, sizeof(cl_uint),&op.max_sub_devices,NULL)); #endif _properties.push_back(op); } std::string UCL_Device::platform_name() { char info[1024]; CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info, NULL)); std::string ans=std::string(info)+' '; CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info, NULL)); ans+=std::string(info)+' '; CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info, NULL)); ans+=std::string(info); return ans; } // Get a string telling the type of the device std::string UCL_Device::device_type_name(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return "CPU"; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) return "GPU"; else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) return "ACCELERATOR"; else return "DEFAULT"; } // Get a string telling the type of the device int UCL_Device::device_type(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return UCL_CPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) return UCL_GPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR) return UCL_ACCELERATOR; else return UCL_DEFAULT; } // Set the CUDA device to the specified device number int UCL_Device::set(int num) { clear(); cl_device_id device_list[_num_devices]; cl_uint n; CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, device_list,&n)); _device=num; _cl_device=device_list[_device]; return create_context(); } // List all devices along with all properties void UCL_Device::print_all(std::ostream &out) { if (num_devices() == 0) out << "There is no device supporting OpenCL\n"; for (int i=0; i<num_devices(); ++i) { out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; else out << "No\n"; out << " Total amount of global memory: " << gigabytes(i) << " GB\n"; 
out << " Number of compute units/multiprocessors: " << _properties[i].compute_units << std::endl; //out << " Number of cores: " // << cores(i) << std::endl; out << " Total amount of constant memory: " << _properties[i].const_mem << " bytes\n"; out << " Total amount of local/shared memory per block: " << _properties[i].shared_mem << " bytes\n"; //out << " Total number of registers available per block: " // << _properties[i].regsPerBlock << std::endl; //out << " Warp size: " // << _properties[i].warpSize << std::endl; out << " Maximum group size (# of threads per block) " << _properties[i].work_group_size << std::endl; out << " Maximum item sizes (# threads for each dim) " << _properties[i].work_item_size[0] << " x " << _properties[i].work_item_size[1] << " x " << _properties[i].work_item_size[2] << std::endl; //out << " Maximum sizes of each dimension of a grid: " // << _properties[i].maxGridSize[0] << " x " // << _properties[i].maxGridSize[1] << " x " // << _properties[i].maxGridSize[2] << std::endl; //out << " Maximum memory pitch: " // << _properties[i].memPitch) << " bytes\n"; //out << " Texture alignment: " // << _properties[i].textureAlignment << " bytes\n"; out << " Clock rate: " << clock_rate(i) << " GHz\n"; //out << " Concurrent copy and execution: "; out << " ECC support: "; if (_properties[i].ecc_support) out << "Yes\n"; else out << "No\n"; out << " Device fission into equal partitions: "; if (fission_equal(i)) out << "Yes\n"; else out << "No\n"; out << " Device fission by counts: "; if (fission_by_counts(i)) out << "Yes\n"; else out << "No\n"; out << " Device fission by affinity: "; if (fission_by_affinity(i)) out << "Yes\n"; else out << "No\n"; out << " Maximum subdevices from fission: " << max_sub_devices(i) << std::endl; } } } #endif diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 641087a6c..18bcf3c10 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -1,486 +1,488 @@ /*************************************************************************** base_ellipsoid.cpp ------------------- W. 
Michael Brown (ORNL) Base class for acceleration of ellipsoid potentials __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : Thu May 5 2011 email : brownw@ornl.gov ***************************************************************************/ #include "lal_base_ellipsoid.h" #include <cstdlib> using namespace LAMMPS_AL; #if defined(USE_OPENCL) #include "ellipsoid_nbor_cl.h" #elif defined(USE_CUDART) const char *ellipsoid_nbor=0; #else #include "ellipsoid_nbor_cubin.h" #endif #define BaseEllipsoidT BaseEllipsoid<numtyp, acctyp> extern Device<PRECISION,ACC_PRECISION> global_device; template <class numtyp, class acctyp> BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { device=&global_device; ans=new Answer<numtyp,acctyp>(); nbor=new Neighbor(); } template <class numtyp, class acctyp> BaseEllipsoidT::~BaseEllipsoid() { delete ans; delete nbor; } template <class numtyp, class acctyp> int BaseEllipsoidT::bytes_per_atom(const int max_nbors) const { return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ nbor->bytes_per_atom(max_nbors); } template <class numtyp, class acctyp> int BaseEllipsoidT::init_base(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const int ntypes, int **h_form, const void *ellipsoid_program, const void *lj_program, const char *k_name, const bool ellip_sphere) { screen=_screen; _ellipsoid_sphere=ellip_sphere; int gpu_nbor=0; if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH) gpu_nbor=1; else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH) gpu_nbor=2; int _gpu_host=0; int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; _threads_per_atom=device->threads_per_atom(); int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, maxspecial,_gpu_host,max_nbors,cell_size,true, 1); if (success!=0) return success; ucl_device=device->gpu; atom=&device->atom; _block_size=device->block_ellipse(); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_lj.init(*ucl_device); time_nbor1.init(*ucl_device); time_ellipsoid.init(*ucl_device); time_nbor2.init(*ucl_device); time_ellipsoid2.init(*ucl_device); time_nbor3.init(*ucl_device); time_ellipsoid3.init(*ucl_device); time_lj.zero(); time_nbor1.zero(); time_ellipsoid.zero(); time_nbor2.zero(); time_ellipsoid2.zero(); time_nbor3.zero(); time_ellipsoid3.zero(); // See if we want fast GB-sphere or sphere-sphere calculations _host_form=h_form; _multiple_forms=false; for (int i=1; i<ntypes; i++) for (int j=i; j<ntypes; j++) if (_host_form[i][j]!=ELLIPSE_ELLIPSE) _multiple_forms=true; if (_multiple_forms && host_nlocal>0) return -8; if (_multiple_forms && gpu_nbor!=0) return -9; if (_multiple_forms) ans->force.zero(); // Memory for ilist ordered by particle type if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) return -3; _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); neigh_tex.bind_float(atom->x,4); pos_tex.bind_float(atom->x,4); quat_tex.bind_float(atom->quat,4); lj_pos_tex.bind_float(atom->x,4); lj_quat_tex.bind_float(atom->quat,4); return 0; } template <class numtyp, class acctyp> void 
BaseEllipsoidT::estimate_gpu_overhead() { device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); } template <class numtyp, class acctyp> void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); host_olist.clear(); if (_compiled) { k_nbor_fast.clear(); k_nbor.clear(); k_ellipsoid.clear(); k_ellipsoid_sphere.clear(); k_sphere_ellipsoid.clear(); k_lj_fast.clear(); k_lj.clear(); delete nbor_program; delete ellipsoid_program; delete lj_program; _compiled=false; } time_nbor1.clear(); time_ellipsoid.clear(); time_nbor2.clear(); time_ellipsoid2.clear(); time_nbor3.clear(); time_ellipsoid3.clear(); time_lj.clear(); hd_balancer.clear(); nbor->clear(); ans->clear(); device->clear(); } template <class numtyp, class acctyp> void BaseEllipsoidT::output_times() { // Output any timing information acc_timers(); double single[10], times[10]; single[0]=atom->transfer_time()+ans->transfer_time(); single[1]=nbor->time_nbor.total_seconds()+nbor->time_hybrid1.total_seconds()+ nbor->time_hybrid2.total_seconds(); single[2]=time_nbor1.total_seconds()+time_nbor2.total_seconds()+ time_nbor3.total_seconds()+nbor->time_nbor.total_seconds(); single[3]=time_ellipsoid.total_seconds()+time_ellipsoid2.total_seconds()+ time_ellipsoid3.total_seconds(); if (_multiple_forms) single[4]=time_lj.total_seconds(); else single[4]=0; single[5]=atom->cast_time()+ans->cast_time(); single[6]=_gpu_overhead; single[7]=_driver_overhead; single[8]=ans->cpu_idle_time(); single[9]=nbor->bin_time(); MPI_Reduce(single,times,10,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=atom->max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); double max_mb=mpi_max_bytes/(1024*1024); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; if (device->replica_me()==0) if (screen && times[5]>0.0) { int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1) { + if (device->procs_per_gpu()==1 && t_time>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } if (nbor->gpu_nbor()==2) fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); - fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); + if (times[6]>0) + fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } _max_bytes=0.0; } // 
--------------------------------------------------------------------------- // Pack neighbors to limit thread divergence for lj-lj and ellipse // --------------------------------------------------------------------------- template<class numtyp, class acctyp> void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, const int inum, const int form_low, const int form_high, const bool shared_types, int ntypes) { int stride=nbor->nbor_pitch(); if (shared_types) { k_nbor_fast.set_size(GX,BX); k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, &inum, &nbor->dev_packed, &form_low, &form_high); } else { k_nbor.set_size(GX,BX); k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, &start, &inum, &nbor->dev_packed, &form_low, &form_high); } } // --------------------------------------------------------------------------- // Copy neighbor list from host // --------------------------------------------------------------------------- template <class numtyp, class acctyp> void BaseEllipsoidT::reset_nbors(const int nall, const int inum, const int osize, int *ilist, int *numj, int *type, int **firstneigh, bool &success) { success=true; int mn=nbor->max_nbor_loop(osize,numj,ilist); resize_atom(nall,success); resize_local(inum,0,mn,osize,success); if (!success) return; if (_multiple_forms) { int p=0; for (int i=0; i<osize; i++) { int itype=type[ilist[i]]; if (_host_form[itype][itype]==ELLIPSE_ELLIPSE) { host_olist[p]=ilist[i]; p++; } } _max_last_ellipse=p; _last_ellipse=std::min(inum,_max_last_ellipse); for (int i=0; i<osize; i++) { int itype=type[ilist[i]]; if (_host_form[itype][itype]!=ELLIPSE_ELLIPSE) { host_olist[p]=ilist[i]; p++; } } nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); return; } _last_ellipse=inum; _max_last_ellipse=inum; nbor->get_host(inum,ilist,numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } // --------------------------------------------------------------------------- // Build neighbor list on device // --------------------------------------------------------------------------- template <class numtyp, class acctyp> inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success) { success=true; resize_atom(nall,success); resize_local(inum,host_inum,nbor->max_nbors(),0,success); if (!success) return; atom->cast_copy_x(host_x,host_type); int mn; nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); nbor->copy_unpacked(inum,mn); _last_ellipse=inum; _max_last_ellipse=inum; double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. 
// --------------------------------------------------------------------------- template <class numtyp, class acctyp> int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { acc_timers(); if (inum_full==0) { host_start=0; zero_timers(); return NULL; } int ago=hd_balancer.ago_first(f_ago); int inum=hd_balancer.balance(ago,inum_full,cpu_time); ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; if (ago==0) { reset_nbors(nall, inum, inum_full, ilist, numj, host_type, firstneigh, success); if (!success) return NULL; } int *list; if (_multiple_forms) list=host_olist.begin(); else list=ilist; atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); atom->add_quat_data(); loop(eflag,vflag); ans->copy_answers(eflag,vflag,eatom,vatom,list); device->add_ans_object(ans); hd_balancer.stop_timer(); return list; } // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template <class numtyp, class acctyp> int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { acc_timers(); if (inum_full==0) { host_start=0; zero_timers(); return NULL; } hd_balancer.balance(cpu_time); int inum=hd_balancer.get_gpu_count(ago,inum_full); ans->inum(inum); _last_ellipse=std::min(inum,_max_last_ellipse); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_quat_data(host_quat[0]); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } atom->add_quat_data(); *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); loop(eflag,vflag); ans->copy_answers(eflag,vflag,eatom,vatom); device->add_ans_object(ans); hd_balancer.stop_timer(); return nbor->host_jlist.begin()-host_start; } template <class numtyp, class acctyp> double BaseEllipsoidT::host_memory_usage_base() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ 4*sizeof(numtyp)+sizeof(BaseEllipsoid<numtyp,acctyp>); } template <class numtyp, class acctyp> void BaseEllipsoidT::compile_kernels(UCL_Device &dev, const void *ellipsoid_string, const void *lj_string, const char *kname, const bool e_s) { if (_compiled) return; std::string kns=kname; std::string s_sphere_ellipsoid=kns+"_sphere_ellipsoid"; std::string s_ellipsoid_sphere=kns+"_ellipsoid_sphere"; std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; std::string flags=device->compile_string(); nbor_program=new UCL_Program(dev); nbor_program->load_string(ellipsoid_nbor,flags.c_str()); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); 
k_nbor.set_function(*nbor_program,"kernel_nbor"); neigh_tex.get_texture(*nbor_program,"pos_tex"); ellipsoid_program=new UCL_Program(dev); ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); k_ellipsoid.set_function(*ellipsoid_program,kname); pos_tex.get_texture(*ellipsoid_program,"pos_tex"); quat_tex.get_texture(*ellipsoid_program,"quat_tex"); lj_program=new UCL_Program(dev); lj_program->load_string(lj_string,flags.c_str()); k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); k_lj.set_function(*lj_program,s_lj.c_str()); if (e_s) k_ellipsoid_sphere.set_function(*lj_program,s_ellipsoid_sphere.c_str()); lj_pos_tex.get_texture(*lj_program,"pos_tex"); lj_quat_tex.get_texture(*lj_program,"quat_tex"); _compiled=true; } template class BaseEllipsoid<PRECISION,ACC_PRECISION>; diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index 1a736bf02..314854760 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** beck_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to beck acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_beck.h" using namespace std; using namespace LAMMPS_AL; static Beck<PRECISION,ACC_PRECISION> BLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int beck_gpu_init(const int ntypes, double **cutsq, double **aa, double **alpha, double **beta, double **AA, double **BB, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BLMF.clear(); gpu_mode=BLMF.device->gpu_mode(); double gpu_split=BLMF.device->particle_split(); int first_gpu=BLMF.device->first_device(); int last_gpu=BLMF.device->last_device(); int world_me=BLMF.device->world_me(); int gpu_rank=BLMF.device->gpu_rank(); int procs_per_gpu=BLMF.device->procs_per_gpu(); BLMF.device->init_message(screen,"beck",first_gpu,last_gpu); bool message=false; if (BLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, 
AA, BB, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BLMF.estimate_gpu_overhead(); return init_ok; } void beck_gpu_clear() { BLMF.clear(); } int ** beck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void beck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { BLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double beck_gpu_bytes() { return BLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index e8ac4eff0..3d8810fca 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -1,132 +1,132 @@ /*************************************************************************** born_coul_long_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to born/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_born_coul_long.h" using namespace std; using namespace LAMMPS_AL; static BornCoulLong<PRECISION,ACC_PRECISION> BORNCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { BORNCLMF.clear(); gpu_mode=BORNCLMF.device->gpu_mode(); double gpu_split=BORNCLMF.device->particle_split(); int first_gpu=BORNCLMF.device->first_device(); int last_gpu=BORNCLMF.device->last_device(); int world_me=BORNCLMF.device->world_me(); int gpu_rank=BORNCLMF.device->gpu_rank(); int procs_per_gpu=BORNCLMF.device->procs_per_gpu(); BORNCLMF.device->init_message(screen,"born/coul/long",first_gpu,last_gpu); bool message=false; if (BORNCLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and 
compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BORNCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BORNCLMF.estimate_gpu_overhead(); return init_ok; } void borncl_gpu_clear() { BORNCLMF.clear(); } int** borncl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BORNCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void borncl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BORNCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double borncl_gpu_bytes() { return BORNCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index 3e779d099..bffab31d1 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -1,134 +1,134 @@ /*************************************************************************** born_coul_wolf_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to born/coul/wolf acceleration routines. 
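compile_kernels() earlier in this patch loads the device source from an in-memory string and then binds each kernel by name through the UCL_Program / UCL_Kernel wrappers. When the library is built against OpenCL, the underlying steps are roughly the ones in this sketch (error handling omitted, kernel source is a toy placeholder):

  #include <CL/cl.h>
  #include <cstdio>

  static const char *src =
    "__kernel void kernel_nbor(__global int *out) {"
    "  out[get_global_id(0)] = (int)get_global_id(0); }";

  int main() {
    cl_platform_id plat; cl_device_id dev;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);

    // Build a program from a source string, then look up a kernel by name:
    // the same two steps performed by UCL_Program::load_string() and
    // UCL_Kernel::set_function() above.
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, NULL);
    clBuildProgram(prog, 1, &dev, "", NULL, NULL);
    cl_kernel k = clCreateKernel(prog, "kernel_nbor", NULL);

    std::printf("kernel bound: %p\n", (void *)k);
    clReleaseKernel(k); clReleaseProgram(prog); clReleaseContext(ctx);
    return 0;
  }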
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_born_coul_wolf.h" using namespace std; using namespace LAMMPS_AL; static BornCoulWolf<PRECISION,ACC_PRECISION> BORNCWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double alf, const double e_shift, const double f_shift) { BORNCWMF.clear(); gpu_mode=BORNCWMF.device->gpu_mode(); double gpu_split=BORNCWMF.device->particle_split(); int first_gpu=BORNCWMF.device->first_device(); int last_gpu=BORNCWMF.device->last_device(); int world_me=BORNCWMF.device->world_me(); int gpu_rank=BORNCWMF.device->gpu_rank(); int procs_per_gpu=BORNCWMF.device->procs_per_gpu(); BORNCWMF.device->init_message(screen,"born/coul/wolf",first_gpu,last_gpu); bool message=false; if (BORNCWMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); BORNCWMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BORNCWMF.estimate_gpu_overhead(); return init_ok; } void borncw_gpu_clear() { BORNCWMF.clear(); } int** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const 
double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BORNCWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void borncw_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BORNCWMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double borncw_gpu_bytes() { return BORNCWMF.host_memory_usage(); } diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 7785353a8..a3fcfad4e 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -1,124 +1,124 @@ /*************************************************************************** born_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to born acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_born.h" using namespace std; using namespace LAMMPS_AL; static Born<PRECISION,ACC_PRECISION> BORNMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_born1, double **host_born2, double **host_born3, double **host_a, double **host_c, double **host_d, double **sigma, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BORNMF.clear(); gpu_mode=BORNMF.device->gpu_mode(); double gpu_split=BORNMF.device->particle_split(); int first_gpu=BORNMF.device->first_device(); int last_gpu=BORNMF.device->last_device(); int world_me=BORNMF.device->world_me(); int gpu_rank=BORNMF.device->gpu_rank(); int procs_per_gpu=BORNMF.device->procs_per_gpu(); BORNMF.device->init_message(screen,"born",first_gpu,last_gpu); bool message=false; if (BORNMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + 
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BORNMF.estimate_gpu_overhead(); return init_ok; } void born_gpu_clear() { BORNMF.clear(); } int ** born_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BORNMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void born_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { BORNMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double born_gpu_bytes() { return BORNMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index ac3e6b891..be5dc718d 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -1,131 +1,131 @@ /*************************************************************************** buck_coul_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck/coul/cut acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck_coul.h" using namespace std; using namespace LAMMPS_AL; static BuckCoul<PRECISION,ACC_PRECISION> BUCKCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { BUCKCMF.clear(); gpu_mode=BUCKCMF.device->gpu_mode(); double gpu_split=BUCKCMF.device->particle_split(); int first_gpu=BUCKCMF.device->first_device(); int last_gpu=BUCKCMF.device->last_device(); int world_me=BUCKCMF.device->world_me(); int gpu_rank=BUCKCMF.device->gpu_rank(); int procs_per_gpu=BUCKCMF.device->procs_per_gpu(); BUCKCMF.device->init_message(screen,"buck",first_gpu,last_gpu); bool message=false; if (BUCKCMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); BUCKCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKCMF.estimate_gpu_overhead(); return init_ok; } void buckc_gpu_clear() { BUCKCMF.clear(); } int ** buckc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BUCKCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, 
host_q, boxlo, prd); } void buckc_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BUCKCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double buckc_gpu_bytes() { return BUCKCMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index d9328a921..9f614ffb1 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -1,130 +1,130 @@ /*************************************************************************** buck_coul_long_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck_coul_long.h" using namespace std; using namespace LAMMPS_AL; static BuckCoulLong<PRECISION,ACC_PRECISION> BUCKCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { BUCKCLMF.clear(); gpu_mode=BUCKCLMF.device->gpu_mode(); double gpu_split=BUCKCLMF.device->particle_split(); int first_gpu=BUCKCLMF.device->first_device(); int last_gpu=BUCKCLMF.device->last_device(); int world_me=BUCKCLMF.device->world_me(); int gpu_rank=BUCKCLMF.device->gpu_rank(); int procs_per_gpu=BUCKCLMF.device->procs_per_gpu(); BUCKCLMF.device->init_message(screen,"buck/coul/long",first_gpu,last_gpu); bool message=false; if (BUCKCLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core 
%d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); BUCKCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKCLMF.estimate_gpu_overhead(); return init_ok; } void buckcl_gpu_clear() { BUCKCLMF.clear(); } int** buckcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return BUCKCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void buckcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { BUCKCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double buckcl_gpu_bytes() { return BUCKCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index 9f7f725ae..4b25e7356 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** buck_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to buck acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_buck.h" using namespace std; using namespace LAMMPS_AL; static Buck<PRECISION,ACC_PRECISION> BUCKMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, double **host_buck1, double **host_buck2, double **host_a, double **host_c, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { BUCKMF.clear(); gpu_mode=BUCKMF.device->gpu_mode(); double gpu_split=BUCKMF.device->particle_split(); int first_gpu=BUCKMF.device->first_device(); int last_gpu=BUCKMF.device->last_device(); int world_me=BUCKMF.device->world_me(); int gpu_rank=BUCKMF.device->gpu_rank(); int procs_per_gpu=BUCKMF.device->procs_per_gpu(); BUCKMF.device->init_message(screen,"buck",first_gpu,last_gpu); bool message=false; if (BUCKMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, host_a, host_c, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) BUCKMF.estimate_gpu_overhead(); return init_ok; } void buck_gpu_clear() { BUCKMF.clear(); } int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return BUCKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void buck_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { 
BUCKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double buck_gpu_bytes() { return BUCKMF.host_memory_usage(); } diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_cg_cmm_ext.cpp index 6d64c3043..0100ac890 100644 --- a/lib/gpu/lal_cg_cmm_ext.cpp +++ b/lib/gpu/lal_cg_cmm_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** cg_cmm.h ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/sdk pair acceleration routines __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_cg_cmm.h" using namespace std; using namespace LAMMPS_AL; static CGCMM<PRECISION,ACC_PRECISION> CMMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { CMMMF.clear(); gpu_mode=CMMMF.device->gpu_mode(); double gpu_split=CMMMF.device->particle_split(); int first_gpu=CMMMF.device->first_device(); int last_gpu=CMMMF.device->last_device(); int world_me=CMMMF.device->world_me(); int gpu_rank=CMMMF.device->gpu_rank(); int procs_per_gpu=CMMMF.device->procs_per_gpu(); CMMMF.device->init_message(screen,"lj/sdk",first_gpu,last_gpu); bool message=false; if (CMMMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CMMMF.estimate_gpu_overhead(); return init_ok; } void cmm_gpu_clear() { CMMMF.clear(); } int** cmm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int 
&host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void cmm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double cmm_gpu_bytes() { return CMMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_cg_cmm_long_ext.cpp index ca7aab70c..9197e75e0 100644 --- a/lib/gpu/lal_cg_cmm_long_ext.cpp +++ b/lib/gpu/lal_cg_cmm_long_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** cg_cmm_long.h ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/sdk/coul/long acceleration functions __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_cg_cmm_long.h" using namespace std; using namespace LAMMPS_AL; static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { CMMLMF.clear(); gpu_mode=CMMLMF.device->gpu_mode(); double gpu_split=CMMLMF.device->particle_split(); int first_gpu=CMMLMF.device->first_device(); int last_gpu=CMMLMF.device->last_device(); int world_me=CMMLMF.device->world_me(); int gpu_rank=CMMLMF.device->gpu_rank(); int procs_per_gpu=CMMLMF.device->procs_per_gpu(); CMMLMF.device->init_message(screen,"lj/sdk/coul/long",first_gpu,last_gpu); bool message=false; if (CMMLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); CMMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core 
%d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CMMLMF.estimate_gpu_overhead(); return init_ok; } void cmml_gpu_clear() { CMMLMF.clear(); } int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q,boxlo,prd); } void cmml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double cmml_gpu_bytes() { return CMMLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 5d544dc87..8f359e361 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -1,135 +1,135 @@ /*************************************************************************** charmm_long_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to charmm/coul/long acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_charmm_long.h" using namespace std; using namespace LAMMPS_AL; static CHARMMLong<PRECISION,ACC_PRECISION> CRMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald, const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic) { CRMLMF.clear(); gpu_mode=CRMLMF.device->gpu_mode(); double gpu_split=CRMLMF.device->particle_split(); int first_gpu=CRMLMF.device->first_device(); int last_gpu=CRMLMF.device->last_device(); int world_me=CRMLMF.device->world_me(); int gpu_rank=CRMLMF.device->gpu_rank(); int procs_per_gpu=CRMLMF.device->procs_per_gpu(); CRMLMF.device->init_message(screen,"lj/charmm/coul/long",first_gpu,last_gpu); bool message=false; if (CRMLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,sigma,mix_arithmetic); CRMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, sigma, mix_arithmetic); CRMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CRMLMF.estimate_gpu_overhead(); return init_ok; } void crml_gpu_clear() { CRMLMF.clear(); } int** crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double 
*host_q, double *boxlo, double *prd) { return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void crml_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double crml_gpu_bytes() { return CRMLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index d4e8a2092..50c95c1e2 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -1,127 +1,127 @@ /*************************************************************************** colloid_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to colloid acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_colloid.h" using namespace std; using namespace LAMMPS_AL; static Colloid<PRECISION,ACC_PRECISION> COLLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double **host_a12, double **host_a1, double **host_a2, double **host_d1, double **host_d2, double **host_sigma3, double **host_sigma6, int **host_form, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { COLLMF.clear(); gpu_mode=COLLMF.device->gpu_mode(); double gpu_split=COLLMF.device->particle_split(); int first_gpu=COLLMF.device->first_device(); int last_gpu=COLLMF.device->last_device(); int world_me=COLLMF.device->world_me(); int gpu_rank=COLLMF.device->gpu_rank(); int procs_per_gpu=COLLMF.device->procs_per_gpu(); COLLMF.device->init_message(screen,"colloid",first_gpu,last_gpu); bool message=false; if (COLLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core 
%d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) COLLMF.estimate_gpu_overhead(); return init_ok; } void colloid_gpu_clear() { COLLMF.clear(); } int ** colloid_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return COLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void colloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { COLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double colloid_gpu_bytes() { return COLLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_coul_dsf_ext.cpp b/lib/gpu/lal_coul_dsf_ext.cpp index 19879cb9c..851df0f53 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -1,125 +1,125 @@ /*************************************************************************** coul_dsf_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to coul/dsf acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : 8/15/2012 email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_coul_dsf.h" using namespace std; using namespace LAMMPS_AL; static CoulDSF<PRECISION,ACC_PRECISION> CDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cdsf_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { CDMF.clear(); gpu_mode=CDMF.device->gpu_mode(); double gpu_split=CDMF.device->particle_split(); int first_gpu=CDMF.device->first_device(); int last_gpu=CDMF.device->last_device(); int world_me=CDMF.device->world_me(); int gpu_rank=CDMF.device->gpu_rank(); int procs_per_gpu=CDMF.device->procs_per_gpu(); CDMF.device->init_message(screen,"coul/dsf",first_gpu,last_gpu); bool message=false; if (CDMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); CDMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CDMF.estimate_gpu_overhead(); return init_ok; } void cdsf_gpu_clear() { CDMF.clear(); } int** cdsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return CDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void cdsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool 
&success, double *host_q, const int nlocal, double *boxlo, double *prd) { CDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double cdsf_gpu_bytes() { return CDMF.host_memory_usage(); } diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index f6ce0c1d7..156de0be0 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -1,123 +1,123 @@ /*************************************************************************** coul_long_ext.cpp ------------------- Axel Kohlmeyer (Temple) Functions for LAMMPS access to coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : July 2011 email : a.kohlmeyer@temple.edu ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_coul_long.h" using namespace std; using namespace LAMMPS_AL; static CoulLong<PRECISION,ACC_PRECISION> CLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int cl_gpu_init(const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { CLMF.clear(); gpu_mode=CLMF.device->gpu_mode(); double gpu_split=CLMF.device->particle_split(); int first_gpu=CLMF.device->first_device(); int last_gpu=CLMF.device->last_device(); int world_me=CLMF.device->world_me(); int gpu_rank=CLMF.device->gpu_rank(); int procs_per_gpu=CLMF.device->procs_per_gpu(); CLMF.device->init_message(screen,"coul/long",first_gpu,last_gpu); bool message=false; if (CLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=CLMF.init(inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); CLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) CLMF.estimate_gpu_overhead(); return init_ok; } void cl_gpu_clear() { CLMF.clear(); } int** cl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int 
&host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void cl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double cl_gpu_bytes() { return CLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 99e259213..e3f1037ba 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -1,720 +1,729 @@ /*************************************************************************** device.cpp ------------------- W. Michael Brown (ORNL) Class for management of the device where the computations are performed __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include "lal_device.h" #include "lal_precision.h" #include <map> #include <math.h> #ifdef _OPENMP #include <omp.h> #endif #if defined(USE_OPENCL) #include "device_cl.h" #elif defined(USE_CUDART) const char *device=0; #else #include "device_cubin.h" #endif using namespace LAMMPS_AL; #define DeviceT Device<numtyp, acctyp> template <class numtyp, class acctyp> DeviceT::Device() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), _last_device(0), _compiled(false) { } template <class numtyp, class acctyp> DeviceT::~Device() { clear_device(); } template <class numtyp, class acctyp> int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, const double p_split, const int nthreads, const int t_per_atom, const double cell_size, char *ocl_vendor) { _nthreads=nthreads; #ifdef _OPENMP omp_set_num_threads(nthreads); #endif _threads_per_atom=t_per_atom; _threads_per_charge=t_per_atom; if (_device_init) return 0; _device_init=true; _comm_world=replica; //world; _comm_replica=replica; _first_device=first_gpu; _last_device=last_gpu; _gpu_mode=gpu_mode; _particle_split=p_split; _cell_size=cell_size; // Get the rank/size within the world MPI_Comm_rank(_comm_world,&_world_me); MPI_Comm_size(_comm_world,&_world_size); // Get the rank/size within the replica MPI_Comm_rank(_comm_replica,&_replica_me); MPI_Comm_size(_comm_replica,&_replica_size); // Get the names of all nodes int name_length; char node_name[MPI_MAX_PROCESSOR_NAME]; char node_names[MPI_MAX_PROCESSOR_NAME*_world_size]; MPI_Get_processor_name(node_name,&name_length); MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names, MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world); std::string node_string=std::string(node_name); // Get the number of procs per node std::map<std::string,int> name_map; std::map<std::string,int>::iterator np; for (int i=0; i<_world_size; i++) { std::string 
i_string=std::string(&node_names[i*MPI_MAX_PROCESSOR_NAME]); np=name_map.find(i_string); if (np==name_map.end()) name_map[i_string]=1; else np->second++; } int procs_per_node=name_map.begin()->second; // Assign a unique id to each node int split_num=0, split_id=0; for (np=name_map.begin(); np!=name_map.end(); ++np) { if (np->first==node_string) split_id=split_num; split_num++; } // Set up a per node communicator and find rank within MPI_Comm node_comm; MPI_Comm_split(_comm_world, split_id, 0, &node_comm); int node_rank; MPI_Comm_rank(node_comm,&node_rank); // set the device ID _procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/ (last_gpu-first_gpu+1))); int my_gpu=node_rank/_procs_per_gpu+first_gpu; // Time on the device only if 1 proc per gpu _time_device=true; if (_procs_per_gpu>1) _time_device=false; // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) return -2; #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; gpu->push_command_queue(); gpu->set_command_queue(1); _long_range_precompute=0; if (set_ocl_params(ocl_vendor)!=0) return -11; int flag=0; for (int i=0; i<_procs_per_gpu; i++) { if (_gpu_rank==i) flag=compile_kernels(); gpu_barrier(); } return flag; } template <class numtyp, class acctyp> int DeviceT::set_ocl_params(char *ocl_vendor) { #ifdef USE_OPENCL std::string s_vendor=OCL_DEFAULT_VENDOR; if (ocl_vendor!=NULL) s_vendor=ocl_vendor; if (s_vendor=="none") s_vendor="generic"; if (s_vendor=="kepler") { _ocl_vendor_name="NVIDIA Kepler"; #if defined (__APPLE__) || defined(MACOSX) _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; #else _ocl_vendor_string="-DKEPLER_OCL"; #endif } else if (s_vendor=="fermi") { _ocl_vendor_name="NVIDIA Fermi"; _ocl_vendor_string="-DFERMI_OCL"; } else if (s_vendor=="cypress") { _ocl_vendor_name="AMD Cypress"; _ocl_vendor_string="-DCYPRESS_OCL"; + } else if (s_vendor=="phi") { + _ocl_vendor_name="Intel Phi"; + _ocl_vendor_string="-DPHI_OCL"; + } else if (s_vendor=="intel") { + _ocl_vendor_name="Intel CPU"; + _ocl_vendor_string="-DINTEL_OCL"; } else if (s_vendor=="generic") { _ocl_vendor_name="GENERIC"; _ocl_vendor_string="-DGENERIC_OCL"; } else { _ocl_vendor_name="CUSTOM"; _ocl_vendor_string="-DUSE_OPENCL"; int token_count=0; std::string params[13]; char *pch = strtok(ocl_vendor,"\" "); while (pch != NULL) { if (token_count==13) return -11; params[token_count]=pch; token_count++; pch = strtok(NULL,"\" "); } _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ " -DTHREADS_PER_ATOM="+params[1]+ " -DTHREADS_PER_CHARGE="+params[2]+ " -DBLOCK_PAIR="+params[3]+ " -DMAX_SHARED_TYPES="+params[4]+ " -DBLOCK_NBOR_BUILD="+params[5]+ " -DBLOCK_BIO_PAIR="+params[6]+ " -DBLOCK_ELLIPSE="+params[7]+ " -DWARP_SIZE="+params[8]+ " -DPPPM_BLOCK_1D="+params[9]+ " -DBLOCK_CELL_2D="+params[10]+ " -DBLOCK_CELL_ID="+params[11]+ " -DMAX_BIO_SHARED_TYPES="+params[12]; } _ocl_compile_string="-cl-fast-relaxed-math -cl-mad-enable "+ std::string(OCL_PRECISION_COMPILE)+" "+_ocl_vendor_string; #endif return 0; } template <class numtyp, class acctyp> int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot, const int nlocal, const int host_nlocal, const int nall, Neighbor *nbor, const int maxspecial, const int gpu_host, const int max_nbors, const double cell_size, const bool pre_cut, const int threads_per_atom) { if (!_device_init) 
return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; // Counts of data transfers for timing overhead estimates _data_in_estimate=0; _data_out_estimate=1; // Initial number of local particles int ef_nlocal=nlocal; if (_particle_split<1.0 && _particle_split>0.0) ef_nlocal=static_cast<int>(_particle_split*nlocal); int gpu_nbor=0; if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH) gpu_nbor=1; else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH) gpu_nbor=2; #ifndef USE_CUDPP if (gpu_nbor==1) gpu_nbor=2; #endif if (_init_count==0) { // Initialize atom and nbor data if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0)) return -3; _data_in_estimate++; if (charge) _data_in_estimate++; if (rot) _data_in_estimate++; } else { if (atom.charge()==false && charge) _data_in_estimate++; if (atom.quaternion()==false && rot) _data_in_estimate++; if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial)) return -3; } if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, _warp_size, _time_device, compile_string())) return -3; if (_cell_size<0.0) nbor->cell_size(cell_size,cell_size); else nbor->cell_size(_cell_size,cell_size); _init_count++; return 0; } template <class numtyp, class acctyp> int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) return -5; if (_init_count==0) { // Initialize atom and nbor data if (!atom.init(nall,true,false,*gpu,false,false)) return -3; } else if (!atom.add_fields(true,false,false,false)) return -3; if (!ans.init(nlocal,true,false,*gpu)) return -3; _init_count++; return 0; } template <class numtyp, class acctyp> void DeviceT::set_single_precompute (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) { _long_range_precompute=1; pppm_single=pppm; } template <class numtyp, class acctyp> void DeviceT::set_double_precompute (PPPM<numtyp,acctyp,double,_lgpu_double4> *pppm) { _long_range_precompute=2; pppm_double=pppm; } template <class numtyp, class acctyp> void DeviceT::init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu) { #if defined(USE_OPENCL) std::string fs=""; #elif defined(USE_CUDART) std::string fs=""; #else std::string fs=toa(gpu->free_gigabytes())+"/"; #endif if (_replica_me == 0 && screen) { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); - fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); + fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); #ifdef _OPENMP fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); #endif #ifdef USE_OPENCL fprintf(screen,"- with OpenCL Parameters for: %s\n", _ocl_vendor_name.c_str()); #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); int last=last_gpu+1; if (last>gpu->num_devices()) last=gpu->num_devices(); for (int i=first_gpu; i<last; i++) { std::string sname; if (i==first_gpu) - sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+" GHZ ("; else - sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ + 
sname=gpu->name(i)+", "+toa(gpu->cus(i))+" CUs, "+fs+ toa(gpu->clock_rate(i))+" GHZ ("; if (sizeof(PRECISION)==4) { if (sizeof(ACC_PRECISION)==4) sname+="Single Precision)"; else sname+="Mixed Precision)"; } else sname+="Double Precision)"; - fprintf(screen,"GPU %d: %s\n",i,sname.c_str()); + fprintf(screen,"Device %d: %s\n",i,sname.c_str()); } fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n\n"); } } template <class numtyp, class acctyp> void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL; UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; UCL_Timer over_timer(*gpu); if (_data_in_estimate>0) { host_data_in=new UCL_H_Vec<int>[_data_in_estimate]; dev_data_in=new UCL_D_Vec<int>[_data_in_estimate]; timers_in=new UCL_Timer[_data_in_estimate]; } if (_data_out_estimate>0) { host_data_out=new UCL_H_Vec<int>[_data_out_estimate]; dev_data_out=new UCL_D_Vec<int>[_data_out_estimate]; timers_out=new UCL_Timer[_data_out_estimate]; } if (kernel_calls>0) { kernel_data=new UCL_D_Vec<int>[kernel_calls]; timers_kernel=new UCL_Timer[kernel_calls]; } for (int i=0; i<_data_in_estimate; i++) { host_data_in[i].alloc(1,*gpu); dev_data_in[i].alloc(1,*gpu); timers_in[i].init(*gpu); } for (int i=0; i<_data_out_estimate; i++) { host_data_out[i].alloc(1,*gpu); dev_data_out[i].alloc(1,*gpu); timers_out[i].init(*gpu); } for (int i=0; i<kernel_calls; i++) { kernel_data[i].alloc(1,*gpu); timers_kernel[i].init(*gpu); } gpu_overhead=0.0; gpu_driver_overhead=0.0; for (int i=0; i<10; i++) { gpu->sync(); gpu_barrier(); over_timer.start(); gpu->sync(); gpu_barrier(); double driver_time=MPI_Wtime(); for (int i=0; i<_data_in_estimate; i++) { timers_in[i].start(); ucl_copy(dev_data_in[i],host_data_in[i],true); timers_in[i].stop(); } for (int i=0; i<kernel_calls; i++) { timers_kernel[i].start(); zero(kernel_data[i],1); timers_kernel[i].stop(); } for (int i=0; i<_data_out_estimate; i++) { timers_out[i].start(); ucl_copy(host_data_out[i],dev_data_out[i],true); timers_out[i].stop(); } over_timer.stop(); double time=over_timer.seconds(); driver_time=MPI_Wtime()-driver_time; if (time_device()) { for (int i=0; i<_data_in_estimate; i++) timers_in[i].add_to_total(); for (int i=0; i<kernel_calls; i++) timers_kernel[i].add_to_total(); for (int i=0; i<_data_out_estimate; i++) timers_out[i].add_to_total(); } double mpi_time, mpi_driver_time; MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm()); gpu_overhead+=mpi_time; gpu_driver_overhead+=mpi_driver_time; } gpu_overhead/=10.0; gpu_driver_overhead/=10.0; if (_data_in_estimate>0) { delete [] host_data_in; delete [] dev_data_in; delete [] timers_in; } if (_data_out_estimate>0) { delete [] host_data_out; delete [] dev_data_out; delete [] timers_out; } if (kernel_calls>0) { delete [] kernel_data; delete [] timers_kernel; } } template <class numtyp, class acctyp> void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans, Neighbor &nbor, const double avg_split, const double max_bytes, const double gpu_overhead, const double driver_overhead, const int threads_per_atom, FILE *screen) { double single[9], times[9]; single[0]=atom.transfer_time()+ans.transfer_time(); 
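// [Editorial annotation; not part of the original source or of this patch.]
// Descriptive note on the timing slots filled just below: the nine entries of
// single[] are reduced (summed) with MPI_Reduce over the replica communicator
// and printed as per-process averages in the table that follows. From the
// assignments and the fprintf labels further down:
//   single[0] host/device data transfer     single[1] neighbor list copy
//   single[2] neighbor build or unpack       single[3] pair force kernel
//   single[4] data cast/pack                 single[5] device overhead
//   single[6] CPU driver overhead            single[7] CPU idle time
//   single[8] CPU neighbor binning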
single[1]=nbor.time_nbor.total_seconds()+nbor.time_hybrid1.total_seconds()+ nbor.time_hybrid2.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); single[4]=atom.cast_time()+ans.cast_time(); single[5]=gpu_overhead; single[6]=driver_overhead; single[7]=ans.cpu_idle_time(); single[8]=nbor.bin_time(); MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]; if (replica_me()==0) if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device()) { + if (time_device() && t_time>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } if (nbor.gpu_nbor()==2) fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); - fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); + if (times[5]>0) + fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } } template <class numtyp, class acctyp> void DeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, UCL_Timer &time_map, UCL_Timer &time_rho, UCL_Timer &time_interp, Answer<numtyp,acctyp> &ans, const double max_bytes, const double cpu_time, const double idle_time, FILE *screen) { double single[8], times[8]; single[0]=time_out.total_seconds(); single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); single[2]=time_map.total_seconds(); single[3]=time_rho.total_seconds(); single[4]=time_interp.total_seconds(); single[5]=ans.transfer_time()+ans.cast_time(); single[6]=cpu_time; single[7]=idle_time; MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); + double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; if (replica_me()==0) if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," GPU Time Info (average): "); + fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if 
(time_device()) { + if (time_device() && t_time>0) { fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Total rho: %.4f s.\n", (times[0]+times[2]+times[3])/_replica_size); fprintf(screen,"Total interp: %.4f s.\n", (times[1]+times[4])/_replica_size); fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); } fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); } } template <class numtyp, class acctyp> void DeviceT::clear() { if (_init_count>0) { _long_range_precompute=0; _init_count--; if (_init_count==0) { atom.clear(); _neighbor_shared.clear(); } } } template <class numtyp, class acctyp> void DeviceT::clear_device() { while (_init_count>0) clear(); if (_compiled) { k_zero.clear(); k_info.clear(); delete dev_program; _compiled=false; } if (_device_init) { delete gpu; _device_init=false; } } template <class numtyp, class acctyp> int DeviceT::compile_kernels() { int flag=0; if (_compiled) return flag; dev_program=new UCL_Program(*gpu); int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) return -4; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; UCL_Vector<int,int> gpu_lib_data(15,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); _ptx_arch=static_cast<double>(gpu_lib_data[0])/100.0; #ifndef USE_OPENCL if (_ptx_arch>gpu->arch() || floor(_ptx_arch)<floor(gpu->arch())) return -4; #endif _num_mem_threads=gpu_lib_data[1]; _warp_size=gpu_lib_data[2]; if (_threads_per_atom<1) _threads_per_atom=gpu_lib_data[3]; if (_threads_per_charge<1) _threads_per_charge=gpu_lib_data[13]; _pppm_max_spline=gpu_lib_data[4]; _pppm_block=gpu_lib_data[5]; _block_pair=gpu_lib_data[6]; _max_shared_types=gpu_lib_data[7]; _block_cell_2d=gpu_lib_data[8]; _block_cell_id=gpu_lib_data[9]; _block_nbor_build=gpu_lib_data[10]; _block_bio_pair=gpu_lib_data[11]; _max_bio_shared_types=gpu_lib_data[12]; _block_ellipse=gpu_lib_data[14]; if (static_cast<size_t>(_block_pair)>gpu->group_size()) _block_pair=gpu->group_size(); if (static_cast<size_t>(_block_bio_pair)>gpu->group_size()) _block_bio_pair=gpu->group_size(); if (_threads_per_atom>_warp_size) _threads_per_atom=_warp_size; if (_warp_size%_threads_per_atom!=0) _threads_per_atom=1; if (_threads_per_atom & (_threads_per_atom - 1)) _threads_per_atom=1; if (_threads_per_charge>_warp_size) _threads_per_charge=_warp_size; if (_warp_size%_threads_per_charge!=0) _threads_per_charge=1; if (_threads_per_charge & (_threads_per_charge - 1)) _threads_per_charge=1; return flag; } template <class numtyp, class acctyp> double DeviceT::host_memory_usage() const { return atom.host_memory_usage()+4*sizeof(numtyp)+ sizeof(Device<numtyp,acctyp>); } template class Device<PRECISION,ACC_PRECISION>; Device<PRECISION,ACC_PRECISION> global_device; int lmp_init_device(MPI_Comm world, MPI_Comm replica, const 
int first_gpu, const int last_gpu, const int gpu_mode, const double particle_split, const int nthreads, const int t_per_atom, const double cell_size, char *opencl_vendor) { return global_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, particle_split,nthreads,t_per_atom, cell_size,opencl_vendor); } void lmp_clear_device() { global_device.clear_device(); } double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 05d8fd9f7..48db22d99 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** dipole_lj_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to dipole/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_dipole_lj.h" using namespace std; using namespace LAMMPS_AL; static DipoleLJ<PRECISION,ACC_PRECISION> DPLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { DPLMF.clear(); gpu_mode=DPLMF.device->gpu_mode(); double gpu_split=DPLMF.device->particle_split(); int first_gpu=DPLMF.device->first_device(); int last_gpu=DPLMF.device->last_device(); int world_me=DPLMF.device->world_me(); int gpu_rank=DPLMF.device->gpu_rank(); int procs_per_gpu=DPLMF.device->procs_per_gpu(); DPLMF.device->init_message(screen,"dipole/cut",first_gpu,last_gpu); bool message=false; if (DPLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 
300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) DPLMF.estimate_gpu_overhead(); return init_ok; } void dpl_gpu_clear() { DPLMF.clear(); } int** dpl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); } void dpl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { DPLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, nlocal,boxlo,prd); } double dpl_gpu_bytes() { return DPLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 53ef66fca..ad2bf10b0 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** dipole_lj_sf_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to dipole/sf acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_dipole_lj_sf.h" using namespace std; using namespace LAMMPS_AL; static DipoleLJSF<PRECISION,ACC_PRECISION> DPLSFMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { DPLSFMF.clear(); gpu_mode=DPLSFMF.device->gpu_mode(); double gpu_split=DPLSFMF.device->particle_split(); int first_gpu=DPLSFMF.device->first_device(); int last_gpu=DPLSFMF.device->last_device(); int world_me=DPLSFMF.device->world_me(); int gpu_rank=DPLSFMF.device->gpu_rank(); int procs_per_gpu=DPLSFMF.device->procs_per_gpu(); DPLSFMF.device->init_message(screen,"dipole/sf",first_gpu,last_gpu); bool message=false; if (DPLSFMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); DPLSFMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) DPLSFMF.estimate_gpu_overhead(); return init_ok; } void dplsf_gpu_clear() { DPLSFMF.clear(); } int** dplsf_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { return DPLSFMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, host_mu, boxlo, prd); } void 
dplsf_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { DPLSFMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu, nlocal,boxlo,prd); } double dplsf_gpu_bytes() { return DPLSFMF.host_memory_usage(); } diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 687a04529..e603f9cff 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -1,143 +1,143 @@ /*************************************************************************** eam_ext.cpp ------------------- Trung Dac Nguyen, W. Michael Brown (ORNL) Functions for LAMMPS access to buck acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_eam.h" using namespace std; using namespace LAMMPS_AL; static EAM<PRECISION,ACC_PRECISION> EAMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int eam_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double rdr, double rdrho, double rhomax, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMMF.clear(); gpu_mode=EAMMF.device->gpu_mode(); double gpu_split=EAMMF.device->particle_split(); int first_gpu=EAMMF.device->first_device(); int last_gpu=EAMMF.device->last_device(); int world_me=EAMMF.device->world_me(); int gpu_rank=EAMMF.device->gpu_rank(); int procs_per_gpu=EAMMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; fp_size=sizeof(PRECISION); EAMMF.device->init_message(screen,"eam",first_gpu,last_gpu); bool message=false; if (EAMMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing 
Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) EAMMF.estimate_gpu_overhead(); return init_ok; } void eam_gpu_clear() { EAMMF.clear(); } int ** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { return EAMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); } void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { EAMMF.compute(ago,inum_full,nlocal,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, fp_ptr); } void eam_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { EAMMF.compute2(ilist, eflag, vflag, eatom, vatom); } double eam_gpu_bytes() { return EAMMF.host_memory_usage(); } diff --git a/lib/gpu/lal_eam_lj_ext.cpp b/lib/gpu/lal_eam_lj_ext.cpp index ea86a7be9..2719b5106 100644 --- a/lib/gpu/lal_eam_lj_ext.cpp +++ b/lib/gpu/lal_eam_lj_ext.cpp @@ -1,153 +1,153 @@ // ************************************************************************** // lal_eam_ext.cpp // ------------------- // W. 
Michael Brown, Trung Dac Nguyen (ORNL) // // Class for acceleration of the eam pair style // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov nguyentd@ornl.gov // ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_eam_lj.h" using namespace std; using namespace LAMMPS_AL; static EAMLJ<PRECISION,ACC_PRECISION> EAMLJMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int eam_lj_gpu_init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int **host_type2z2r, int *host_type2frho, double ***host_rhor_spline, double ***host_z2r_spline, double ***host_frho_spline, double **host_cutljsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double rdr, double rdrho, int nrhor, int nrho, int nz2r, int nfrho, int nr, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int &fp_size) { EAMLJMF.clear(); gpu_mode=EAMLJMF.device->gpu_mode(); double gpu_split=EAMLJMF.device->particle_split(); int first_gpu=EAMLJMF.device->first_device(); int last_gpu=EAMLJMF.device->last_device(); int world_me=EAMLJMF.device->world_me(); int gpu_rank=EAMLJMF.device->gpu_rank(); int procs_per_gpu=EAMLJMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; fp_size=sizeof(PRECISION); EAMLJMF.device->init_message(screen,"eam_lj",first_gpu,last_gpu); bool message=false; if (EAMLJMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=EAMLJMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, host_cutljsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMLJMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=EAMLJMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, host_cutljsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, rdr, rdrho, nrhor, nrho, nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, gpu_split, screen); EAMLJMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) EAMLJMF.estimate_gpu_overhead(); return init_ok; } void eam_lj_gpu_clear() { 
EAMLJMF.clear(); } int ** eam_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { return EAMLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, inum, fp_ptr); } void eam_lj_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { EAMLJMF.compute(ago,inum_full,nlocal,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, fp_ptr); } void eam_lj_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, const bool eatom, const bool vatom) { EAMLJMF.compute2(ilist, eflag, vflag, eatom, vatom); } double eam_lj_gpu_bytes() { return EAMLJMF.host_memory_usage(); } diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index 4d95c7cfb..8adb96855 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** gauss_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to gauss acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_gauss.h" using namespace std; using namespace LAMMPS_AL; static Gauss<PRECISION,ACC_PRECISION> GLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, double **host_b, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GLMF.clear(); gpu_mode=GLMF.device->gpu_mode(); double gpu_split=GLMF.device->particle_split(); int first_gpu=GLMF.device->first_device(); int last_gpu=GLMF.device->last_device(); int world_me=GLMF.device->world_me(); int gpu_rank=GLMF.device->gpu_rank(); int procs_per_gpu=GLMF.device->procs_per_gpu(); GLMF.device->init_message(screen,"gauss",first_gpu,last_gpu); bool message=false; if (GLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); GLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - 
fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) GLMF.estimate_gpu_overhead(); return init_ok; } void gauss_gpu_clear() { GLMF.clear(); } int ** gauss_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return GLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void gauss_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { GLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double gauss_gpu_bytes() { return GLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_gayberne_ext.cpp b/lib/gpu/lal_gayberne_ext.cpp index 047250363..a18f4bcd7 100644 --- a/lib/gpu/lal_gayberne_ext.cpp +++ b/lib/gpu/lal_gayberne_ext.cpp @@ -1,141 +1,141 @@ /*************************************************************************** gayberne_ext.cpp ------------------- W. 
Michael Brown LAMMPS Wrappers for Gay-Berne Acceleration __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_gayberne.h" using namespace std; using namespace LAMMPS_AL; static GayBerne<PRECISION,ACC_PRECISION> GBMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, const double mu, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, double *host_lshape, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); gpu_mode=GBMF.device->gpu_mode(); double gpu_split=GBMF.device->particle_split(); int first_gpu=GBMF.device->first_device(); int last_gpu=GBMF.device->last_device(); int world_me=GBMF.device->world_me(); int gpu_rank=GBMF.device->gpu_rank(); int procs_per_gpu=GBMF.device->procs_per_gpu(); GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu); bool message=false; if (GBMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, host_lshape, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, epsilon, host_lshape, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) GBMF.estimate_gpu_overhead(); return init_ok; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void gb_gpu_clear() { GBMF.clear(); } int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, 
double **host_quat); int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { return GBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } int * gb_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Return memory usage // --------------------------------------------------------------------------- double gb_gpu_bytes() { return GBMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index 4fe188057..ce3a5a87b 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** lj96_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj96/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj96.h" using namespace std; using namespace LAMMPS_AL; static LJ96<PRECISION,ACC_PRECISION> LJ96MF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJ96MF.clear(); gpu_mode=LJ96MF.device->gpu_mode(); double gpu_split=LJ96MF.device->particle_split(); int first_gpu=LJ96MF.device->first_device(); int last_gpu=LJ96MF.device->last_device(); int world_me=LJ96MF.device->world_me(); int gpu_rank=LJ96MF.device->gpu_rank(); int procs_per_gpu=LJ96MF.device->procs_per_gpu(); LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu); bool message=false; if (LJ96MF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int 
i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJ96MF.estimate_gpu_overhead(); return init_ok; } void lj96_gpu_clear() { LJ96MF.clear(); } int** lj96_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void lj96_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lj96_gpu_bytes() { return LJ96MF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index 7e637d4c9..6152dc27c 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** lj_class2_long_ext.cpp ------------------- W. 
Michael Brown LAMMPS Wrappers for COMMPASS LJ long Acceleration __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : Mon May 16 2011 email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_class2_long.h" using namespace std; using namespace LAMMPS_AL; static LJClass2Long<PRECISION,ACC_PRECISION> C2CLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { C2CLMF.clear(); gpu_mode=C2CLMF.device->gpu_mode(); double gpu_split=C2CLMF.device->particle_split(); int first_gpu=C2CLMF.device->first_device(); int last_gpu=C2CLMF.device->last_device(); int world_me=C2CLMF.device->world_me(); int gpu_rank=C2CLMF.device->gpu_rank(); int procs_per_gpu=C2CLMF.device->procs_per_gpu(); C2CLMF.device->init_message(screen,"lj/class2/coul/long",first_gpu,last_gpu); bool message=false; if (C2CLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); C2CLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); C2CLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) C2CLMF.estimate_gpu_overhead(); return init_ok; } void c2cl_gpu_clear() { C2CLMF.clear(); } int** c2cl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return C2CLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, 
eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void c2cl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { C2CLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double c2cl_gpu_bytes() { return C2CLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 3e5aab4d9..3658d8a5b 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** lj_coul_debye_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to lj/cut/coul/debye acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul_debye.h" using namespace std; using namespace LAMMPS_AL; static LJCoulDebye<PRECISION,ACC_PRECISION> LJCDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double kappa) { LJCDMF.clear(); gpu_mode=LJCDMF.device->gpu_mode(); double gpu_split=LJCDMF.device->particle_split(); int first_gpu=LJCDMF.device->first_device(); int last_gpu=LJCDMF.device->last_device(); int world_me=LJCDMF.device->world_me(); int gpu_rank=LJCDMF.device->gpu_rank(); int procs_per_gpu=LJCDMF.device->procs_per_gpu(); LJCDMF.device->init_message(screen,"lj/cut/coul/debye",first_gpu,last_gpu); bool message=false; if (LJCDMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); LJCDMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core 
%d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); LJCDMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCDMF.estimate_gpu_overhead(); return init_ok; } void ljcd_gpu_clear() { LJCDMF.clear(); } int** ljcd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljcd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double ljcd_gpu_bytes() { return LJCDMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index b0dec6f07..1faf52899 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** lj_coul_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul.h" using namespace std; using namespace LAMMPS_AL; static LJCoul<PRECISION,ACC_PRECISION> LJCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e) { LJCMF.clear(); gpu_mode=LJCMF.device->gpu_mode(); double gpu_split=LJCMF.device->particle_split(); int first_gpu=LJCMF.device->first_device(); int last_gpu=LJCMF.device->last_device(); int world_me=LJCMF.device->world_me(); int gpu_rank=LJCMF.device->gpu_rank(); int procs_per_gpu=LJCMF.device->procs_per_gpu(); LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu); bool message=false; if (LJCMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCMF.estimate_gpu_overhead(); return init_ok; } void ljc_gpu_clear() { LJCMF.clear(); } int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljc_gpu_compute(const int ago, const int inum_full, const 
int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double ljc_gpu_bytes() { return LJCMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index f0724a8a9..a7a485b4c 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -1,129 +1,129 @@ /*************************************************************************** lj_coul_long_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul/long acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul_long.h" using namespace std; using namespace LAMMPS_AL; static LJCoulLong<PRECISION,ACC_PRECISION> LJCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { LJCLMF.clear(); gpu_mode=LJCLMF.device->gpu_mode(); double gpu_split=LJCLMF.device->particle_split(); int first_gpu=LJCLMF.device->first_device(); int last_gpu=LJCLMF.device->last_device(); int world_me=LJCLMF.device->world_me(); int gpu_rank=LJCLMF.device->gpu_rank(); int procs_per_gpu=LJCLMF.device->procs_per_gpu(); LJCLMF.device->init_message(screen,"lj/cut/coul/long",first_gpu,last_gpu); bool message=false; if (LJCLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, 
special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCLMF.estimate_gpu_overhead(); return init_ok; } void ljcl_gpu_clear() { LJCLMF.clear(); } int** ljcl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double ljcl_gpu_bytes() { return LJCLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index 3480b14f3..af4c93bb8 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -1,131 +1,131 @@ /*************************************************************************** lj_coul_msm_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul/msm acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_coul_msm.h" using namespace std; using namespace LAMMPS_AL; static LJCoulMSM<PRECISION,ACC_PRECISION> LJCMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_gcons, double **host_dgcons, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const int order, const double qqrd2e) { LJCMLMF.clear(); gpu_mode=LJCMLMF.device->gpu_mode(); double gpu_split=LJCMLMF.device->particle_split(); int first_gpu=LJCMLMF.device->first_device(); int last_gpu=LJCMLMF.device->last_device(); int world_me=LJCMLMF.device->world_me(); int gpu_rank=LJCMLMF.device->gpu_rank(); int procs_per_gpu=LJCMLMF.device->procs_per_gpu(); LJCMLMF.device->init_message(screen,"lj/cut/coul/msm",first_gpu,last_gpu); bool message=false; if (LJCMLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); LJCMLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); LJCMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJCMLMF.estimate_gpu_overhead(); return init_ok; } void ljcm_gpu_clear() { LJCMLMF.clear(); } int** ljcm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJCMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, 
special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljcm_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJCMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, host_q,nlocal,boxlo,prd); } double ljcm_gpu_bytes() { return LJCMLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 52b56f38c..5f1f17ee5 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -1,132 +1,132 @@ /*************************************************************************** lj_dsf_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut/coul/dsf acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : 7/12/2012 email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_dsf.h" using namespace std; using namespace LAMMPS_AL; static LJDSF<PRECISION,ACC_PRECISION> LJDMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double e_shift, const double f_shift, const double alpha) { LJDMF.clear(); gpu_mode=LJDMF.device->gpu_mode(); double gpu_split=LJDMF.device->particle_split(); int first_gpu=LJDMF.device->first_device(); int last_gpu=LJDMF.device->last_device(); int world_me=LJDMF.device->world_me(); int gpu_rank=LJDMF.device->gpu_rank(); int procs_per_gpu=LJDMF.device->procs_per_gpu(); LJDMF.device->init_message(screen,"lj/cut/coul/dsf",first_gpu,last_gpu); bool message=false; if (LJDMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); LJDMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing 
Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); LJDMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJDMF.estimate_gpu_overhead(); return init_ok; } void ljd_gpu_clear() { LJDMF.clear(); } int** ljd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd) { return LJDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); } void ljd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, const int nlocal, double *boxlo, double *prd) { LJDMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, vflag,eatom,vatom,host_start,cpu_time,success,host_q, nlocal,boxlo,prd); } double ljd_gpu_bytes() { return LJDMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 54bb3f62f..c6939cdf9 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** lj_expand_ext.cpp ------------------- Inderaj Bains (NVIDIA) Functions for LAMMPS access to lj/expand acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : ibains@nvidia.com ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj_expand.h" using namespace std; using namespace LAMMPS_AL; static LJExpand<PRECISION,ACC_PRECISION> LJEMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double **shift, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJEMF.clear(); gpu_mode=LJEMF.device->gpu_mode(); double gpu_split=LJEMF.device->particle_split(); int first_gpu=LJEMF.device->first_device(); int last_gpu=LJEMF.device->last_device(); int world_me=LJEMF.device->world_me(); int gpu_rank=LJEMF.device->gpu_rank(); int procs_per_gpu=LJEMF.device->procs_per_gpu(); LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu); bool message=false; if (LJEMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJEMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, shift, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJEMF.estimate_gpu_overhead(); return init_ok; } void lje_gpu_clear() { LJEMF.clear(); } int** lje_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void lje_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { 
LJEMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lje_gpu_bytes() { return LJEMF.host_memory_usage(); } diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp index 1dc47ccbb..650eadbfc 100644 --- a/lib/gpu/lal_lj_ext.cpp +++ b/lib/gpu/lal_lj_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** lj_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to lj/cut acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_lj.h" using namespace std; using namespace LAMMPS_AL; static LJ<PRECISION,ACC_PRECISION> LJLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { LJLMF.clear(); gpu_mode=LJLMF.device->gpu_mode(); double gpu_split=LJLMF.device->particle_split(); int first_gpu=LJLMF.device->first_device(); int last_gpu=LJLMF.device->last_device(); int world_me=LJLMF.device->world_me(); int gpu_rank=LJLMF.device->gpu_rank(); int procs_per_gpu=LJLMF.device->procs_per_gpu(); LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu); bool message=false; if (LJLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) LJLMF.estimate_gpu_overhead(); return init_ok; } void ljl_gpu_clear() { LJLMF.clear(); } int ** ljl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double 
cpu_time, bool &success) { return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void ljl_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double ljl_gpu_bytes() { return LJLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp index 52668cf52..d3f17a92b 100644 --- a/lib/gpu/lal_mie_ext.cpp +++ b/lib/gpu/lal_mie_ext.cpp @@ -1,124 +1,124 @@ /*************************************************************************** mie_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to mie acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_mie.h" using namespace std; using namespace LAMMPS_AL; static Mie<PRECISION,ACC_PRECISION> MLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, double **host_mie2, double **host_mie3, double **host_mie4, double **host_gamA, double **host_gamR, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MLMF.clear(); gpu_mode=MLMF.device->gpu_mode(); double gpu_split=MLMF.device->particle_split(); int first_gpu=MLMF.device->first_device(); int last_gpu=MLMF.device->last_device(); int world_me=MLMF.device->world_me(); int gpu_rank=MLMF.device->gpu_rank(); int procs_per_gpu=MLMF.device->procs_per_gpu(); MLMF.device->init_message(screen,"mie",first_gpu,last_gpu); bool message=false; if (MLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3, host_mie4, host_gamA, host_gamR, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); MLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3, host_mie4, host_gamA, host_gamR, offset, special_lj, inum, nall, 300, 
maxspecial, cell_size, gpu_split, screen); MLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) MLMF.estimate_gpu_overhead(); return init_ok; } void mie_gpu_clear() { MLMF.clear(); } int ** mie_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return MLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void mie_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { MLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double mie_gpu_bytes() { return MLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp index 787f49b39..5277ba43f 100644 --- a/lib/gpu/lal_morse_ext.cpp +++ b/lib/gpu/lal_morse_ext.cpp @@ -1,121 +1,121 @@ /*************************************************************************** morse.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to morse acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_morse.h" using namespace std; using namespace LAMMPS_AL; static Morse<PRECISION,ACC_PRECISION> MORMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int mor_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { MORMF.clear(); gpu_mode=MORMF.device->gpu_mode(); double gpu_split=MORMF.device->particle_split(); int first_gpu=MORMF.device->first_device(); int last_gpu=MORMF.device->last_device(); int world_me=MORMF.device->world_me(); int gpu_rank=MORMF.device->gpu_rank(); int procs_per_gpu=MORMF.device->procs_per_gpu(); MORMF.device->init_message(screen,"morse",first_gpu,last_gpu); bool message=false; if (MORMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); MORMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if 
(last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); MORMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) MORMF.estimate_gpu_overhead(); return init_ok; } void mor_gpu_clear() { MORMF.clear(); } int** mor_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void mor_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { MORMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double mor_gpu_bytes() { return MORMF.host_memory_usage(); } diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp index 08f2c94e9..6e5a82af5 100644 --- a/lib/gpu/lal_pppm_ext.cpp +++ b/lib/gpu/lal_pppm_ext.cpp @@ -1,178 +1,178 @@ /*************************************************************************** pppm_ext.cpp ------------------- W. 
Michael Brown (ORNL) Functions for LAMMPS access to PPPM acceleration routines __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_pppm.h" using namespace std; using namespace LAMMPS_AL; static PPPM<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF; static PPPM<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- template <class grdtyp, class memtyp> grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, grdtyp **rho_coeff, grdtyp **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, int &success) { pppm.clear(0.0); int first_gpu=pppm.device->first_device(); int last_gpu=pppm.device->last_device(); int world_me=pppm.device->world_me(); int gpu_rank=pppm.device->gpu_rank(); int procs_per_gpu=pppm.device->procs_per_gpu(); pppm.device->init_message(screen,"pppm",first_gpu,last_gpu); bool message=false; if (pppm.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } success=0; grdtyp * host_brick=NULL; if (world_me==0) host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out, nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); pppm.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, split,success); pppm.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); return host_brick; } float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, float **rho_coeff, float **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, const bool respa, int &success) { float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, slab_volfactor,nx_pppm,ny_pppm,nz_pppm,split,success); if (split==false && respa==false) PPPMF.device->set_single_precompute(&PPPMF); return b; } void pppm_gpu_clear_f(const 
double cpu_time) { PPPMF.clear(cpu_time); } int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } void pppm_gpu_interp_f(const float qqrd2e_scale) { PPPMF.interp(qqrd2e_scale); } double pppm_gpu_bytes_f() { return PPPMF.host_memory_usage(); } void pppm_gpu_forces_f(double **f) { double etmp; PPPMF.atom->data_unavail(); PPPMF.ans->get_answers(f,NULL,NULL,NULL,NULL,etmp); } double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int order, const int nxlo_out, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, double **vd_brick, const double slab_volfactor, const int nx_pppm, const int ny_pppm, const int nz_pppm, const bool split, const bool respa, int &success) { double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm, split,success); if (split==false && respa==false) PPPMD.device->set_double_precompute(&PPPMD); return b; } void pppm_gpu_clear_d(const double cpu_time) { PPPMD.clear(cpu_time); } int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, double *host_q, double *boxlo, const double delxinv, const double delyinv, const double delzinv) { return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, delxinv,delyinv,delzinv); } void pppm_gpu_interp_d(const double qqrd2e_scale) { PPPMD.interp(qqrd2e_scale); } double pppm_gpu_bytes_d() { return PPPMD.host_memory_usage(); } void pppm_gpu_forces_d(double **f) { double etmp; PPPMD.atom->data_unavail(); PPPMD.ans->get_answers(f,NULL,NULL,NULL,NULL,etmp); } diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 05e3e1d57..226483b94 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -1,477 +1,527 @@ // ************************************************************************** // preprocessor.cu // ------------------- // W. Michael Brown (ORNL) // // Device code for CUDA-specific preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions // // Note: It is assumed that constants with the same names are defined with // the same values in all files. 
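//
// As a simplified illustration of the thread-geometry constants documented
// below (variable names here are hypothetical; the actual kernels differ in
// detail), a pair kernel assigns THREADS_PER_ATOM consecutive threads to each
// atom, strides the neighbor loop across that group, and then reduces the
// partial results:
//
//   int tid    = THREAD_ID_X;                      // thread index within the block
//   int offset = tid & (THREADS_PER_ATOM-1);       // lane within this atom's group
//   for (int jj = offset; jj < numj; jj += THREADS_PER_ATOM) {
//     // accumulate partial force/energy for neighbor jj
//   }
//   // reduce the THREADS_PER_ATOM partial sums (e.g. with shared memory or
//   // shfl_xor) before storing the per-atom result
//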
// // ARCH // Definition: Architecture number for accelerator // MEM_THREADS // Definition: Number of threads with sequential ids accessing memory // simultaneously on multiprocessor // WARP_SIZE: // Definition: Number of threads guaranteed to be on the same instruction // THREADS_PER_ATOM // Definition: Default number of threads assigned per atom for pair styles // Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // THREADS_PER_CHARGE // Definition: Default number of threads assigned per atom for pair styles // with charge // Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM // PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params that can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR // BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose // BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds // BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params that can be stored in shared memory // Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 // //*************************************************************************/ // ------------------------------------------------------------------------- // CUDA DEFINITIONS // ------------------------------------------------------------------------- #ifdef NV_KERNEL #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x #define BLOCK_ID_Y blockIdx.y #define BLOCK_SIZE_X blockDim.x #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ #define __global #define restrict __restrict__ #define atom_add atomicAdd #define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ #else #define ARCH 100 #endif #if (ARCH < 200) #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 16 #define BLOCK_NBOR_BUILD 64 #define BLOCK_PAIR 64 #define BLOCK_BIO_PAIR 64 #define MAX_SHARED_TYPES 8 #else #if (ARCH < 300) #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 128 #define BLOCK_BIO_PAIR 128 #define MAX_SHARED_TYPES 8 #else #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 256 #define BLOCK_BIO_PAIR 256 #define BLOCK_ELLIPSE 128 #define MAX_SHARED_TYPES 11 #ifdef _SINGLE_SINGLE #define shfl_xor __shfl_xor #else ucl_inline double shfl_xor(double var, int laneMask, int width) { int2 tmp; tmp.x = __double2hiint(var); tmp.y = __double2loint(var); tmp.x = __shfl_xor(tmp.x,laneMask,width); tmp.y = __shfl_xor(tmp.y,laneMask,width); return __hiloint2double(tmp.x,tmp.y); } #endif #endif #endif #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define 
MAX_BIO_SHARED_TYPES 128 #ifdef _DOUBLE_DOUBLE #define fetch4(ans,i,pos_tex) { \ int4 xy = tex1Dfetch(pos_tex,i*2); \ int4 zt = tex1Dfetch(pos_tex,i*2+1); \ ans.x=__hiloint2double(xy.y, xy.x); \ ans.y=__hiloint2double(xy.w, xy.z); \ ans.z=__hiloint2double(zt.y, zt.x); \ ans.w=__hiloint2double(zt.w, zt.z); \ } #define fetch(ans,i,q_tex) { \ int2 qt = tex1Dfetch(q_tex,i); \ ans=__hiloint2double(qt.y, qt.x); \ } #else #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); #endif #if (__CUDA_ARCH__ < 200) #define fast_mul __mul24 #define MEM_THREADS 16 #else #define fast_mul(X,Y) (X)*(Y) #define MEM_THREADS 32 #endif #ifdef CUDA_PRE_THREE struct __builtin_align__(16) _double4 { double x, y, z, w; }; typedef struct _double4 double4; #endif #ifdef _DOUBLE_DOUBLE #define ucl_exp exp #define ucl_powr pow #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #else #define ucl_atan atanf #define ucl_cbrt cbrtf #define ucl_ceil ceilf #define ucl_abs fabsf #define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrtf #define ucl_sqrt sqrtf #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp expf #define ucl_powr powf #else #define ucl_exp __expf #define ucl_powr __powf #endif #endif #endif // ------------------------------------------------------------------------- // NVIDIA GENERIC OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef NV_GENERIC_OCL #define USE_OPENCL #define fast_mul mul24 #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // NVIDIA FERMI OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef FERMI_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 128 #define MAX_SHARED_TYPES 11 #define BLOCK_NBOR_BUILD 128 #define BLOCK_BIO_PAIR 128 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // NVIDIA KEPLER OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef KEPLER_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 256 #define MAX_SHARED_TYPES 11 #define BLOCK_NBOR_BUILD 128 #define BLOCK_BIO_PAIR 256 #define BLOCK_ELLIPSE 128 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #ifndef NO_OCL_PTX #define ARCH 300 #ifdef _SINGLE_SINGLE inline float shfl_xor(float var, int laneMask, int width) { float ret; int c; c = ((WARP_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); return ret; } #else #pragma OPENCL EXTENSION cl_khr_fp64 : enable inline double shfl_xor(double var, int laneMask, int width) { int c = ((WARP_SIZE-width) << 8) | 0x1f; int x,y,x2,y2; double ans; asm volatile ("mov.b64 {%0, %1}, 
%2;" : "=r"(y), "=r"(x) : "d"(var)); asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c)); asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c)); asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); return ans; } #endif #endif #endif // ------------------------------------------------------------------------- // AMD CYPRESS OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef CYPRESS_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 128 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 64 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif +// ------------------------------------------------------------------------- +// INTEL CPU OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef INTEL_OCL + +#define USE_OPENCL +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 1 +#define MAX_SHARED_TYPES 0 +#define BLOCK_NBOR_BUILD 4 +#define BLOCK_BIO_PAIR 2 +#define BLOCK_ELLIPSE 2 + +#define WARP_SIZE 1 +#define PPPM_BLOCK_1D 32 +#define BLOCK_CELL_2D 1 +#define BLOCK_CELL_ID 2 +#define MAX_BIO_SHARED_TYPES 0 + +#endif + +// ------------------------------------------------------------------------- +// INTEL PHI OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + +#ifdef PHI_OCL + +#define USE_OPENCL +#define MEM_THREADS 16 +#define THREADS_PER_ATOM 1 +#define THREADS_PER_CHARGE 1 +#define BLOCK_PAIR 16 +#define MAX_SHARED_TYPES 0 +#define BLOCK_NBOR_BUILD 16 +#define BLOCK_BIO_PAIR 16 +#define BLOCK_ELLIPSE 16 + +#define WARP_SIZE 1 +#define PPPM_BLOCK_1D 32 +#define BLOCK_CELL_2D 4 +#define BLOCK_CELL_ID 16 +#define MAX_BIO_SHARED_TYPES 0 + +#endif + // ------------------------------------------------------------------------- // GENERIC OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef GENERIC_OCL #define USE_OPENCL #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 1 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // OPENCL Stuff for All Hardware // ------------------------------------------------------------------------- #ifdef USE_OPENCL #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 #ifndef cl_amd_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif #endif #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifndef fast_mul #define fast_mul(X,Y) (X)*(Y) #endif #ifndef ARCH #define ARCH 0 #endif #ifndef DRIVER #define DRIVER 0 #endif #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define GLOBAL_SIZE_X get_global_size(0) #define THREAD_ID_Y get_local_id(1) #define BLOCK_ID_Y get_group_id(1) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define ucl_inline inline #define 
fetch4(ans,i,x) ans=x[i] #define fetch(ans,i,q) ans=q[i] #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #ifdef _DOUBLE_DOUBLE #define NO_HARDWARE_TRANSCENDENTALS #endif #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp exp #define ucl_powr powr #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #else #define ucl_exp native_exp #define ucl_powr native_powr #define ucl_rsqrt native_rsqrt #define ucl_sqrt native_sqrt #define ucl_recip native_recip #endif #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- +#ifndef PPPM_MAX_SPLINE #define PPPM_MAX_SPLINE 8 +#endif #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double #define acctyp4 double4 #endif #ifdef _SINGLE_DOUBLE #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp double #define acctyp4 double4 #endif #ifndef numtyp #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp float #define acctyp4 float4 #endif #define EWALD_F (numtyp)1.12837917 #define EWALD_P (numtyp)0.3275911 #define A1 (numtyp)0.254829592 #define A2 (numtyp)-0.284496736 #define A3 (numtyp)1.421413741 #define A4 (numtyp)-1.453152027 #define A5 (numtyp)1.061405429 #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; #ifndef BLOCK_ELLIPSE #define BLOCK_ELLIPSE BLOCK_PAIR #endif diff --git a/lib/gpu/lal_re_squared_ext.cpp b/lib/gpu/lal_re_squared_ext.cpp index 09e4c15c4..18bb1f681 100644 --- a/lib/gpu/lal_re_squared_ext.cpp +++ b/lib/gpu/lal_re_squared_ext.cpp @@ -1,138 +1,138 @@ /*************************************************************************** re_squared_ext.cpp ------------------- W. 
Michael Brown LAMMPS Wrappers for RE-Squared Acceleration __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_re_squared.h" using namespace std; using namespace LAMMPS_AL; static RESquared<PRECISION,ACC_PRECISION> REMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, double **sigma, double **epsilon, int **form, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { REMF.clear(); gpu_mode=REMF.device->gpu_mode(); double gpu_split=REMF.device->particle_split(); int first_gpu=REMF.device->first_device(); int last_gpu=REMF.device->last_device(); int world_me=REMF.device->world_me(); int gpu_rank=REMF.device->gpu_rank(); int procs_per_gpu=REMF.device->procs_per_gpu(); REMF.device->init_message(screen,"resquared",first_gpu,last_gpu); bool message=false; if (REMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); REMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=REMF.init(ntypes, shape, well, cutsq, sigma, epsilon, form, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); REMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) REMF.estimate_gpu_overhead(); return init_ok; } // --------------------------------------------------------------------------- // Clear memory on host and device // --------------------------------------------------------------------------- void re_gpu_clear() { REMF.clear(); } int** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, double **host_quat); int** re_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double 
*sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { return REMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_quat); } int * re_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Return memory usage // --------------------------------------------------------------------------- double re_gpu_bytes() { return REMF.host_memory_usage(); } diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 3efeece42..effae9639 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** soft_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to soft acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_soft.h" using namespace std; using namespace LAMMPS_AL; static Soft<PRECISION,ACC_PRECISION> SLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, double **host_cut, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { SLMF.clear(); gpu_mode=SLMF.device->gpu_mode(); double gpu_split=SLMF.device->particle_split(); int first_gpu=SLMF.device->first_device(); int last_gpu=SLMF.device->last_device(); int world_me=SLMF.device->world_me(); int gpu_rank=SLMF.device->gpu_rank(); int procs_per_gpu=SLMF.device->procs_per_gpu(); SLMF.device->init_message(screen,"soft",first_gpu,last_gpu); bool message=false; if (SLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); SLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - 
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) SLMF.estimate_gpu_overhead(); return init_ok; } void soft_gpu_clear() { SLMF.clear(); } int ** soft_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return SLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void soft_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double soft_gpu_bytes() { return SLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index 33abf073d..e23570e02 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -1,128 +1,128 @@ /*************************************************************************** sw_ext.cpp ------------------- W. Michael Brown (ORNL) Functions for LAMMPS access to sw acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : Tue March 26, 2013 email : brownw@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_sw.h" using namespace std; using namespace LAMMPS_AL; static SW<PRECISION,ACC_PRECISION> SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sw_gpu_init(const int inum, const int nall, const int max_nbors, const double cell_size, int &gpu_mode, FILE *screen, const double sw_epsilon, const double sw_sigma, const double sw_lambda, const double sw_gamma, const double sw_costheta, const double sw_biga, const double sw_bigb, const double sw_powerp, const double sw_powerq, const double sw_cut, const double sw_cutsq) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); double gpu_split=SWMF.device->particle_split(); int first_gpu=SWMF.device->first_device(); int last_gpu=SWMF.device->last_device(); int world_me=SWMF.device->world_me(); int gpu_rank=SWMF.device->gpu_rank(); int procs_per_gpu=SWMF.device->procs_per_gpu(); // disable host/device split for now if (gpu_split != 1.0) return -8; SWMF.device->init_message(screen,"sw/gpu",first_gpu,last_gpu); bool message=false; if (SWMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=SWMF.init(inum, nall, 300, cell_size, gpu_split, screen, sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); SWMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) SWMF.estimate_gpu_overhead(); return init_ok; } void sw_gpu_clear() { SWMF.clear(); } int ** sw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return SWMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void sw_gpu_compute(const int ago, const int nlocal, const int nall, const int nlist, double **host_x, int *host_type, int *ilist, int *numj, int 
**firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { SWMF.compute(ago,nlocal,nall,nlist,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double sw_gpu_bytes() { return SWMF.host_memory_usage(); } diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index 2186e33de..55fe9318b 100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** lal_table.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to table acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_table.h" using namespace std; using namespace LAMMPS_AL; static Table<PRECISION,ACC_PRECISION> TBMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, double **table_data, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, int tabstyle, int ntables, int tablength) { TBMF.clear(); gpu_mode=TBMF.device->gpu_mode(); double gpu_split=TBMF.device->particle_split(); int first_gpu=TBMF.device->first_device(); int last_gpu=TBMF.device->last_device(); int world_me=TBMF.device->world_me(); int gpu_rank=TBMF.device->gpu_rank(); int procs_per_gpu=TBMF.device->procs_per_gpu(); TBMF.device->init_message(screen,"table",first_gpu,last_gpu); bool message=false; if (TBMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) TBMF.estimate_gpu_overhead(); return init_ok; } void table_gpu_clear() { TBMF.clear(); } int ** table_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int 
**nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return TBMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void table_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { TBMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double table_gpu_bytes() { return TBMF.host_memory_usage(); } diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index c43166c33..a1bd0ee1f 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -1,123 +1,123 @@ /*************************************************************************** yukawa_colloid_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to colloid acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_yukawa_colloid.h" using namespace std; using namespace LAMMPS_AL; static YukawaColloid<PRECISION,ACC_PRECISION> YKCOLLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, double **host_offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, const double kappa) { YKCOLLMF.clear(); gpu_mode=YKCOLLMF.device->gpu_mode(); double gpu_split=YKCOLLMF.device->particle_split(); int first_gpu=YKCOLLMF.device->first_device(); int last_gpu=YKCOLLMF.device->last_device(); int world_me=YKCOLLMF.device->world_me(); int gpu_rank=YKCOLLMF.device->gpu_rank(); int procs_per_gpu=YKCOLLMF.device->procs_per_gpu(); YKCOLLMF.device->init_message(screen,"yukawa/colloid",first_gpu,last_gpu); bool message=false; if (YKCOLLMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); 
fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) YKCOLLMF.estimate_gpu_overhead(); return init_ok; } void ykcolloid_gpu_clear() { YKCOLLMF.clear(); } int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_rad) { return YKCOLLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_rad); } void ykcolloid_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_rad) { YKCOLLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time, success,host_rad); } double ykcolloid_gpu_bytes() { return YKCOLLMF.host_memory_usage(); } diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 36f390ab9..5737bc102 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -1,120 +1,120 @@ /*************************************************************************** yukawa_ext.cpp ------------------- Trung Dac Nguyen (ORNL) Functions for LAMMPS access to yukawa acceleration routines. 
__________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : nguyentd@ornl.gov ***************************************************************************/ #include <iostream> #include <cassert> #include <math.h> #include "lal_yukawa.h" using namespace std; using namespace LAMMPS_AL; static Yukawa<PRECISION,ACC_PRECISION> YKMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, double **host_a, double **offset, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { YKMF.clear(); gpu_mode=YKMF.device->gpu_mode(); double gpu_split=YKMF.device->particle_split(); int first_gpu=YKMF.device->first_device(); int last_gpu=YKMF.device->last_device(); int world_me=YKMF.device->world_me(); int gpu_rank=YKMF.device->gpu_rank(); int procs_per_gpu=YKMF.device->procs_per_gpu(); YKMF.device->init_message(screen,"yukawa",first_gpu,last_gpu); bool message=false; if (YKMF.device->replica_me()==0 && screen) message=true; if (message) { - fprintf(screen,"Initializing GPU and compiling on process 0..."); + fprintf(screen,"Initializing Device and compiling on process 0..."); fflush(screen); } int init_ok=0; if (world_me==0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); if (message) fprintf(screen,"Done.\n"); for (int i=0; i<procs_per_gpu; i++) { if (message) { if (last_gpu-first_gpu==0) - fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i); + fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i); else - fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu, + fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu, last_gpu,i); fflush(screen); } if (gpu_rank==i && world_me!=0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, inum, nall, 300, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); if (init_ok==0) YKMF.estimate_gpu_overhead(); return init_ok; } void yukawa_gpu_clear() { YKMF.clear(); } int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success) { return YKMF.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success); } void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { YKMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double yukawa_gpu_bytes() { return 
YKMF.host_memory_usage(); }
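
The _DOUBLE_DOUBLE variants of the fetch() and fetch4() macros in the preprocessor hunk above rebuild each double-precision value from two 32-bit halves returned by an integer texture fetch. A minimal host-side C sketch of that same reassembly, using a hypothetical helper host_hiloint2double() in place of CUDA's __hiloint2double() (the helper and the main() driver below are illustrative only and are not part of the library), is:

/* Host-side sketch only: host_hiloint2double() is a hypothetical stand-in
   for CUDA's __hiloint2double(hi,lo); it does not exist in the library. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static double host_hiloint2double(int32_t hi, int32_t lo) {
  /* pack the high and low 32-bit words and reinterpret the bits as a double */
  uint64_t bits = ((uint64_t)(uint32_t)hi << 32) | (uint32_t)lo;
  double d;
  memcpy(&d, &bits, sizeof d);
  return d;
}

int main(void) {
  double x = 3.14159;                             /* value stored in the texture   */
  uint64_t bits;
  memcpy(&bits, &x, sizeof bits);
  int32_t lo = (int32_t)(bits & 0xffffffffu);     /* low word, as in qt.x          */
  int32_t hi = (int32_t)(bits >> 32);             /* high word, as in qt.y         */
  printf("%.5f\n", host_hiloint2double(hi, lo));  /* prints 3.14159                */
  return 0;
}

On the device, fetch4() applies this reassembly twice per int4 read: the first tex1Dfetch supplies the x and y components and the second supplies z and w.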
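
The shfl_xor() wrappers defined for the Kepler OpenCL path above are the kind of primitive used for butterfly reductions across the work-items cooperating on one atom when THREADS_PER_ATOM is greater than 1. A minimal sketch of that reduction pattern, assuming the _SINGLE_SINGLE Kepler OpenCL definitions are in effect and using a hypothetical helper name warp_sum() that is not part of the library, is:

/* Sketch only: assumes shfl_xor(float,int,int) from the KEPLER_OCL block
   above is in scope; warp_sum() is a hypothetical helper. */
inline float warp_sum(float val, int width) {
  /* butterfly reduction: after log2(width) exchanges, every lane in the
     width-sized group holds the sum over that group */
  for (int offset = width/2; offset > 0; offset >>= 1)
    val += shfl_xor(val, offset, width);
  return val;
}

With width equal to THREADS_PER_ATOM, each group of cooperating work-items could combine its partial force or energy terms this way without a round trip through local memory.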