accelerate_gpu.html
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Fri, May 17, 15:49

accelerate_gpu.html
View Options



	<!DOCTYPE html>
	<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
	<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
	<head>
	<meta charset="utf-8">

	<meta name="viewport" content="width=device-width, initial-scale=1.0">

	<title>5.3.1. GPU package — LAMMPS documentation</title>















	<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />



	<link rel="stylesheet" href="_static/sphinxcontrib-images/LightBox2/lightbox2/css/lightbox.css" type="text/css" />



	<link rel="top" title="LAMMPS documentation" href="index.html"/>
	<link rel="up" title="5. Accelerating LAMMPS performance" href="Section_accelerate.html"/>
	<link rel="next" title="5.3.2. USER-INTEL package" href="accelerate_intel.html"/>
	<link rel="prev" title="5. Accelerating LAMMPS performance" href="Section_accelerate.html"/>


	<script src="_static/js/modernizr.min.js"></script>

	</head>

	<body class="wy-body-for-nav" role="document">

	<div class="wy-grid-for-nav">


	<nav data-toggle="wy-nav-shift" class="wy-nav-side">
	<div class="wy-side-nav-search">



	<a href="Manual.html" class="icon icon-home"> LAMMPS



	</a>


	<div role="search">
	<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
	<input type="text" name="q" placeholder="Search docs" />
	<input type="hidden" name="check_keywords" value="yes" />
	<input type="hidden" name="area" value="default" />
	</form>
	</div>


	</div>

	<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">



	<p class="caption"><span class="caption-text">User Documentation</span></p>
	<ul class="current">
	<li class="toctree-l1"><a class="reference internal" href="Section_intro.html">1. Introduction</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_start.html">2. Getting Started</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_commands.html">3. Commands</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_packages.html">4. Packages</a></li>
	<li class="toctree-l1 current"><a class="reference internal" href="Section_accelerate.html">5. Accelerating LAMMPS performance</a><ul class="current">
	<li class="toctree-l2"><a class="reference internal" href="Section_accelerate.html#measuring-performance">5.1. Measuring performance</a></li>
	<li class="toctree-l2"><a class="reference internal" href="Section_accelerate.html#general-strategies">5.2. General strategies</a></li>
	<li class="toctree-l2 current"><a class="reference internal" href="Section_accelerate.html#packages-with-optimized-styles">5.3. Packages with optimized styles</a><ul class="current">
	<li class="toctree-l3 current"><a class="current reference internal" href="#">5.3.1. GPU package</a><ul>
	<li class="toctree-l4"><a class="reference internal" href="#restrictions">5.3.1.1. Restrictions</a></li>
	</ul>
	</li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_intel.html">5.3.2. USER-INTEL package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_kokkos.html">5.3.3. KOKKOS package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_omp.html">5.3.4. USER-OMP package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_opt.html">5.3.5. OPT package</a></li>
	</ul>
	</li>
	<li class="toctree-l2"><a class="reference internal" href="Section_accelerate.html#comparison-of-various-accelerator-packages">5.4. Comparison of various accelerator packages</a></li>
	</ul>
	</li>
	<li class="toctree-l1"><a class="reference internal" href="Section_howto.html">6. How-to discussions</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_example.html">7. Example problems</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_perf.html">8. Performance &amp; scalability</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_tools.html">9. Additional tools</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_modify.html">10. Modifying &amp; extending LAMMPS</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_python.html">11. Python interface to LAMMPS</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_errors.html">12. Errors</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_history.html">13. Future and history</a></li>
	</ul>
	<p class="caption"><span class="caption-text">Index</span></p>
	<ul>
	<li class="toctree-l1"><a class="reference internal" href="tutorials.html">Tutorials</a></li>
	<li class="toctree-l1"><a class="reference internal" href="commands.html">Commands</a></li>
	<li class="toctree-l1"><a class="reference internal" href="fixes.html">Fixes</a></li>
	<li class="toctree-l1"><a class="reference internal" href="computes.html">Computes</a></li>
	<li class="toctree-l1"><a class="reference internal" href="pairs.html">Pair Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="bonds.html">Bond Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="angles.html">Angle Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="dihedrals.html">Dihedral Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="impropers.html">Improper Styles</a></li>
	</ul>



	</div>

	</nav>

	<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">


	<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
	<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
	<a href="Manual.html">LAMMPS</a>
	</nav>



	<div class="wy-nav-content">
	<div class="rst-content">
	<div role="navigation" aria-label="breadcrumbs navigation">
	<ul class="wy-breadcrumbs">
	<li><a href="Manual.html">Docs</a> »</li>

	<li><a href="Section_accelerate.html">5. Accelerating LAMMPS performance</a> »</li>

	<li>5.3.1. GPU package</li>
	<li class="wy-breadcrumbs-aside">


	<a href="http://lammps.sandia.gov">Website</a>
	<a href="Section_commands.html#comm">Commands</a>

	</li>
	</ul>
	<hr/>

	<div class="rst-footer-buttons" style="margin-bottom: 1em" role="navigation" aria-label="footer navigation">

	<a href="accelerate_intel.html" class="btn btn-neutral float-right" title="5.3.2. USER-INTEL package" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>


	<a href="Section_accelerate.html" class="btn btn-neutral" title="5. Accelerating LAMMPS performance" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>

	</div>

	</div>
	<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
	<div itemprop="articleBody">

	<p><a class="reference internal" href="Section_accelerate.html"><span class="doc">Return to Section accelerate overview</span></a></p>
	<div class="section" id="gpu-package">
	<h1>5.3.1. GPU package</h1>
	<p>The GPU package was developed by Mike Brown at ORNL and his
	collaborators, particularly Trung Nguyen (ORNL). It provides GPU
	versions of many pair styles, including the 3-body Stillinger-Weber
	pair style, and of <a class="reference internal" href="kspace_style.html"><span class="doc">kspace_style pppm</span></a> for
	long-range Coulombics. It has the following general features:</p>
	<ul class="simple">
	<li>It is designed to exploit common GPU hardware configurations where one
	or more GPUs are coupled to many cores of one or more multi-core CPUs,
	e.g. within a node of a parallel machine.</li>
	<li>Atom-based data (e.g. coordinates, forces) moves back-and-forth
	between the CPU(s) and GPU every timestep.</li>
	<li>Neighbor lists can be built on the CPU or on the GPU.</li>
	<li>The charge assignment and force interpolation portions of PPPM can be
	run on the GPU. The FFT portion, which requires MPI communication
	between processors, runs on the CPU.</li>
	<li>Asynchronous force computations can be performed simultaneously on the
	CPU(s) and GPU.</li>
	<li>It allows for GPU computations to be performed in single or double
	precision, or in mixed-mode precision, where pairwise forces are
	computed in single precision, but accumulated into double-precision
	force vectors.</li>
	<li>LAMMPS-specific code is in the GPU package. It makes calls to a
	generic GPU library in the lib/gpu directory. This library provides
	NVIDIA support as well as more general OpenCL support, so that the
	same functionality can eventually be supported on a variety of GPU
	hardware.</li>
	</ul>
	<p>Here is a quick overview of how to enable and use the GPU package:</p>
	<ul class="simple">
	<li>build the library in lib/gpu for your GPU hardware with the desired precision settings</li>
	<li>install the GPU package and build LAMMPS as usual</li>
	<li>use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU</li>
	<li>specify the # of GPUs per node</li>
	<li>use GPU styles in your input script</li>
	</ul>
	<p>The latter two steps can be done using the “-pk gpu” and “-sf gpu”
	<a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switches</span></a> respectively. Or
	the effect of the “-pk” or “-sf” switches can be duplicated by adding
	the <a class="reference internal" href="package.html"><span class="doc">package gpu</span></a> or <a class="reference internal" href="suffix.html"><span class="doc">suffix gpu</span></a> commands
	respectively to your input script.</p>
	<p><strong>Required hardware/software:</strong></p>
	<p>To use this package, you currently need to have an NVIDIA GPU and
	install the NVIDIA Cuda software on your system:</p>
	<ul class="simple">
	<li>Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information</li>
	<li>Go to <a class="reference external" href="http://www.nvidia.com/object/cuda_get.html">http://www.nvidia.com/object/cuda_get.html</a></li>
	<li>Install a driver and toolkit appropriate for your system (SDK is not necessary)</li>
	<li>Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties</li>
	</ul>
	<p><strong>Building LAMMPS with the GPU package:</strong></p>
	<p>This requires two steps (a,b): build the GPU library, then build
	LAMMPS with the GPU package.</p>
	<p>You can do both these steps in one line, using the src/Make.py script,
	described in <a class="reference internal" href="Section_start.html#start-4"><span class="std std-ref">Section 2.4</span></a> of the manual.
	Type “Make.py -h” for help. If run from the src directory, this
	command will create src/lmp_gpu using src/MAKE/Makefile.mpi as the
	starting Makefile.machine:</p>
	<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Make</span><span class="o">.</span><span class="n">py</span> <span class="o">-</span><span class="n">p</span> <span class="n">gpu</span> <span class="o">-</span><span class="n">gpu</span> <span class="n">mode</span><span class="o">=</span><span class="n">single</span> <span class="n">arch</span><span class="o">=</span><span class="mi">31</span> <span class="o">-</span><span class="n">o</span> <span class="n">gpu</span> <span class="o">-</span><span class="n">a</span> <span class="n">lib</span><span class="o">-</span><span class="n">gpu</span> <span class="n">file</span> <span class="n">mpi</span>
	</pre></div>
	</div>
	<p>Or you can follow these two (a,b) steps:</p>
	<ol class="loweralpha simple">
	<li>Build the GPU library</li>
	</ol>
	<p>The GPU library is in lammps/lib/gpu. Select a Makefile.machine (in
	lib/gpu) appropriate for your system. You should pay special
	attention to 3 settings in this makefile.</p>
	<ul class="simple">
	<li>CUDA_HOME = needs to be where NVIDIA Cuda software is installed on your system</li>
	<li>CUDA_ARCH = needs to be appropriate to your GPUs</li>
	<li>CUDA_PREC = precision (double, mixed, single) you desire</li>
	</ul>
	<p>See lib/gpu/Makefile.linux.double for examples of the ARCH settings
	for different GPU choices, e.g. Fermi vs Kepler. It also lists the
	possible precision settings:</p>
	<pre class="literal-block">
	CUDA_PREC = -D_SINGLE_SINGLE # single precision for all calculations
	CUDA_PREC = -D_DOUBLE_DOUBLE # double precision for all calculations
	CUDA_PREC = -D_SINGLE_DOUBLE # accumulation of forces, etc, in double
	</pre>
	<p>The last setting is the mixed mode referred to above. Note that your
	GPU must support double precision to use either the 2nd or 3rd of
	these settings.</p>
	<p>To build the library, type:</p>
	<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">make</span> <span class="o">-</span><span class="n">f</span> <span class="n">Makefile</span><span class="o">.</span><span class="n">machine</span>
	</pre></div>
	</div>
	<p>If successful, it will produce the files libgpu.a and Makefile.lammps.</p>
	<p>The latter file has 3 settings that need to be appropriate for the
	paths and settings for the CUDA system software on your machine.
	Makefile.lammps is a copy of the file specified by the EXTRAMAKE
	setting in Makefile.machine. You can change EXTRAMAKE or create your
	own Makefile.lammps.machine if needed.</p>
	<p>Note that to change the precision of the GPU library, you need to
	re-build the entire library. Do a “clean” first, e.g. “make -f
	Makefile.linux clean”, followed by the make command above.</p>
	<ol class="loweralpha simple" start="2">
	<li>Build LAMMPS with the GPU package</li>
	</ol>
	<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">lammps</span><span class="o">/</span><span class="n">src</span>
	<span class="n">make</span> <span class="n">yes</span><span class="o">-</span><span class="n">gpu</span>
	<span class="n">make</span> <span class="n">machine</span>
	</pre></div>
	</div>
	<p>No additional compile/link flags are needed in Makefile.machine.</p>
	<p>Note that if you change the GPU library precision (discussed above)
	and rebuild the GPU library, then you also need to re-install the GPU
	package and re-build LAMMPS, so that all affected files are
	re-compiled and linked to the new GPU library.</p>
	<p><strong>Run with the GPU package from the command line:</strong></p>
	<p>The mpirun or mpiexec command sets the total number of MPI tasks used
	by LAMMPS (one or multiple per compute node) and the number of MPI
	tasks used per node. E.g. the mpirun command in MPICH does this via
	its -np and -ppn switches. Ditto for OpenMPI via -np and -npernode.</p>
	<p>When using the GPU package, you cannot assign more than one GPU to a
	single MPI task. However multiple MPI tasks can share the same GPU,
	and in many cases it will be more efficient to run this way. Likewise
	it may be more efficient to use fewer MPI tasks/node than the available
	# of CPU cores. Assignment of multiple MPI tasks to a GPU will happen
	automatically if you create more MPI tasks/node than there are
	GPUs/node. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
	shared by 4 MPI tasks.</p>
	<p>Use the “-sf gpu” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>,
	which will automatically append “gpu” to styles that support it. Use
	the “-pk gpu Ng” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> to
	set Ng = # of GPUs/node to use.</p>
	<pre class="literal-block">
	lmp_machine -sf gpu -pk gpu 1 -in in.script # 1 MPI task uses 1 GPU
	mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node
	mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes
	</pre>
	<p>Note that if the “-sf gpu” switch is used, it also issues a default
	<a class="reference internal" href="package.html"><span class="doc">package gpu 1</span></a> command, which sets the number of
	GPUs/node to 1.</p>
	<p>Using the “-pk” switch explicitly allows for setting of the number of
	GPUs/node to use and additional options. Its syntax is the same as
	the “package gpu” command. See the <a class="reference internal" href="package.html"><span class="doc">package</span></a>
	command doc page for details, including the default values used for
	all its options if it is not specified.</p>
	<p>Note that the default for the <a class="reference internal" href="package.html"><span class="doc">package gpu</span></a> command is to
	set the Newton flag to “off” for pairwise interactions. It does not
	affect the setting for bonded interactions (LAMMPS default is “on”).
	The “off” setting for pairwise interaction is currently required for
	GPU package pair styles.</p>
	<p><strong>Or run with the GPU package by editing an input script:</strong></p>
	<p>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
	and use of multiple MPI tasks/GPU is the same.</p>
	<p>Use the <a class="reference internal" href="suffix.html"><span class="doc">suffix gpu</span></a> command, or you can explicitly add a
	“gpu” suffix to individual styles in your input script, e.g.</p>
	<pre class="literal-block">
	pair_style lj/cut/gpu 2.5
	</pre>
	<p>You must also use the <a class="reference internal" href="package.html"><span class="doc">package gpu</span></a> command to enable the
	GPU package, unless the “-sf gpu” or “-pk gpu” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switches</span></a> were used. It specifies the
	number of GPUs/node to use, as well as other options.</p>
	<p><strong>Speed-ups to expect:</strong></p>
	<p>The performance of a GPU versus a multi-core CPU is a function of your
	hardware, which pair style is used, the number of atoms/GPU, and the
	precision used on the GPU (double, single, mixed).</p>
	<p>See the <a class="reference external" href="http://lammps.sandia.gov/bench.html">Benchmark page</a> of the
	LAMMPS web site for performance of the GPU package on various
	hardware, including the Titan HPC platform at ORNL.</p>
	<p>You should also experiment with how many MPI tasks per GPU to use to
	give the best performance for your problem and machine. This is also
	a function of the problem size and the pair style being used.
	Likewise, you should experiment with the precision setting for the GPU
	library to see if single or mixed precision will give accurate
	results, since they will typically be faster.</p>
	<p><strong>Guidelines for best performance:</strong></p>
	<ul class="simple">
	<li>Using multiple MPI tasks per GPU will often give the best performance,
	as allowed by most multi-core CPU/GPU configurations.</li>
	<li>If the number of particles per MPI task is small (e.g. 100s of
	particles), it can be more efficient to run with fewer MPI tasks per
	GPU, even if you do not use all the cores on the compute node.</li>
	<li>The <a class="reference internal" href="package.html"><span class="doc">package gpu</span></a> command has several options for tuning
	performance. Neighbor lists can be built on the GPU or CPU. Force
	calculations can be dynamically balanced across the CPU cores and
	GPUs. GPU-specific settings can be made which can be optimized
	for different hardware. See the <a class="reference internal" href="package.html"><span class="doc">package</span></a> command
	doc page for details.</li>
	<li>As described by the <a class="reference internal" href="package.html"><span class="doc">package gpu</span></a> command, GPU
	accelerated pair styles can perform computations asynchronously with
	CPU computations. The “Pair” time reported by LAMMPS will be the
	maximum of the time required to complete the CPU pair style
	computations and the time required to complete the GPU pair style
	computations. Any time spent for GPU-enabled pair styles for
	computations that run simultaneously with <a class="reference internal" href="bond_style.html"><span class="doc">bond</span></a>,
	<a class="reference internal" href="angle_style.html"><span class="doc">angle</span></a>, <a class="reference internal" href="dihedral_style.html"><span class="doc">dihedral</span></a>,
	<a class="reference internal" href="improper_style.html"><span class="doc">improper</span></a>, and <a class="reference internal" href="kspace_style.html"><span class="doc">long-range</span></a>
	calculations will not be included in the “Pair” time.</li>
	<li>When the <em>mode</em> setting for the package gpu command is force/neigh,
	the time for neighbor list calculations on the GPU will be added into
	the “Pair” time, not the “Neigh” time. An additional breakdown of the
	times required for various tasks on the GPU (data copy, neighbor
	calculations, force computations, etc) is output only with the LAMMPS
	screen output (not in the log file) at the end of each run. These
	timings represent total time spent on the GPU for each routine,
	regardless of asynchronous CPU calculations.</li>
	<li>The output section “GPU Time Info (average)” reports “Max Mem / Proc”.
	This is the maximum memory used at one time on the GPU for data
	storage by a single MPI process.</li>
	</ul>
	<div class="section" id="restrictions">
	<h2>5.3.1.1. Restrictions</h2>
	<p>None.</p>
	</div>
	</div>


	</div>
	</div>
	<footer>

	<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">

	<a href="accelerate_intel.html" class="btn btn-neutral float-right" title="5.3.2. USER-INTEL package" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>


	<a href="Section_accelerate.html" class="btn btn-neutral" title="5. Accelerating LAMMPS performance" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>

	</div>


	<hr/>

	<div role="contentinfo">
	<p>
	© Copyright 2013 Sandia Corporation.
	</p>
	</div>
	Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.

	</footer>

	</div>
	</div>

	</section>

	</div>





	<script type="text/javascript">
	var DOCUMENTATION_OPTIONS = {
	URL_ROOT:'./',
	VERSION:'',
	COLLAPSE_INDEX:false,
	FILE_SUFFIX:'.html',
	HAS_SOURCE: true
	};
	</script>
	<script type="text/javascript" src="_static/jquery.js"></script>
	<script type="text/javascript" src="_static/underscore.js"></script>
	<script type="text/javascript" src="_static/doctools.js"></script>
	<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/jquery-1.11.0.min.js"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/lightbox.min.js"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2-customize/jquery-noconflict.js"></script>





	<script type="text/javascript" src="_static/js/theme.js"></script>




	<script type="text/javascript">
	jQuery(function () {
	SphinxRtdTheme.StickyNav.enable();
	});
	</script>


	</body>
	</html>

accelerate_gpu.htmlNo OneTemporaryActions

File Metadata

accelerate_gpu.htmlView Options

Event Timeline

accelerate_gpu.html
No OneTemporary
Actions

accelerate_gpu.html
View Options