Section_accelerate.html
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, May 27, 14:06

Section_accelerate.html
View Options



	<!DOCTYPE html>
	<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
	<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
	<head>
	<meta charset="utf-8">

	<meta name="viewport" content="width=device-width, initial-scale=1.0">

	<title>5. Accelerating LAMMPS performance — LAMMPS documentation</title>















	<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />



	<link rel="stylesheet" href="_static/sphinxcontrib-images/LightBox2/lightbox2/css/lightbox.css" type="text/css" />



	<link rel="top" title="LAMMPS documentation" href="index.html"/>
	<link rel="next" title="5.3.1. GPU package" href="accelerate_gpu.html"/>
	<link rel="prev" title="4. Packages" href="Section_packages.html"/>


	<script src="_static/js/modernizr.min.js"></script>

	</head>

	<body class="wy-body-for-nav" role="document">

	<div class="wy-grid-for-nav">


	<nav data-toggle="wy-nav-shift" class="wy-nav-side">
	<div class="wy-side-nav-search">



	<a href="Manual.html" class="icon icon-home"> LAMMPS



	</a>


	<div role="search">
	<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
	<input type="text" name="q" placeholder="Search docs" />
	<input type="hidden" name="check_keywords" value="yes" />
	<input type="hidden" name="area" value="default" />
	</form>
	</div>


	</div>

	<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">



	<p class="caption"><span class="caption-text">User Documentation</span></p>
	<ul class="current">
	<li class="toctree-l1"><a class="reference internal" href="Section_intro.html">1. Introduction</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_start.html">2. Getting Started</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_commands.html">3. Commands</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_packages.html">4. Packages</a></li>
	<li class="toctree-l1 current"><a class="current reference internal" href="#">5. Accelerating LAMMPS performance</a><ul>
	<li class="toctree-l2"><a class="reference internal" href="#measuring-performance">5.1. Measuring performance</a></li>
	<li class="toctree-l2"><a class="reference internal" href="#general-strategies">5.2. General strategies</a></li>
	<li class="toctree-l2"><a class="reference internal" href="#packages-with-optimized-styles">5.3. Packages with optimized styles</a><ul>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_gpu.html">5.3.1. GPU package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_intel.html">5.3.2. USER-INTEL package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_kokkos.html">5.3.3. KOKKOS package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_omp.html">5.3.4. USER-OMP package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_opt.html">5.3.5. OPT package</a></li>
	</ul>
	</li>
	<li class="toctree-l2"><a class="reference internal" href="#comparison-of-various-accelerator-packages">5.4. Comparison of various accelerator packages</a></li>
	</ul>
	</li>
	<li class="toctree-l1"><a class="reference internal" href="Section_howto.html">6. How-to discussions</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_example.html">7. Example problems</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_perf.html">8. Performance &amp; scalability</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_tools.html">9. Additional tools</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_modify.html">10. Modifying &amp; extending LAMMPS</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_python.html">11. Python interface to LAMMPS</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_errors.html">12. Errors</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_history.html">13. Future and history</a></li>
	</ul>
	<p class="caption"><span class="caption-text">Index</span></p>
	<ul>
	<li class="toctree-l1"><a class="reference internal" href="tutorials.html">Tutorials</a></li>
	<li class="toctree-l1"><a class="reference internal" href="commands.html">Commands</a></li>
	<li class="toctree-l1"><a class="reference internal" href="fixes.html">Fixes</a></li>
	<li class="toctree-l1"><a class="reference internal" href="computes.html">Computes</a></li>
	<li class="toctree-l1"><a class="reference internal" href="pairs.html">Pair Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="bonds.html">Bond Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="angles.html">Angle Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="dihedrals.html">Dihedral Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="impropers.html">Improper Styles</a></li>
	</ul>



	</div>

	</nav>

	<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">


	<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
	<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
	<a href="Manual.html">LAMMPS</a>
	</nav>



	<div class="wy-nav-content">
	<div class="rst-content">
	<div role="navigation" aria-label="breadcrumbs navigation">
	<ul class="wy-breadcrumbs">
	<li><a href="Manual.html">Docs</a> »</li>

	<li>5. Accelerating LAMMPS performance</li>
	<li class="wy-breadcrumbs-aside">


	<a href="http://lammps.sandia.gov">Website</a>
	<a href="Section_commands.html#comm">Commands</a>

	</li>
	</ul>
	<hr/>

	<div class="rst-footer-buttons" style="margin-bottom: 1em" role="navigation" aria-label="footer navigation">

	<a href="accelerate_gpu.html" class="btn btn-neutral float-right" title="5.3.1. GPU package" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>


	<a href="Section_packages.html" class="btn btn-neutral" title="4. Packages" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>

	</div>

	</div>
	<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
	<div itemprop="articleBody">

	<div class="section" id="accelerating-lammps-performance">
	<h1>5. Accelerating LAMMPS performance</h1>
	<p>This section describes various methods for improving LAMMPS
	performance for different classes of problems running on different
	kinds of machines.</p>
	<p>There are two thrusts to the discussion that follows. The
	first is using code options that implement alternate algorithms
	that can speed-up a simulation. The second is to use one
	of the several accelerator packages provided with LAMMPS that
	contain code optimized for certain kinds of hardware, including
	multi-core CPUs, GPUs, and Intel Xeon Phi coprocessors.</p>
	<ul class="simple">
	<li>5.1 <a class="reference internal" href="#acc-1"><span class="std std-ref">Measuring performance</span></a></li>
	<li>5.2 <a class="reference internal" href="#acc-2"><span class="std std-ref">Algorithms and code options to boost performance</span></a></li>
	<li>5.3 <a class="reference internal" href="#acc-3"><span class="std std-ref">Accelerator packages with optimized styles</span></a></li>
	<li>5.3.1 <a class="reference internal" href="accelerate_gpu.html"><span class="doc">GPU package</span></a></li>
	<li>5.3.2 <a class="reference internal" href="accelerate_intel.html"><span class="doc">USER-INTEL package</span></a></li>
	<li>5.3.3 <a class="reference internal" href="accelerate_kokkos.html"><span class="doc">KOKKOS package</span></a></li>
	<li>5.3.4 <a class="reference internal" href="accelerate_omp.html"><span class="doc">USER-OMP package</span></a></li>
	<li>5.3.5 <a class="reference internal" href="accelerate_opt.html"><span class="doc">OPT package</span></a></li>
	<li>5.4 <a class="reference internal" href="#acc-4"><span class="std std-ref">Comparison of various accelerator packages</span></a></li>
	</ul>
	<p>The <a class="reference external" href="http://lammps.sandia.gov/bench.html">Benchmark page</a> of the LAMMPS
	web site gives performance results for the various accelerator
	packages discussed in Section 5.3, for several of the standard LAMMPS
	benchmark problems, as a function of problem size and number of
	compute nodes, on different hardware platforms.</p>
	<div class="section" id="measuring-performance">
	<span id="acc-1"></span><h2>5.1. Measuring performance</h2>
	<p>Before trying to make your simulation run faster, you should
	understand how it currently performs and where the bottlenecks are.</p>
	<p>The best way to do this is to run your system (actual number of
	atoms) for a modest number of timesteps (say 100 steps) on several
	different processor counts, including a single processor if possible.
	Do this for an equilibrium version of your system, so that the
	100-step timings are representative of a much longer run. There is
	typically no need to run for 1000s of timesteps to get accurate
	timings; you can simply extrapolate from short runs.</p>
	<p>For the set of runs, look at the timing data printed to the screen and
	log file at the end of each LAMMPS run. <a class="reference internal" href="Section_start.html#start-8"><span class="std std-ref">This section</span></a> of the manual has an overview.</p>
	<p>Running on one (or a few) processors should give a good estimate of
	the serial performance and what portions of the timestep are taking
	the most time. Running the same problem on a few different processor
	counts should give an estimate of parallel scalability. I.e. if the
	simulation runs 16x faster on 16 processors, it’s 100% parallel
	efficient; if it runs 8x faster on 16 processors, it’s 50% efficient.</p>
	<p>The most important data to look at in the timing info is the timing
	breakdown and relative percentages. For example, trying different
	options for speeding up the long-range solvers will have little impact
	if they only consume 10% of the run time. If the pairwise time is
	dominating, you may want to look at GPU or OMP versions of the pair
	style, as discussed below. Comparing how the percentages change as
	you increase the processor count gives you a sense of how different
	operations within the timestep are scaling. Note that if you are
	running with a Kspace solver, there is additional output on the
	breakdown of the Kspace time. For PPPM, this includes the fraction
	spent on FFTs, which can be communication intensive.</p>
	<p>Another important detail in the timing info are the histograms of
	atom counts and neighbor counts. If these vary widely across
	processors, you have a load-imbalance issue. This often results in
	inaccurate relative timing data, because processors have to wait when
	communication occurs for other processors to catch up. Thus the
	reported times for “Communication” or “Other” may be higher than they
	really are, due to load-imbalance. If this is an issue, you can
	uncomment the MPI_Barrier() lines in src/timer.cpp, and recompile
	LAMMPS, to obtain synchronized timings.</p>
	<hr class="docutils" />
	</div>
	<div class="section" id="general-strategies">
	<span id="acc-2"></span><h2>5.2. General strategies</h2>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">This section (5.2) is still a work in progress.</p>
	</div>
	<p>Here is a list of general ideas for improving simulation performance.
	Most of them are only applicable to certain models and certain
	bottlenecks in the current performance, so let the timing data you
	generate be your guide. It is hard, if not impossible, to predict how
	much difference these options will make, since it is a function of
	problem size, number of processors used, and your machine. There is
	no substitute for identifying performance bottlenecks, and trying out
	various options.</p>
	<ul class="simple">
	<li>rRESPA</li>
	<li>2-FFT PPPM</li>
	<li>Staggered PPPM</li>
	<li>single vs double PPPM</li>
	<li>partial charge PPPM</li>
	<li>verlet/split run style</li>
	<li>processor command for proc layout and numa layout</li>
	<li>load-balancing: balance and fix balance</li>
	</ul>
	<p>2-FFT PPPM, also called <em>analytic differentiation</em> or <em>ad</em> PPPM, uses
	2 FFTs instead of the 4 FFTs used by the default <em>ik differentiation</em>
	PPPM. However, 2-FFT PPPM also requires a slightly larger mesh size to
	achieve the same accuracy as 4-FFT PPPM. For problems where the FFT
	cost is the performance bottleneck (typically large problems running
	on many processors), 2-FFT PPPM may be faster than 4-FFT PPPM.</p>
	<p>Staggered PPPM performs calculations using two different meshes, one
	shifted slightly with respect to the other. This can reduce force
	aliasing errors and increase the accuracy of the method, but also
	doubles the amount of work required. For high relative accuracy, using
	staggered PPPM allows one to halve the mesh size in each dimension as
	compared to regular PPPM, which can give around a 4x speedup in the
	kspace time. However, for low relative accuracy, using staggered PPPM
	gives little benefit and can be up to 2x slower in the kspace
	time. For example, the rhodopsin benchmark was run on a single
	processor, and results for kspace time vs. relative accuracy for the
	different methods are shown in the figure below. For this system,
	staggered PPPM (using ik differentiation) becomes useful when using a
	relative accuracy of slightly greater than 1e-5 and above.</p>
	<img alt="Kspace time vs. relative accuracy for the rhodopsin benchmark using the different PPPM methods" class="align-center" src="_images/rhodo_staggered.jpg" />
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">Using staggered PPPM may not give the same increase in accuracy
	of energy and pressure as it does in forces, so some caution must be
	used if energy and/or pressure are quantities of interest, such as
	when using a barostat.</p>
	</div>
	<hr class="docutils" />
	</div>
	<div class="section" id="packages-with-optimized-styles">
	<span id="acc-3"></span><h2>5.3. Packages with optimized styles</h2>
	<p>Accelerated versions of various <a class="reference internal" href="pair_style.html"><span class="doc">pair_style</span></a>,
	<a class="reference internal" href="fix.html"><span class="doc">fixes</span></a>, <a class="reference internal" href="compute.html"><span class="doc">computes</span></a>, and other commands have
	been added to LAMMPS, which will typically run faster than the
	standard non-accelerated versions. Some require appropriate hardware
	to be present on your system, e.g. GPUs or Intel Xeon Phi
	coprocessors.</p>
	<p>All of these commands are in packages provided with LAMMPS. An
	overview of packages is given in <a class="reference internal" href="Section_packages.html"><span class="doc">Section packages</span></a>.</p>
	<p>These are the accelerator packages
	currently in LAMMPS, either as standard or user packages:</p>
	<table border="1" class="docutils">
	<colgroup>
	<col width="46%" />
	<col width="54%" />
	</colgroup>
	<tbody valign="top">
	<tr class="row-odd"><td><a class="reference internal" href="accelerate_gpu.html"><span class="doc">GPU Package</span></a></td>
	<td>for NVIDIA GPUs as well as OpenCL support</td>
	</tr>
	<tr class="row-even"><td><a class="reference internal" href="accelerate_intel.html"><span class="doc">USER-INTEL Package</span></a></td>
	<td>for Intel CPUs and Intel Xeon Phi</td>
	</tr>
	<tr class="row-odd"><td><a class="reference internal" href="accelerate_kokkos.html"><span class="doc">KOKKOS Package</span></a></td>
	<td>for Nvidia GPUs, Intel Xeon Phi, and OpenMP threading</td>
	</tr>
	<tr class="row-even"><td><a class="reference internal" href="accelerate_omp.html"><span class="doc">USER-OMP Package</span></a></td>
	<td>for OpenMP threading and generic CPU optimizations</td>
	</tr>
	<tr class="row-odd"><td><a class="reference internal" href="accelerate_opt.html"><span class="doc">OPT Package</span></a></td>
	<td>generic CPU optimizations</td>
	</tr>
	</tbody>
	</table>
	<div class="toctree-wrapper compound">
	</div>
	<p>Inverting this list, LAMMPS currently has acceleration support for
	three kinds of hardware, via the listed packages:</p>
	<table border="1" class="docutils">
	<colgroup>
	<col width="10%" />
	<col width="90%" />
	</colgroup>
	<tbody valign="top">
	<tr class="row-odd"><td>Many-core CPUs</td>
	<td><a class="reference internal" href="accelerate_intel.html"><span class="doc">USER-INTEL</span></a>, <a class="reference internal" href="accelerate_kokkos.html"><span class="doc">KOKKOS</span></a>, <a class="reference internal" href="accelerate_omp.html"><span class="doc">USER-OMP</span></a>, <a class="reference internal" href="accelerate_opt.html"><span class="doc">OPT</span></a> packages</td>
	</tr>
	<tr class="row-even"><td>NVIDIA GPUs</td>
	<td><a class="reference internal" href="accelerate_gpu.html"><span class="doc">GPU</span></a>, <a class="reference internal" href="accelerate_kokkos.html"><span class="doc">KOKKOS</span></a> packages</td>
	</tr>
	<tr class="row-odd"><td>Intel Phi</td>
	<td><a class="reference internal" href="accelerate_intel.html"><span class="doc">USER-INTEL</span></a>, <a class="reference internal" href="accelerate_kokkos.html"><span class="doc">KOKKOS</span></a> packages</td>
	</tr>
	</tbody>
	</table>
	<p>Which package is fastest for your hardware may depend on the size
	of the problem you are running and what commands (accelerated and
	non-accelerated) are invoked by your input script. While these doc
	pages include performance guidelines, there is no substitute for
	trying out the different packages appropriate to your hardware.</p>
	<p>Any accelerated style has the same name as the corresponding standard
	style, except that a suffix is appended. Otherwise, the syntax for
	the command that uses the style is identical, its functionality is
	the same, and the numerical results it produces should also be the
	same, except for precision and round-off effects.</p>
	<p>For example, all of these styles are accelerated variants of the
	Lennard-Jones <a class="reference internal" href="pair_lj.html"><span class="doc">pair_style lj/cut</span></a>:</p>
	<ul class="simple">
	<li><a class="reference internal" href="pair_lj.html"><span class="doc">pair_style lj/cut/gpu</span></a></li>
	<li><a class="reference internal" href="pair_lj.html"><span class="doc">pair_style lj/cut/intel</span></a></li>
	<li><a class="reference internal" href="pair_lj.html"><span class="doc">pair_style lj/cut/kk</span></a></li>
	<li><a class="reference internal" href="pair_lj.html"><span class="doc">pair_style lj/cut/omp</span></a></li>
	<li><a class="reference internal" href="pair_lj.html"><span class="doc">pair_style lj/cut/opt</span></a></li>
	</ul>
	<p>To see what accelerated styles are currently available, see
	<a class="reference internal" href="Section_commands.html#cmd-5"><span class="std std-ref">Section 3.5</span></a> of the manual. The
	doc pages for individual commands (e.g. <a class="reference internal" href="pair_lj.html"><span class="doc">pair lj/cut</span></a> or
	<a class="reference internal" href="fix_nve.html"><span class="doc">fix nve</span></a>) also list any accelerated variants available
	for that style.</p>
	<p>To use an accelerator package in LAMMPS, and one or more of the styles
	it provides, follow these general steps. Details vary from package to
	package and are explained in the individual accelerator doc pages,
	listed above:</p>
	<table border="1" class="docutils">
	<colgroup>
	<col width="64%" />
	<col width="36%" />
	</colgroup>
	<tbody valign="top">
	<tr class="row-odd"><td>build the accelerator library</td>
	<td>only for GPU package</td>
	</tr>
	<tr class="row-even"><td>install the accelerator package</td>
	<td>make yes-opt, make yes-user-intel, etc</td>
	</tr>
	<tr class="row-odd"><td>add compile/link flags to Makefile.machine in src/MAKE</td>
	<td>only for USER-INTEL, KOKKOS, USER-OMP, OPT packages</td>
	</tr>
	<tr class="row-even"><td>re-build LAMMPS</td>
	<td>make machine</td>
	</tr>
	<tr class="row-odd"><td>prepare and test a regular LAMMPS simulation</td>
	<td>lmp_machine -in in.script; mpirun -np 32 lmp_machine -in in.script</td>
	</tr>
	<tr class="row-even"><td>enable specific accelerator support via ‘-k on’ <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>,</td>
	<td>only needed for KOKKOS package</td>
	</tr>
	<tr class="row-odd"><td>set any needed options for the package via “-pk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> or <a class="reference internal" href="package.html"><span class="doc">package</span></a> command,</td>
	<td>only if defaults need to be changed</td>
	</tr>
	<tr class="row-even"><td>use accelerated styles in your input via “-sf” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> or <a class="reference internal" href="suffix.html"><span class="doc">suffix</span></a> command</td>
	<td>lmp_machine -in in.script -sf gpu</td>
	</tr>
	</tbody>
	</table>
	<p>Note that the first 4 steps can be done as a single command, using the
	src/Make.py tool. This tool is discussed in <a class="reference internal" href="Section_start.html#start-4"><span class="std std-ref">Section 2.4</span></a> of the manual, and its use is
	illustrated in the individual accelerator sections. Typically these
	steps only need to be done once, to create an executable that uses one
	or more accelerator packages.</p>
	<p>The last 4 steps can all be done from the command-line when LAMMPS is
	launched, without changing your input script, as illustrated in the
	individual accelerator sections. Or you can add
	<a class="reference internal" href="package.html"><span class="doc">package</span></a> and <a class="reference internal" href="suffix.html"><span class="doc">suffix</span></a> commands to your input
	script.</p>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">With a few exceptions, you can build a single LAMMPS executable
	with all its accelerator packages installed. Note however that the
	USER-INTEL and KOKKOS packages require you to choose one of their
	hardware options when building for a specific platform. I.e. CPU or
	Phi option for the USER-INTEL package. Or the OpenMP, Cuda, or Phi
	option for the KOKKOS package.</p>
	</div>
	<p>These are the exceptions. You cannot build a single executable with:</p>
	<ul class="simple">
	<li>both the USER-INTEL Phi and KOKKOS Phi options</li>
	<li>the USER-INTEL Phi or KOKKOS Phi option, and the GPU package</li>
	</ul>
	<p>See the examples/accelerate/README and make.list files for sample
	Make.py commands that build LAMMPS with any or all of the accelerator
	packages. As an example, here is a command that builds with all the
	GPU related packages installed (GPU, KOKKOS with Cuda), including
	settings to build the needed auxiliary GPU libraries for Kepler GPUs:</p>
	<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Make</span><span class="o">.</span><span class="n">py</span> <span class="o">-</span><span class="n">j</span> <span class="mi">16</span> <span class="o">-</span><span class="n">p</span> <span class="n">omp</span> <span class="n">gpu</span> <span class="n">kokkos</span> <span class="o">-</span><span class="n">cc</span> <span class="n">nvcc</span> <span class="n">wrap</span><span class="o">=</span><span class="n">mpi</span> <span class="o">-</span><span class="n">gpu</span> <span class="n">mode</span><span class="o">=</span><span class="n">double</span> <span class="n">arch</span><span class="o">=</span><span class="mi">35</span> <span class="o">-</span><span class="n">kokkos</span> <span class="n">cuda</span> <span class="n">arch</span><span class="o">=</span><span class="mi">35</span> <span class="n">lib</span><span class="o">-</span><span class="nb">all</span> <span class="n">file</span> <span class="n">mpi</span>
	</pre></div>
	</div>
	<p>The examples/accelerate directory also has input scripts that can be
	used with all of the accelerator packages. See its README file for
	details.</p>
	<p>Likewise, the bench directory has FERMI and KEPLER and PHI
	sub-directories with Make.py commands and input scripts for using all
	the accelerator packages on various machines. See the README files in
	those dirs.</p>
	<p>As mentioned above, the <a class="reference external" href="http://lammps.sandia.gov/bench.html">Benchmark page</a> of the LAMMPS web site gives
	performance results for the various accelerator packages for several
	of the standard LAMMPS benchmark problems, as a function of problem
	size and number of compute nodes, on different hardware platforms.</p>
	<p>Here is a brief summary of what the various packages provide. Details
	are in the individual accelerator sections.</p>
	<ul class="simple">
	<li>Styles with a “gpu” suffix are part of the GPU package, and can be run
	on NVIDIA GPUs. The speed-up on a GPU depends on a variety of
	factors, discussed in the accelerator sections.</li>
	<li>Styles with an “intel” suffix are part of the USER-INTEL
	package. These styles support vectorized single and mixed precision
	calculations, in addition to full double precision. In extreme cases,
	this can provide speedups over 3.5x on CPUs. The package also
	supports acceleration in “offload” mode to Intel(R) Xeon Phi(TM)
	coprocessors. This can result in additional speedup over 2x depending
	on the hardware configuration.</li>
	<li>Styles with a “kk” suffix are part of the KOKKOS package, and can be
	run using OpenMP on multicore CPUs, on an NVIDIA GPU, or on an Intel
	Xeon Phi in “native” mode. The speed-up depends on a variety of
	factors, as discussed on the KOKKOS accelerator page.</li>
	<li>Styles with an “omp” suffix are part of the USER-OMP package and allow
	a pair-style to be run in multi-threaded mode using OpenMP. This can
	be useful on nodes with high-core counts when using fewer MPI processes
	than cores is advantageous, e.g. when running with PPPM so that FFTs
	are run on fewer MPI processors or when the many MPI tasks would
	overload the available bandwidth for communication.</li>
	<li>Styles with an “opt” suffix are part of the OPT package and typically
	speed-up the pairwise calculations of your simulation by 5-25% on a
	CPU.</li>
	</ul>
	<p>The individual accelerator package doc pages explain:</p>
	<ul class="simple">
	<li>what hardware and software the accelerated package requires</li>
	<li>how to build LAMMPS with the accelerated package</li>
	<li>how to run with the accelerated package either via command-line switches or modifying the input script</li>
	<li>speed-ups to expect</li>
	<li>guidelines for best performance</li>
	<li>restrictions</li>
	</ul>
	<hr class="docutils" />
	</div>
	<div class="section" id="comparison-of-various-accelerator-packages">
	<span id="acc-4"></span><h2>5.4. Comparison of various accelerator packages</h2>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">This section still needs to be re-worked with additional KOKKOS
	and USER-INTEL information.</p>
	</div>
	<p>This section compares and contrasts the various accelerator
	options, since there are multiple ways to perform OpenMP threading,
	run on GPUs, and run on Intel Xeon Phi coprocessors.</p>
	<p>All 3 of these packages accelerate a LAMMPS calculation using NVIDIA
	hardware, but they do it in different ways.</p>
	<p>As a consequence, for a particular simulation on specific hardware,
	one package may be faster than the other. We give guidelines below,
	but the best way to determine which package is faster for your input
	script is to try both of them on your machine. See the benchmarking
	section below for examples where this has been done.</p>
	<p><strong>Guidelines for using each package optimally:</strong></p>
	<ul class="simple">
	<li>The GPU package allows you to assign multiple CPUs (cores) to a single
	GPU (a common configuration for “hybrid” nodes that contain multicore
	CPU(s) and GPU(s)) and works effectively in this mode.</li>
	<li>The GPU package moves per-atom data (coordinates, forces)
	back-and-forth between the CPU and GPU every timestep. The
	KOKKOS/CUDA package only does this on timesteps when a CPU calculation
	is required (e.g. to invoke a fix or compute that is non-GPU-ized).
	Hence, if you can formulate your input script to only use GPU-ized
	fixes and computes, and avoid doing I/O too often (thermo output, dump
	file snapshots, restart files), then the data transfer cost of the
	KOKKOS/CUDA package can be very low, causing it to run faster than the
	GPU package.</li>
	<li>The GPU package is often faster than the KOKKOS/CUDA package, if the
	number of atoms per GPU is smaller. The crossover point, in terms of
	atoms/GPU at which the KOKKOS/CUDA package becomes faster depends
	strongly on the pair style. For example, for a simple Lennard-Jones
	system the crossover (in single precision) is often about 50K-100K
	atoms per GPU. When performing double precision calculations the
	crossover point can be significantly smaller.</li>
	<li>Both packages compute bonded interactions (bonds, angles, etc) on the
	CPU. If the GPU package is running with several MPI processes
	assigned to one GPU, the cost of computing the bonded interactions is
	spread across more CPUs and hence the GPU package can run faster.</li>
	<li>When using the GPU package with multiple CPUs assigned to one GPU, its
	performance depends to some extent on high bandwidth between the CPUs
	and the GPU. Hence its performance is affected if the full 16 PCIe lanes
	are not available for each GPU. In HPC environments this can be the
	case if S2050/70 servers are used, where two devices generally share
	one PCIe 2.0 16x slot. Also many multi-GPU mainboards do not provide
	full 16 lanes to each of the PCIe 2.0 16x slots.</li>
	</ul>
	<p><strong>Differences between the two packages:</strong></p>
	<ul class="simple">
	<li>The GPU package accelerates only pair force, neighbor list, and PPPM
	calculations.</li>
	<li>The GPU package requires neighbor lists to be built on the CPU when using
	exclusion lists, hybrid pair styles, or a triclinic simulation box.</li>
	</ul>
	</div>
	</div>


	</div>
	</div>
	<footer>

	<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">

	<a href="accelerate_gpu.html" class="btn btn-neutral float-right" title="5.3.1. GPU package" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>


	<a href="Section_packages.html" class="btn btn-neutral" title="4. Packages" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>

	</div>


	<hr/>

	<div role="contentinfo">
	<p>
	© Copyright 2013 Sandia Corporation.
	</p>
	</div>
	Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.

	</footer>

	</div>
	</div>

	</section>

	</div>





	<script type="text/javascript">
	var DOCUMENTATION_OPTIONS = {
	URL_ROOT:'./',
	VERSION:'',
	COLLAPSE_INDEX:false,
	FILE_SUFFIX:'.html',
	HAS_SOURCE: true
	};
	</script>
	<script type="text/javascript" src="_static/jquery.js"></script>
	<script type="text/javascript" src="_static/underscore.js"></script>
	<script type="text/javascript" src="_static/doctools.js"></script>
	<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/jquery-1.11.0.min.js"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/lightbox.min.js"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2-customize/jquery-noconflict.js"></script>





	<script type="text/javascript" src="_static/js/theme.js"></script>




	<script type="text/javascript">
	jQuery(function () {
	SphinxRtdTheme.StickyNav.enable();
	});
	</script>


	</body>
	</html>

Section_accelerate.htmlNo OneTemporaryActions

File Metadata

Section_accelerate.htmlView Options

Event Timeline

Section_accelerate.html
No OneTemporary
Actions

Section_accelerate.html
View Options