accelerate_kokkos.html
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, May 25, 13:34

accelerate_kokkos.html
View Options



	<!DOCTYPE html>
	<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
	<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
	<head>
	<meta charset="utf-8">

	<meta name="viewport" content="width=device-width, initial-scale=1.0">

	<title>5.3.3. KOKKOS package — LAMMPS documentation</title>















	<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />



	<link rel="stylesheet" href="_static/sphinxcontrib-images/LightBox2/lightbox2/css/lightbox.css" type="text/css" />



	<link rel="top" title="LAMMPS documentation" href="index.html"/>
	<link rel="up" title="5. Accelerating LAMMPS performance" href="Section_accelerate.html"/>
	<link rel="next" title="5.3.4. USER-OMP package" href="accelerate_omp.html"/>
	<link rel="prev" title="5.3.2. USER-INTEL package" href="accelerate_intel.html"/>


	<script src="_static/js/modernizr.min.js"></script>

	</head>

	<body class="wy-body-for-nav" role="document">

	<div class="wy-grid-for-nav">


	<nav data-toggle="wy-nav-shift" class="wy-nav-side">
	<div class="wy-side-nav-search">



	<a href="Manual.html" class="icon icon-home"> LAMMPS



	</a>


	<div role="search">
	<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
	<input type="text" name="q" placeholder="Search docs" />
	<input type="hidden" name="check_keywords" value="yes" />
	<input type="hidden" name="area" value="default" />
	</form>
	</div>


	</div>

	<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">



	<p class="caption"><span class="caption-text">User Documentation</span></p>
	<ul class="current">
	<li class="toctree-l1"><a class="reference internal" href="Section_intro.html">1. Introduction</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_start.html">2. Getting Started</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_commands.html">3. Commands</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_packages.html">4. Packages</a></li>
	<li class="toctree-l1 current"><a class="reference internal" href="Section_accelerate.html">5. Accelerating LAMMPS performance</a><ul class="current">
	<li class="toctree-l2"><a class="reference internal" href="Section_accelerate.html#measuring-performance">5.1. Measuring performance</a></li>
	<li class="toctree-l2"><a class="reference internal" href="Section_accelerate.html#general-strategies">5.2. General strategies</a></li>
	<li class="toctree-l2 current"><a class="reference internal" href="Section_accelerate.html#packages-with-optimized-styles">5.3. Packages with optimized styles</a><ul class="current">
	<li class="toctree-l3"><a class="reference internal" href="accelerate_gpu.html">5.3.1. GPU package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_intel.html">5.3.2. USER-INTEL package</a></li>
	<li class="toctree-l3 current"><a class="current reference internal" href="#">5.3.3. KOKKOS package</a><ul>
	<li class="toctree-l4"><a class="reference internal" href="#restrictions">5.3.3.1. Restrictions</a></li>
	</ul>
	</li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_omp.html">5.3.4. USER-OMP package</a></li>
	<li class="toctree-l3"><a class="reference internal" href="accelerate_opt.html">5.3.5. OPT package</a></li>
	</ul>
	</li>
	<li class="toctree-l2"><a class="reference internal" href="Section_accelerate.html#comparison-of-various-accelerator-packages">5.4. Comparison of various accelerator packages</a></li>
	</ul>
	</li>
	<li class="toctree-l1"><a class="reference internal" href="Section_howto.html">6. How-to discussions</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_example.html">7. Example problems</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_perf.html">8. Performance &amp; scalability</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_tools.html">9. Additional tools</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_modify.html">10. Modifying &amp; extending LAMMPS</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_python.html">11. Python interface to LAMMPS</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_errors.html">12. Errors</a></li>
	<li class="toctree-l1"><a class="reference internal" href="Section_history.html">13. Future and history</a></li>
	</ul>
	<p class="caption"><span class="caption-text">Index</span></p>
	<ul>
	<li class="toctree-l1"><a class="reference internal" href="tutorials.html">Tutorials</a></li>
	<li class="toctree-l1"><a class="reference internal" href="commands.html">Commands</a></li>
	<li class="toctree-l1"><a class="reference internal" href="fixes.html">Fixes</a></li>
	<li class="toctree-l1"><a class="reference internal" href="computes.html">Computes</a></li>
	<li class="toctree-l1"><a class="reference internal" href="pairs.html">Pair Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="bonds.html">Bond Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="angles.html">Angle Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="dihedrals.html">Dihedral Styles</a></li>
	<li class="toctree-l1"><a class="reference internal" href="impropers.html">Improper Styles</a></li>
	</ul>



	</div>

	</nav>

	<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">


	<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
	<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
	<a href="Manual.html">LAMMPS</a>
	</nav>



	<div class="wy-nav-content">
	<div class="rst-content">
	<div role="navigation" aria-label="breadcrumbs navigation">
	<ul class="wy-breadcrumbs">
	<li><a href="Manual.html">Docs</a> »</li>

	<li><a href="Section_accelerate.html">5. Accelerating LAMMPS performance</a> »</li>

	<li>5.3.3. KOKKOS package</li>
	<li class="wy-breadcrumbs-aside">


	<a href="http://lammps.sandia.gov">Website</a>
	<a href="Section_commands.html#comm">Commands</a>

	</li>
	</ul>
	<hr/>

	<div class="rst-footer-buttons" style="margin-bottom: 1em" role="navigation" aria-label="footer navigation">

	<a href="accelerate_omp.html" class="btn btn-neutral float-right" title="5.3.4. USER-OMP package" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>


	<a href="accelerate_intel.html" class="btn btn-neutral" title="5.3.2. USER-INTEL package" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>

	</div>

	</div>
	<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
	<div itemprop="articleBody">

	<p><a class="reference internal" href="Section_accelerate.html"><span class="doc">Return to Section accelerate overview</span></a></p>
	<div class="section" id="kokkos-package">
	<h1>5.3.3. KOKKOS package</h1>
	<p>The KOKKOS package was developed primarily by Christian Trott (Sandia)
	with contributions of various styles by others, including Sikandar
	Mashayak (UIUC), Stan Moore (Sandia), and Ray Shan (Sandia). The
	underlying Kokkos library was written primarily by Carter Edwards,
	Christian Trott, and Dan Sunderland (all Sandia).</p>
	<p>The KOKKOS package contains versions of pair, fix, and atom styles
	that use data structures and macros provided by the Kokkos library,
	which is included with LAMMPS in lib/kokkos.</p>
	<p>The Kokkos library is part of
	<a class="reference external" href="http://trilinos.sandia.gov/packages/kokkos">Trilinos</a> and can also be
	downloaded from <a class="reference external" href="https://github.com/kokkos/kokkos">Github</a>. Kokkos is a
	templated C++ library that provides two key abstractions for an
	application like LAMMPS. First, it allows a single implementation of
	an application kernel (e.g. a pair style) to run efficiently on
	different kinds of hardware, such as a GPU, Intel Phi, or many-core
	CPU.</p>
	<p>The Kokkos library also provides data abstractions to adjust (at
	compile time) the memory layout of basic data structures like 2d and
	3d arrays and allow the transparent utilization of special hardware
	load and store operations. Such data structures are used in LAMMPS to
	store atom coordinates or forces or neighbor lists. The layout is
	chosen to optimize performance on different platforms. Again this
	functionality is hidden from the developer, and does not affect how
	the kernel is coded.</p>
	<p>These abstractions are set at build time, when LAMMPS is compiled with
	the KOKKOS package installed. All Kokkos operations occur within the
	context of an individual MPI task running on a single node of the
	machine. The total number of MPI tasks used by LAMMPS (one or
	multiple per compute node) is set in the usual manner via the mpirun
	or mpiexec commands, and is independent of Kokkos.</p>
	<p>Kokkos currently provides support for 3 modes of execution (per MPI
	task). These are OpenMP (for many-core CPUs), Cuda (for NVIDIA GPUs),
	and OpenMP (for Intel Phi). Note that the KOKKOS package supports
	running on the Phi in native mode, not offload mode like the
	USER-INTEL package supports. You choose the mode at build time to
	produce an executable compatible with specific hardware.</p>
	<p>Here is a quick overview of how to use the KOKKOS package
	for CPU acceleration, assuming one or more 16-core nodes.
	More details follow.</p>
	<pre class="literal-block">
	use a C++11 compatible compiler
	make yes-kokkos
	make mpi KOKKOS_DEVICES=OpenMP # build with the KOKKOS package
	make kokkos_omp # or Makefile.kokkos_omp already has variable set
	Make.py -v -p kokkos -kokkos omp -o mpi -a file mpi # or one-line build via Make.py
	</pre>
	<pre class="literal-block">
	mpirun -np 16 lmp_mpi -k on -sf kk -in in.lj # 1 node, 16 MPI tasks/node, no threads
	mpirun -np 2 -ppn 1 lmp_mpi -k on t 16 -sf kk -in in.lj # 2 nodes, 1 MPI task/node, 16 threads/task
	mpirun -np 2 lmp_mpi -k on t 8 -sf kk -in in.lj # 1 node, 2 MPI tasks/node, 8 threads/task
	mpirun -np 32 -ppn 4 lmp_mpi -k on t 4 -sf kk -in in.lj # 8 nodes, 4 MPI tasks/node, 4 threads/task
	</pre>
	<ul class="simple">
	<li>specify variables and settings in your Makefile.machine that enable OpenMP, GPU, or Phi support</li>
	<li>include the KOKKOS package and build LAMMPS</li>
	<li>enable the KOKKOS package and its hardware options via the “-k on” command-line switch</li>
	<li>use KOKKOS styles in your input script</li>
	</ul>
	<p>Here is a quick overview of how to use the KOKKOS package for GPUs,
	assuming one or more nodes, each with 16 cores and a GPU. More
	details follow.</p>
	<p>discuss use of NVCC, which Makefiles to examine</p>
	<pre class="literal-block">
	use a C++11 compatible compiler
	KOKKOS_DEVICES = Cuda, OpenMP
	KOKKOS_ARCH = Kepler35
	make yes-kokkos
	make machine
	Make.py -p kokkos -kokkos cuda arch=31 -o kokkos_cuda -a file kokkos_cuda
	</pre>
	<pre class="literal-block">
	mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj # one MPI task, 6 threads on CPU
	mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj # ditto on 4 nodes
	</pre>
	<pre class="literal-block">
	mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj # two MPI tasks, 8 threads per CPU
	mpirun -np 32 -ppn 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj # ditto on 16 nodes
	</pre>
	<p>Here is a quick overview of how to use the KOKKOS package
	for the Intel Phi:</p>
	<pre class="literal-block">
	use a C++11 compatible compiler
	KOKKOS_DEVICES = OpenMP
	KOKKOS_ARCH = KNC
	make yes-kokkos
	make machine
	Make.py -p kokkos -kokkos phi -o kokkos_phi -a file mpi
	</pre>
	<pre class="literal-block">
	host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading):
	mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj # 1 MPI task on 1 Phi, 1*240 = 240
	mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj # 30 MPI tasks on 1 Phi, 30*8 = 240
	mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj # 12 MPI tasks on 1 Phi, 12*20 = 240
	mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj # ditto on 8 Phis
	</pre>
	<p><strong>Required hardware/software:</strong></p>
	<p>Kokkos support within LAMMPS must be built with a C++11 compatible
	compiler. If using gcc, version 4.8.1 or later is required.</p>
	<p>To build with Kokkos support for CPUs, your compiler must support the
	OpenMP interface. You should have one or more multi-core CPUs so that
	multiple threads can be launched by each MPI task running on a CPU.</p>
	<p>To build with Kokkos support for NVIDIA GPUs, NVIDIA Cuda software
	version 6.5 or later must be installed on your system. See the
	discussion for the <a class="reference internal" href="accelerate_gpu.html"><span class="doc">GPU</span></a> package for details of
	how to check and do this.</p>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">For good performance of the KOKKOS package on GPUs, you must
	have Kepler generation GPUs (or later). The Kokkos library exploits
	texture cache options not supported by Tesla generation GPUs (or
	older).</p>
	</div>
	<p>To build with Kokkos support for Intel Xeon Phi coprocessors, your
	system must be configured to use them in “native” mode, not “offload”
	mode like the USER-INTEL package supports.</p>
	<p><strong>Building LAMMPS with the KOKKOS package:</strong></p>
	<p>You must choose at build time whether to build for CPUs (OpenMP),
	GPUs, or Phi.</p>
	<p>You can do any of these in one line, using the src/Make.py script,
	described in <a class="reference internal" href="Section_start.html#start-4"><span class="std std-ref">Section 2.4</span></a> of the manual.
	Type “Make.py -h” for help. If run from the src directory, these
	commands will create src/lmp_kokkos_omp, lmp_kokkos_cuda, and
	lmp_kokkos_phi. Note that the OMP and PHI options use
	src/MAKE/Makefile.mpi as the starting Makefile.machine. The CUDA
	option uses src/MAKE/OPTIONS/Makefile.kokkos_cuda.</p>
	<p>The latter two steps can be done using the “-k on”, “-pk kokkos” and
	“-sf kk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switches</span></a>
	respectively. Or the effect of the “-pk” or “-sf” switches can be
	duplicated by adding the <a class="reference internal" href="package.html"><span class="doc">package kokkos</span></a> or <a class="reference internal" href="suffix.html"><span class="doc">suffix kk</span></a> commands respectively to your input script.</p>
	<p>Or you can follow these steps:</p>
	<p>CPU-only (run all-MPI or with OpenMP threading):</p>
	<pre class="literal-block">
	cd lammps/src
	make yes-kokkos
	make g++ KOKKOS_DEVICES=OpenMP
	</pre>
	<p>Intel Xeon Phi:</p>
	<pre class="literal-block">
	cd lammps/src
	make yes-kokkos
	make g++ KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=KNC
	</pre>
	<p>CPUs and GPUs:</p>
	<pre class="literal-block">
	cd lammps/src
	make yes-kokkos
	make cuda KOKKOS_DEVICES=Cuda
	</pre>
	<p>These examples set the KOKKOS-specific OMP, MIC, CUDA variables on the
	make command line which requires a GNU-compatible make command. Try
	“gmake” if your system’s standard make complains.</p>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">If you build using make line variables and re-build LAMMPS twice
	with different KOKKOS options and the same target, e.g. g++ in the
	first two examples above, then you must perform a “make clean-all”
	or “make clean-machine” before each build. This is to force all the
	KOKKOS-dependent files to be re-compiled with the new options.</p>
	</div>
	<p>You can also hardwire these make variables in the specified machine
	makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above,
	with a line like:</p>
	<pre class="literal-block">
	KOKKOS_ARCH = KNC
	</pre>
	<p>Note that if you build LAMMPS multiple times in this manner, using
	different KOKKOS options (defined in different machine makefiles), you
	do not have to worry about doing a “clean” in between. This is
	because the targets will be different.</p>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">The 3rd example above for a GPU, uses a different machine
	makefile, in this case src/MAKE/Makefile.cuda, which is included in
	the LAMMPS distribution. To build the KOKKOS package for a GPU, this
	makefile must use the NVIDIA “nvcc” compiler. And it must have a
	KOKKOS_ARCH setting that is appropriate for your NVIDIA hardware and
	installed software. Typical values for KOKKOS_ARCH are given below,
	as well as other settings that must be included in the machine
	makefile, if you create your own.</p>
	</div>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">Currently, there are no precision options with the KOKKOS
	package. All compilation and computation is performed in double
	precision.</p>
	</div>
	<p>There are other allowed options when building with the KOKKOS package.
	As above, they can be set either as variables on the make command line
	or in Makefile.machine. This is the full list of options, including
	those discussed above. Each takes a value shown below. The
	default value is listed, which is set in the
	lib/kokkos/Makefile.kokkos file.</p>
	<p>#Default settings specific options
	#Options: force_uvm,use_ldg,rdc</p>
	<ul class="simple">
	<li>KOKKOS_DEVICES, values = <em>OpenMP</em>, <em>Serial</em>, <em>Pthreads</em>, <em>Cuda</em>, default = <em>OpenMP</em></li>
	<li>KOKKOS_ARCH, values = <em>KNC</em>, <em>SNB</em>, <em>HSW</em>, <em>Kepler</em>, <em>Kepler30</em>, <em>Kepler32</em>, <em>Kepler35</em>, <em>Kepler37</em>, <em>Maxwell</em>, <em>Maxwell50</em>, <em>Maxwell52</em>, <em>Maxwell53</em>, <em>ARMv8</em>, <em>BGQ</em>, <em>Power7</em>, <em>Power8</em>, default = <em>none</em></li>
	<li>KOKKOS_DEBUG, values = <em>yes</em>, <em>no</em>, default = <em>no</em></li>
	<li>KOKKOS_USE_TPLS, values = <em>hwloc</em>, <em>librt</em>, default = <em>none</em></li>
	<li>KOKKOS_CUDA_OPTIONS, values = <em>force_uvm</em>, <em>use_ldg</em>, <em>rdc</em></li>
	</ul>
	<p>KOKKOS_DEVICES sets the parallelization method used for Kokkos code
	(within LAMMPS). KOKKOS_DEVICES=OpenMP means that OpenMP will be
	used. KOKKOS_DEVICES=Pthreads means that pthreads will be used.
	KOKKOS_DEVICES=Cuda means an NVIDIA GPU running CUDA will be used.</p>
	<p>If KOKKOS_DEVICES=Cuda, then the low-level Makefile in the src/MAKE
	directory must use “nvcc” as its compiler, via its CC setting. For
	best performance its CCFLAGS setting should use -O3 and have a
	KOKKOS_ARCH setting that matches the compute capability of your NVIDIA
	hardware and software installation, e.g. KOKKOS_ARCH=Kepler30. Note
	the minimal required compute capability is 2.0, but this will give
	significantly reduced performance compared to Kepler generation GPUs
	with compute capability 3.x. For the LINK setting, “nvcc” should not
	be used; instead use g++ or another compiler suitable for linking C++
	applications. Often you will want to use your MPI compiler wrapper
	for this setting (i.e. mpicxx). Finally, the low-level Makefile must
	also have a “Compilation rule” for creating .o files from .cu files.
	See src/Makefile.cuda for an example of a low-level Makefile with all
	of these settings.</p>
	<p>KOKKOS_USE_TPLS=hwloc binds threads to hardware cores, so they do not
	migrate during a simulation. KOKKOS_USE_TPLS=hwloc should always be
	used if running with KOKKOS_DEVICES=Pthreads for pthreads. It is not
	necessary for KOKKOS_DEVICES=OpenMP for OpenMP, because OpenMP
	provides alternative methods via environment variables for binding
	threads to hardware cores. More info on binding threads to cores is
	given in <a class="reference internal" href="Section_accelerate.html#acc-3"><span class="std std-ref">this section</span></a>.</p>
	<p>KOKKOS_ARCH=KNC enables compiler switches needed when compiling for an
	Intel Phi processor.</p>
	<p>KOKKOS_USE_TPLS=librt enables use of a more accurate timer mechanism
	on most Unix platforms. This library is not available on all
	platforms.</p>
	<p>KOKKOS_DEBUG is only useful when developing a Kokkos-enabled style
	within LAMMPS. KOKKOS_DEBUG=yes enables printing of run-time
	debugging information that can be useful. It also enables runtime
	bounds checking on Kokkos data structures.</p>
	<p>KOKKOS_CUDA_OPTIONS are additional options for CUDA.</p>
	<p>For more information on Kokkos see the Kokkos programmers’ guide here:
	/lib/kokkos/doc/Kokkos_PG.pdf.</p>
	<p><strong>Run with the KOKKOS package from the command line:</strong></p>
	<p>The mpirun or mpiexec command sets the total number of MPI tasks used
	by LAMMPS (one or multiple per compute node) and the number of MPI
	tasks used per node. E.g. the mpirun command in MPICH does this via
	its -np and -ppn switches. Ditto for OpenMPI via -np and -npernode.</p>
	<p>When using KOKKOS built with host=OMP, you need to choose how many
	OpenMP threads per MPI task will be used (via the “-k” command-line
	switch discussed below). Note that the product of MPI tasks * OpenMP
	threads/task should not exceed the physical number of cores (on a
	node), otherwise performance will suffer.</p>
	<p>When using the KOKKOS package built with device=CUDA, you must use
	exactly one MPI task per physical GPU.</p>
	<p>When using the KOKKOS package built with host=MIC for Intel Xeon Phi
	coprocessor support you need to ensure there are one or more MPI tasks
	per coprocessor, and choose the number of coprocessor threads to use
	per MPI task (via the “-k” command-line switch discussed below). The
	product of MPI tasks * coprocessor threads/task should not exceed the
	maximum number of threads the coprocessor is designed to run,
	otherwise performance will suffer. This value is 240 for current
	generation Xeon Phi(TM) chips, which is 60 physical cores * 4
	threads/core. Note that with the KOKKOS package you do not need to
	specify how many Phi coprocessors there are per node; each
	coprocessor is simply treated as running some number of MPI tasks.</p>
	<p>You must use the “-k on” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> to enable the KOKKOS package. It
	takes additional arguments for hardware settings appropriate to your
	system. Those arguments are <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">documented here</span></a>. The two most commonly used
	options are:</p>
	<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o">-</span><span class="n">k</span> <span class="n">on</span> <span class="n">t</span> <span class="n">Nt</span> <span class="n">g</span> <span class="n">Ng</span>
	</pre></div>
	</div>
	<p>The “t Nt” option applies to host=OMP (even if device=CUDA) and
	host=MIC. For host=OMP, it specifies how many OpenMP threads per MPI
	task to use within a node. For host=MIC, it specifies how many Xeon Phi
	threads per MPI task to use within a node. The default is Nt = 1.
	Note that for host=OMP this is effectively MPI-only mode which may be
	fine. But for host=MIC you will typically end up using far less than
	all the 240 available threads, which could give very poor performance.</p>
	<p>The “g Ng” option applies to device=CUDA. It specifies how many GPUs
	per compute node to use. The default is 1, so this only needs to be
	specified if you have 2 or more GPUs per compute node.</p>
	<p>The “-k on” switch also issues a “package kokkos” command (with no
	additional arguments) which sets various KOKKOS options to default
	values, as discussed on the <a class="reference internal" href="package.html"><span class="doc">package</span></a> command doc page.</p>
	<p>Use the “-sf kk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>,
	which will automatically append “kk” to styles that support it. Use
	the “-pk kokkos” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> if
	you wish to change any of the default <a class="reference internal" href="package.html"><span class="doc">package kokkos</span></a>
	options set by the “-k on” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
	<p>Note that the default for the <a class="reference internal" href="package.html"><span class="doc">package kokkos</span></a> command is
	to use “full” neighbor lists and set the Newton flag to “off” for both
	pairwise and bonded interactions. This typically gives fastest
	performance. If the <a class="reference internal" href="newton.html"><span class="doc">newton</span></a> command is used in the input
	script, it can override the Newton flag defaults.</p>
	<p>However, when running in MPI-only mode with 1 thread per MPI task, it
	will typically be faster to use “half” neighbor lists and set the
	Newton flag to “on”, just as is the case for non-accelerated pair
	styles. You can do this with the “-pk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
	<p><strong>Or run with the KOKKOS package by editing an input script:</strong></p>
	<p>The discussion above for the mpirun/mpiexec command and setting
	appropriate thread and GPU values for host=OMP or host=MIC or
	device=CUDA are the same.</p>
	<p>You must still use the “-k on” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> to enable the KOKKOS package, and
	specify its additional arguments for hardware options appropriate to
	your system, as documented above.</p>
	<p>Use the <a class="reference internal" href="suffix.html"><span class="doc">suffix kk</span></a> command, or you can explicitly add a
	“kk” suffix to individual styles in your input script, e.g.</p>
	<pre class="literal-block">
	pair_style lj/cut/kk 2.5
	</pre>
	<p>You only need to use the <a class="reference internal" href="package.html"><span class="doc">package kokkos</span></a> command if you
	wish to change any of its option defaults, as set by the “-k on”
	<a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
	<p><strong>Speed-ups to expect:</strong></p>
	<p>The performance of KOKKOS running in different modes is a function of
	your hardware, which KOKKOS-enabled styles are used, and the problem
	size.</p>
	<p>Generally speaking, the following rules of thumb apply:</p>
	<ul class="simple">
	<li>When running on CPUs only, with a single thread per MPI task,
	performance of a KOKKOS style is somewhere between the standard
	(un-accelerated) styles (MPI-only mode), and those provided by the
	USER-OMP package. However the difference between all 3 is small (less
	than 20%).</li>
	<li>When running on CPUs only, with multiple threads per MPI task,
	performance of a KOKKOS style is a bit slower than the USER-OMP
	package.</li>
	<li>When running a large number of atoms per GPU, KOKKOS is typically faster
	than the GPU package.</li>
	<li>When running on Intel Xeon Phi, KOKKOS is not as fast as
	the USER-INTEL package, which is optimized for that hardware.</li>
	</ul>
	<p>See the <a class="reference external" href="http://lammps.sandia.gov/bench.html">Benchmark page</a> of the
	LAMMPS web site for performance of the KOKKOS package on different
	hardware.</p>
	<p><strong>Guidelines for best performance:</strong></p>
	<p>Here are guidelines for using the KOKKOS package on the different
	hardware configurations listed above.</p>
	<p>Many of the guidelines use the <a class="reference internal" href="package.html"><span class="doc">package kokkos</span></a> command.
	See its doc page for details and default settings. Experimenting with
	its options can provide a speed-up for specific calculations.</p>
	<p><strong>Running on a multi-core CPU:</strong></p>
	<p>If N is the number of physical cores/node, then the number of MPI
	tasks/node * number of threads/task should not exceed N, and should
	typically equal N. Note that the default threads/task is 1, as set by
	the “t” keyword of the “-k” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>. If you do not change this, no
	additional parallelism (beyond MPI) will be invoked on the host
	CPU(s).</p>
	<p>You can compare the performance running in different modes:</p>
	<ul class="simple">
	<li>run with 1 MPI task/node and N threads/task</li>
	<li>run with N MPI tasks/node and 1 thread/task</li>
	<li>run with settings in between these extremes</li>
	</ul>
	<p>Examples of mpirun commands in these modes are shown above.</p>
	<p>When using KOKKOS to perform multi-threading, it is important for
	performance to bind both MPI tasks to physical cores, and threads to
	physical cores, so they do not migrate during a simulation.</p>
	<p>If you are not certain MPI tasks are being bound (check the defaults
	for your MPI installation), binding can be forced with these flags:</p>
	<pre class="literal-block">
	OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
	Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ...
	</pre>
	<p>For binding threads with the KOKKOS OMP option, use thread affinity
	environment variables to force binding. With OpenMP 3.1 (gcc 4.7 or
	later, intel 12 or later) setting the environment variable
	OMP_PROC_BIND=true should be sufficient. For binding threads with the
	KOKKOS pthreads option, compile LAMMPS with the KOKKOS HWLOC=yes option, as
	discussed in <a class="reference internal" href="Section_start.html#start-3-4"><span class="std std-ref">Section 2.3.4</span></a> of the
	manual.</p>
	<p><strong>Running on GPUs:</strong></p>
	<p>Ensure the -arch setting in the machine makefile you are using,
	e.g. src/MAKE/Makefile.cuda, is correct for your GPU hardware/software
	(see <a class="reference internal" href="Section_start.html#start-3-4"><span class="std std-ref">this section</span></a> of the manual for
	details).</p>
	<p>The -np setting of the mpirun command should set the number of MPI
	tasks/node to be equal to the # of physical GPUs on the node.</p>
	<p>Use the “-k” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> to
	specify the number of GPUs per node, and the number of threads per MPI
	task. As above for multi-core CPUs (and no GPU), if N is the number
	of physical cores/node, then the number of MPI tasks/node * number of
	threads/task should not exceed N. With one GPU (and one MPI task) it
	may be faster to use less than all the available cores, by setting
	threads/task to a smaller value. This is because using all the cores
	on a dual-socket node will incur extra cost to copy memory from the
	2nd socket to the GPU.</p>
	<p>Examples of mpirun commands that follow these rules are shown above.</p>
	<div class="admonition note">
	<p class="first admonition-title">Note</p>
	<p class="last">When using a GPU, you will achieve the best performance if your
	input script does not use any fix or compute styles which are not yet
	Kokkos-enabled. This allows data to stay on the GPU for multiple
	timesteps, without being copied back to the host CPU. Invoking a
	non-Kokkos fix or compute, or performing I/O for
	<a class="reference internal" href="thermo_style.html"><span class="doc">thermo</span></a> or <a class="reference internal" href="dump.html"><span class="doc">dump</span></a> output will cause data
	to be copied back to the CPU.</p>
	</div>
	<p>You cannot yet assign multiple MPI tasks to the same GPU with the
	KOKKOS package. We plan to support this in the future, similar to the
	GPU package in LAMMPS.</p>
	<p>You cannot yet use both the host (multi-threaded) and device (GPU)
	together to compute pairwise interactions with the KOKKOS package. We
	hope to support this in the future, similar to the GPU package in
	LAMMPS.</p>
	<p><strong>Running on an Intel Phi:</strong></p>
	<p>Kokkos only uses Intel Phi processors in their “native” mode, i.e.
	not hosted by a CPU.</p>
	<p>As illustrated above, build LAMMPS with OMP=yes (the default) and
	MIC=yes. The latter ensures code is correctly compiled for the Intel
	Phi. The OMP setting means OpenMP will be used for parallelization on
	the Phi, which is currently the best option within Kokkos. In the
	future, other options may be added.</p>
	<p>Current-generation Intel Phi chips have either 61 or 57 cores. One
	core should be excluded for running the OS, leaving 60 or 56 cores.
	Each core is 4-way hyperthreaded, so there are effectively N = 240 (4*60) or
	N = 224 (4*56) logical cores to run on.</p>
	<p>The -np setting of the mpirun command sets the number of MPI
	tasks/node. The “-k on t Nt” command-line switch sets the number of
	threads/task as Nt. The product of these 2 values should be N, i.e.
	240 or 224. Also, the number of threads/task should be a multiple of
	4 so that logical threads from more than one MPI task do not run on
	the same physical core.</p>
	<p>Examples of mpirun commands that follow these rules are shown above.</p>
	<div class="section" id="restrictions">
	<h2>5.3.3.1. Restrictions</h2>
	<p>As noted above, if using GPUs, the number of MPI tasks per compute
	node should be equal to the number of GPUs per compute node. In the
	future Kokkos will support assigning multiple MPI tasks to a single
	GPU.</p>
	<p>Currently Kokkos does not support AMD GPUs due to limits in the
	available backend programming models. Specifically, Kokkos requires
	extensive C++ support from the kernel language. This is expected to
	change in the future.</p>
	</div>
	</div>


	</div>
	</div>
	<footer>

	<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">

	<a href="accelerate_omp.html" class="btn btn-neutral float-right" title="5.3.4. USER-OMP package" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>


	<a href="accelerate_intel.html" class="btn btn-neutral" title="5.3.2. USER-INTEL package" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>

	</div>


	<hr/>

	<div role="contentinfo">
	<p>
	© Copyright 2013 Sandia Corporation.
	</p>
	</div>
	Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.

	</footer>

	</div>
	</div>

	</section>

	</div>





	<script type="text/javascript">
	var DOCUMENTATION_OPTIONS = {
	URL_ROOT:'./',
	VERSION:'',
	COLLAPSE_INDEX:false,
	FILE_SUFFIX:'.html',
	HAS_SOURCE: true
	};
	</script>
	<script type="text/javascript" src="_static/jquery.js"></script>
	<script type="text/javascript" src="_static/underscore.js"></script>
	<script type="text/javascript" src="_static/doctools.js"></script>
	<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/jquery-1.11.0.min.js"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/lightbox.min.js"></script>
	<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2-customize/jquery-noconflict.js"></script>





	<script type="text/javascript" src="_static/js/theme.js"></script>




	<script type="text/javascript">
	jQuery(function () {
	SphinxRtdTheme.StickyNav.enable();
	});
	</script>


	</body>
	</html>

accelerate_kokkos.htmlNo OneTemporaryActions

File Metadata

accelerate_kokkos.htmlView Options

Event Timeline

accelerate_kokkos.html
No OneTemporary
Actions

accelerate_kokkos.html
View Options