#############################################################################
# Benchmarks
#
# in.intel.lj -	        Atomic fluid (LJ Benchmark)
# in.intel.rhodo -      Protein (Rhodopsin Benchmark)
# in.intel.lc -	        Liquid Crystal w/ Gay-Berne potential
# in.intel.eam -	Copper benchmark with Embedded Atom Method
# in.intel.sw -	        Silicon benchmark with Stillinger-Weber
# in.intel.tersoff -    Silicon benchmark with Tersoff
# in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
# in.intel.airebo -     Polyethylene benchmark with AIREBO
# in.intel.dpd -        Dissipative Particle Dynamics
#
#############################################################################

#############################################################################
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
#  - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
#
#                     Xeon E5-2697v4     Xeon Phi 7250    Xeon Gold 6148
#                    
# in.intel.lj -            199.5               282.3            317.3
# in.intel.rhodo -          12.4                17.5             24.4
# in.intel.lc -	            19.0                25.7             26.8
# in.intel.eam -            59.4                92.8            105.6 
# in.intel.sw -	           132.4               161.9            213.8
# in.intel.tersoff -        83.3               101.1            109.6
# in.intel.water -          53.4                90.3            105.5
# in.intel.airebo -          7.3                11.8             17.6
# in.intel.dpd -            74.5               100.4            148.1
#
#############################################################################

#############################################################################
# For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README
# for build flags that should be used. 
#############################################################################

#############################################################################
# For Haswell (Xeon v3) architectures, depending on the compiler version, 
# it may give better performance to compile for an AVX target (with -xAVX 
# compiler option) instead of -xHost or -xCORE-AVX2 for some of the 
# workloads. In most cases, FMA sensitive routines will still use AVX2 
# (MKL and SVML detect the processor at runtime). For Broadwell (Xeon v4)
# architectures, -xCORE-AVX2 or -xHost will work best for all.
#############################################################################

#############################################################################
# The default benchmark timesteps will run between 30s and 1 minute with
# the Intel package. You can specify a multiplier for all of the benchmarks
# to increase or decrease the runtime. Example commandline arguments:
#
# -v m 2		# Run for twice as long
# -v m 0.5		# Run for half as long
#############################################################################

#############################################################################
# The LAMMPS newton setting can be controlled from the commandline for the
# benchmarks with the N variable:
#
# -v N on		# newton on
# -v N off		# newton off
#
# The default is on for all of the benchmarks except for LJ, where the off
# setting performs best with the USER-INTEL package.
#############################################################################

#	Example for running benchmarks (see run_benchmarks.sh for script):

#      Number of physical cores per node, not including hyperthreads
export LMP_CORES=28

#      If hyperthreading is enabled, number of hyperthreads to use per core
#      (2 for Xeon; 2 or 4 for Xeon Phi)
export OMP_NUM_THREADS=2

#      Name of the LAMMPS binary
export LMP_BIN=../../lmp_intel_cpu

#      LAMMPS root directory
export LMP_ROOT=../../../

#      Load the Intel Parallel Studio environment script (adjust the path to
#      match the local installation), then set the OpenMP / Intel MPI runtime
#      variables used by the benchmark commands
source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
export KMP_BLOCKTIME=0
export I_MPI_PIN_DOMAIN=core
export I_MPI_FABRICS=shm		# For single node

# ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS
export I_MPI_SHM_LMT=shm

#      Generate the restart file for use with liquid crystal benchmark
#      (expansions are quoted so paths containing spaces stay one argument)
mpirun -np "$LMP_CORES" "$LMP_BIN" -in in.lc_generate_restart -log none

#      Benchmark to run (input script passed to -in by the commands below)
export bench=in.intel.lj

#############################################################################
# For Intel Xeon Phi x200 series processors, best performance is achieved by
# using MCDRAM. In flat mode, this can be achieved with numactl,
# MPI environment variables, or other options provided by batch schedulers.
#############################################################################

#############################################################################
# To run without an optimization package
#############################################################################
mpirun -np "$LMP_CORES" "$LMP_BIN" -in "$bench" -log none -v N on

#############################################################################
# To run with USER-OMP package
#############################################################################
mpirun -np "$LMP_CORES" "$LMP_BIN" -in "$bench" -log none \
  -pk omp 0 -sf omp -v N on

#############################################################################
# To run with USER-INTEL package and no coprocessor
#############################################################################
mpirun -np "$LMP_CORES" "$LMP_BIN" -in "$bench" -log none \
  -pk intel 0 -sf intel

#############################################################################
# To run with USER-INTEL and automatic load balancing to 1 coprocessor
#############################################################################
mpirun -np "$LMP_CORES" "$LMP_BIN" -in "$bench" -log none \
  -pk intel 1 -sf intel

#############################################################################
# If using PPPM (e.g. in.intel.rhodo) on Intel Xeon Phi x200 series
#   or Skylake processors
#############################################################################
export KMP_AFFINITY=none
# Pass one fewer OpenMP thread than OMP_NUM_THREADS to the intel package
# when "lrt yes" is requested
rthreads=$((OMP_NUM_THREADS-1))
mpirun -np "$LMP_CORES" "$LMP_BIN" -in "$bench" -log none \
  -pk intel 0 omp "$rthreads" lrt yes -sf intel