#############################################################################
# Benchmarks
#
# in.intel.lj -	        Atomic fluid (LJ Benchmark)
# in.intel.rhodo -      Protein (Rhodopsin Benchmark)
# in.intel.lc -	        Liquid Crystal w/ Gay-Berne potential
# in.intel.sw -	        Silicon benchmark with Stillinger-Weber
# in.intel.tersoff -    Silicon benchmark with Tersoff
# in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
#
#############################################################################

#############################################################################
# Expected Timesteps/second with turbo on and HT enabled, LAMMPS 18-Jun-2016
#
#                     Xeon E5-2697v4     Xeon Phi 7250
#                    
# in.intel.lj -          162.764             179.148
# in.intel.rhodo -        11.633              13.668
# in.intel.lc -	          19.136              24.863
# in.intel.sw -	         139.048             152.026
# in.intel.tersoff -      82.663              92.985
# in.intel.water -        59.838              85.704
#
#############################################################################

#############################################################################
# For Haswell (Xeon v3) architectures, depending on the compiler version, 
# it may give better performance to compile for an AVX target (with -xAVX 
# compiler option) instead of -xHost or -xCORE-AVX2 for some of the 
# workloads. In most cases, FMA-sensitive routines will still use AVX2
# (MKL and SVML detect the processor at runtime). For Broadwell (Xeon v4)
# architectures, -xCORE-AVX2 or -xHost will work best for all workloads.
#############################################################################

#############################################################################
# With the default number of timesteps, each benchmark runs for between
# 30 seconds and 1 minute with the Intel package. You can specify a
# multiplier for all of the benchmarks to increase or decrease the runtime.
# Example commandline arguments:
#
# -v m 2		# Run for twice as long
# -v m 0.5		# Run for half as long
#############################################################################

#	Example for running benchmarks:

# 	Number of physical cores per node not including hyperthreads
export LMP_CORES=28

#      If hyperthreading is enabled, number of hyperthreads to use per core
#      (2 for Xeon; 2 or 4 for Xeon Phi)
export OMP_NUM_THREADS=2
                        
#      Name of the LAMMPS binary
export LMP_BIN=../../lmp_intel_cpu

#      LAMMPS root directory
export LMP_ROOT=../../../
               
source /opt/intel/parallel_studio_xe_2016.2.062/psxevars.sh
export I_MPI_PIN_DOMAIN=core
export I_MPI_FABRICS=shm		# For single node

#      Generate the restart file for use with liquid crystal benchmark
mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none

#      Benchmark to run
export bench=in.intel.lj


#############################################################################
# To run without a optimization package
#############################################################################
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none

#############################################################################
# To run with USER-OMP package
#############################################################################
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp

#############################################################################
# To run with USER-INTEL package and no coprocessor
#############################################################################
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel

#############################################################################
# To run with USER-INTEL and automatic load balancing to 1 coprocessor
#############################################################################
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel

#############################################################################
# If using PPPM (in.intel.rhodo) on Intel Xeon Phi x200 series processors
#############################################################################
mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp 3 lrt yes -sf intel