\begin{frame}
\centering
\titlepage
\end{frame}
%---------------------
\begin{frame}{Welcome}
\begin{block}{What you will learn}
How to compile and launch MPI codes on the SCITAS clusters, along with a bit of the "why"
\end{block}
\vspace{1cm}
\begin{block}{What you will not learn}
How to write parallel code and optimise it - there are other courses for that!
\end{block}
\end{frame}
%---------------------
\begin{frame}{Compilation}
\begin{block}{From code to binary}
Compilation is the process by which code (C, C++, Fortran, etc.) is transformed into a binary that can be run on a CPU.
\end{block}
\begin{block}{CPUs are not all the same}
\begin{itemize}
\item CPUs have different features and instruction sets
\item The same code needs to be recompiled for different architectures
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{What is MPI?}
\begin{block}{What is MPI?}
\begin{itemize}
\item Message Passing Interface
\item De facto standard for distributed memory parallelisation
\item Open standard with multiple implementations - now at version 3.0
\item Scales to very large systems
\end{itemize}
\end{block}
\begin{block}{Shared vs Distributed Memory}
\begin{itemize}
\item Shared - all tasks see all the memory (e.g. OpenMP)
\item Distributed - each task only sees a small part of the overall memory
\end{itemize}
Clusters are distributed memory systems, so MPI is well suited to them.
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPI Terminology}
\begin{block}{Words that you are going to hear}
\begin{itemize}
\item Rank - the ID by which MPI tasks are organised
\item Rank 0 to N - the "worker" tasks
\item Hybrid - a code that combines shared memory parallelisation with MPI
\end{itemize}
Pure MPI codes generally run one rank per core.
\end{block}
\end{frame}
%---------------------
\begin{frame}{Compilers - Intel vs GCC}
\begin{block}{GNU Compiler Collection}
\begin{itemize}
\item The industry standard and available everywhere
\item Quick to support new C++ language features
\item Fortran support used to be poor
\end{itemize}
\end{block}
\begin{block}{Intel Composer}
\begin{itemize}
\item Claims to produce faster code on Intel CPUs
\item Better Fortran support
\item Generally much stricter with bad code!
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPI - Intel vs MVAPICH2 vs OpenMPI}
\begin{block}{Why are there different flavours?}
There are multiple MPI flavours that comply with the specification, and each claims to have some advantage over the others. Some are vendor specific and others are open source.
\end{block}
\begin{block}{The main contenders}
\begin{itemize}
\item Intel MPI - commercial MPI with support
\item MVAPICH2 - developed by Ohio State University for InfiniBand
\item OpenMPI - open source and widely used
\end{itemize}
At SCITAS we support Intel MPI and MVAPICH2
\end{block}
\end{frame}
%---------------------
\begin{frame}{Compiler and MPI choice}
\begin{block}{First choose your compiler}
\begin{itemize}
\item GCC or Intel
\item This might be a technical or philosophical choice
\end{itemize}
\end{block}
\begin{block}{The associated MPI is then}
\begin{itemize}
\item GCC with MVAPICH2
\item Intel with Intel MPI
\end{itemize}
This is a SCITAS restriction to prevent chaos - nothing technically stops one from mixing!\\
\vspace{2mm}
Both work well and have good performance.
\end{block}
\end{frame}
%---------------------
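\begin{frame}{A minimal MPI code (sketch)}
\begin{block}{What such a code looks like}
A rough sketch of a minimal MPI "Hello World" in C - the actual hello\_mpi.c built later in Example 2 may differ, but the calls below are the standard MPI ones for start-up, rank/size queries and shutdown.
\end{block}
\begin{block}{hello\_mpi.c (illustrative only)}
\texttt{\tiny{\#include <mpi.h>}}\\
\texttt{\tiny{\#include <stdio.h>}}\\
\texttt{\tiny{int main(int argc, char *argv[]) \{}}\\
\texttt{\tiny{~~int rank, size, len;}}\\
\texttt{\tiny{~~char name[MPI\_MAX\_PROCESSOR\_NAME];}}\\
\texttt{\tiny{~~MPI\_Init(\&argc, \&argv);~~~~~~~~~~~~~~~~/* start MPI */}}\\
\texttt{\tiny{~~MPI\_Comm\_rank(MPI\_COMM\_WORLD, \&rank);~~/* my rank */}}\\
\texttt{\tiny{~~MPI\_Comm\_size(MPI\_COMM\_WORLD, \&size);~~/* total number of ranks */}}\\
\texttt{\tiny{~~MPI\_Get\_processor\_name(name, \&len);~~~~/* which node am I on? */}}\\
\texttt{\tiny{~~printf("Hello world: I am task rank \%d of \%d, running on node '\%s'\textbackslash n",}}\\
\texttt{\tiny{~~~~~~~~~rank, size, name);}}\\
\texttt{\tiny{~~MPI\_Finalize();}}\\
\texttt{\tiny{~~return 0;}}\\
\texttt{\tiny{\}}}
\end{block}
\end{frame}
%---------------------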
%\begin{frame}{Linear Algebra Libraries}
%
%
%\end{frame}
%---------------------
\begin{frame}{The dark art of mangling}
\begin{block}{Mangling?}
The mechanism by which compilers encode symbol names, allowing multiple functions with the same name
\end{block}
\begin{block}{C/C++}
\begin{itemize}
\item GCC - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev}
\item Intel - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev}
\end{itemize}
Result: C/C++ libraries are compatible between GCC and Intel
\end{block}
\begin{block}{Fortran}
\begin{itemize}
\item GCC - \texttt{\_\_h5f\_MOD\_h5fget\_access\_plist\_f}
\item Intel - \texttt{h5f\_mp\_h5fget\_access\_plist\_f\_}
\end{itemize}
Result: Fortran libraries are not compatible between GCC and Intel!
\end{block}
\end{frame}
%---------------------
\begin{frame}{Linking}
\begin{block}{How to use libraries}
Linking is the mechanism by which you can use libraries with your code.
\begin{itemize}
\item static - put everything in your executable
\item dynamic - keep the libraries outside and load them as needed
\end{itemize}
\end{block}
\begin{block}{Dynamic by default}
There are very few reasons to statically link code.
\end{block}
\end{frame}
%---------------------
\begin{frame}{What is linked?}
\begin{block}{ldd is your friend}
\texttt{ldd run.x}\\
\texttt{\tiny{linux-vdso.so.1 => (0x00007fffbfcf5000)}}\\
\texttt{\tiny{libmkl\_intel\_lp64.so => /ssoft/intel/15.0.0/RH6/all/x86\_E5v2/composer\_xe\_2015.2.164/mkl/lib/intel64/libmkl\_intel\_lp64.so}}\\
\texttt{\tiny{libmkl\_intel\_thread.so => /ssoft/intel/15.0.0/RH6/all/x86\_E5v2/composer\_xe\_2015.2.164/mkl/lib/intel64/libmkl\_intel\_thread.so}}\\
% libmkl_core.so => /ssoft/intel/15.0.0/RH6/all/x86_E5v2/composer_xe_2015.2.164/mkl/lib/intel64/libmkl_core.so (0x00007f2246954000)
% libiomp5.so => /ssoft/intel/15.0.0/RH6/all/x86_E5v2/composer_xe_2015.2.164/compiler/lib/intel64/libiomp5.so (0x00007f2246618000)
% libm.so.6 => /lib64/libm.so.6 (0x0000003acaa00000)
% libgcc_s.so.1 => /lib64/libgcc_s.so.1 (0x00000039e2400000)
\texttt{\tiny{...}}\\
\texttt{\tiny{libpthread.so.0 => /lib64/libpthread.so.0 (0x0000003acb200000)}}\\
\texttt{\tiny{libc.so.6 => /lib64/libc.so.6 (0x0000003acae00000)}}\\
\texttt{\tiny{libdl.so.2 => /lib64/libdl.so.2 (0x0000003acb600000)}}\\
\texttt{\tiny{/lib64/ld-linux-x86-64.so.2 (0x0000003aca600000)}}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Example 1 - Build sequential 'Hello World'}
\begin{block}{Compile the source files}
\texttt{gcc -c output.c}\\
\texttt{gcc -c hello.c}
\end{block}
\begin{block}{Link}
\texttt{gcc -o hello output.o hello.o}
\end{block}
\begin{block}{Run}
\texttt{./hello}\\
\texttt{Hello World!}
\end{block}
\end{frame}
%---------------------
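\begin{frame}{Example 1 - what the sources might contain}
\begin{block}{Two translation units (a sketch)}
The real hello.c and output.c are not reproduced in these slides; the hypothetical split below simply illustrates why there are two compile steps and a separate link step.
\end{block}
\begin{block}{output.c}
\texttt{\tiny{\#include <stdio.h>}}\\
\texttt{\tiny{/* hypothetical helper, compiled into output.o */}}\\
\texttt{\tiny{void output(const char *msg) \{ printf("\%s\textbackslash n", msg); \}}}
\end{block}
\begin{block}{hello.c}
\texttt{\tiny{void output(const char *msg);~/* resolved at link time from output.o */}}\\
\texttt{\tiny{int main(void) \{ output("Hello World!"); return 0; \}}}
\end{block}
\end{frame}
%---------------------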
\begin{frame}{Modules}
\begin{block}{How software is organised on the clusters}
Modules is a utility that allows multiple, often incompatible, tools and libraries to exist on a cluster.
\end{block}
\begin{block}{Naming convention}
\begin{itemize}
\item name / version / compiler
\item hdf5/1.8.14/gcc-4.4.7
\item The MPI flavour is implicit!
\end{itemize}
Note - compilers are backwards compatible so there is no need to have hdf5/1.8.14/gcc-4.8.3!
\end{block}
\end{frame}
%---------------------
\begin{frame}{More Modules}
\begin{block}{Commands}
\begin{itemize}
\item \texttt{module purge}
\item \texttt{module load gcc/4.8.3}
\item \texttt{module load mvapich2/2.0.1/gcc-4.4.7}
\item \texttt{module load hdf5/1.8.14/gcc-4.4.7}
\item \texttt{module list}
\item \texttt{module show gcc/4.8.3}
\end{itemize}
At present Modules will not prevent you from loading incompatible modules!
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPICC and friends}
\begin{block}{mpicc / mpiicc / mpicxx / mpif77 / mpif90 / mpiifort}
These are wrappers around the underlying compiler that add the correct options to link with the MPI libraries
\begin{itemize}
\item mpicc - C wrapper
\item mpiicc - Intel C wrapper
\item mpiifort - Intel Fortran wrapper
\end{itemize}
Check the MPI flavour documentation for more details
\end{block}
\begin{block}{mpicc mycode.c}
To use the wrappers simply type:
\begin{itemize}
\item \texttt{module load mympiflavour/version}
\item \texttt{mpicc hello.c -o hi}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Example 2 - Build // MPI-based 'Hello World'}
\begin{block}{Load modules}
\texttt{module load intel intelmpi}
\end{block}
\begin{block}{Compile-link}
\texttt{mpiicc -g -o hello\_mpi hello\_mpi.c}
\end{block}
\begin{block}{Run two tasks on two different nodes}
\texttt{srun -N2 -n2 --partition=debug ./hello\_mpi}\\
\texttt{Hello world: I am task rank 1, running on node 'b292'}\\
\texttt{Hello world: I am task rank 2, running on node 'b293'}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Configure and Make}
\begin{block}{The traditional way to build packages}
\begin{itemize}
\item \texttt{./configure --help}
\item \texttt{./configure --prefix=X --option=Y}
\item \texttt{make}
\item \texttt{make install}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPI and the Batch System}
\begin{block}{Telling SLURM what we need}
We would like 64 processes over 4 nodes\\
\vspace{2mm}
\texttt{\#SBATCH --nodes 4}\\
\texttt{\#SBATCH --ntasks-per-node 16}\\
\texttt{\#SBATCH --cpus-per-task 1}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
Remember that the memory is per node!
\end{block}
\end{frame}
%---------------------
\begin{frame}{Alternative formulations}
\begin{block}{We would like 64 processes}
\texttt{\#SBATCH --ntasks 64}\\
\texttt{\#SBATCH --cpus-per-task 1}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
SLURM will find the space for 64 tasks on as few nodes as possible
\end{block}
\begin{block}{We would like 16 processes, each one needing 4 cores}
\texttt{\#SBATCH --ntasks 16}\\
\texttt{\#SBATCH --cpus-per-task 4}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
SLURM will allocate 64 cores in total
\end{block}
\end{frame}
%---------------------
\begin{frame}{srun and mpirun}
\begin{block}{Launching an MPI job}
Now that we have an MPI code we need some way of correctly launching it across multiple nodes
\begin{itemize}
\item \texttt{srun} - SLURM's built-in job launcher
\item \texttt{mpirun} - the "traditional" job launcher
\end{itemize}
To use srun we type\\
\vspace{2mm}
\texttt{srun mycode.x}\\
\vspace{2mm}
With the directives shown earlier this will launch 64 processes on 4 nodes. A complete example job script is shown shortly.
\end{block}
\end{frame}
%---------------------
\begin{frame}{Intel mpirun}
\begin{block}{Using Intel MPI and mpirun}
On our clusters Intel MPI is configured to work with srun by default. If you want to use mpirun instead then proceed as follows:
\begin{itemize}
\item \texttt{unset I\_MPI\_PMI\_LIBRARY}
\item \texttt{mpirun ./mycode.x}
\end{itemize}
We don't advise doing this and strongly recommend using srun!
\end{block}
\end{frame}
%---------------------
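\begin{frame}{Putting it together - an example job script}
\begin{block}{A sketch, not a template to copy blindly}
One way of combining the directives and commands from the previous slides into a single submission script - the script name, module choice and executable name are just examples:
\end{block}
\begin{block}{job.run (hypothetical)}
\texttt{\#!/bin/bash}\\
\texttt{\#SBATCH --nodes 4}\\
\texttt{\#SBATCH --ntasks-per-node 16}\\
\texttt{\#SBATCH --cpus-per-task 1}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
\texttt{module purge}\\
\texttt{module load intel intelmpi}\\
\vspace{2mm}
\texttt{srun ./mycode.x}
\end{block}
\begin{block}{Submit it}
\texttt{sbatch job.run}
\end{block}
\end{frame}
%---------------------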
\begin{frame}{CPU affinity}
\begin{block}{Kesako?}
CPU affinity is the name for the mechanism by which a process is bound to a specific CPU (core) or to a set of cores.
\end{block}
\begin{block}{Pourquoi?}
If a mask is not set the OS might place the task on different cores every 100ms or so. Depending on the cache structure this can be a very bad thing to do.\\
\vspace{2mm}
We can also optimise the placement of ranks with respect to the underlying hardware.
\end{block}
\end{frame}
%---------------------
\begin{frame}{CPU bitmasks}
\begin{block}{11000000}
When talking about affinity we use the term "mask" or "bit mask", which is a convenient way of representing which cores are part of a CPU set.\newline
If we have an 8 core system then the following masks mean:
\begin{itemize}
\item \texttt{10000000} - core 8
\item \texttt{01000000} - core 7
\item \texttt{00100000} - core 6
\item \texttt{11110000} - cores 5 to 8
\item \texttt{00001111} - cores 1 to 4
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{CPU bitmasks}
\begin{block}{11110000 is f0}
These numbers can be conveniently written in hexadecimal, so if we query the system regarding CPU masks we will see something like:\\
\vspace{5mm}
pid 8092's current affinity mask: 1c0\\
pid 8097's current affinity mask: 1c0000\\
\vspace{5mm}
In binary this translates to\\
\vspace{5mm}
pid 8092's current affinity mask: \texttt{000000000000000111000000}\\
pid 8097's current affinity mask: \texttt{000111000000000000000000}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Binding with srun}
\begin{block}{Examples}
\texttt{srun -N 1 -n 4 -c 1 --cpu\_bind=verbose,rank ./hi 1}\\
\texttt{cpu\_bind=RANK - b370, task 0 : mask 0x1 set}\\
\texttt{cpu\_bind=RANK - b370, task 1 : mask 0x2 set}\\
\texttt{cpu\_bind=RANK - b370, task 2 : mask 0x4 set}\\
\texttt{cpu\_bind=RANK - b370, task 3 : mask 0x8 set}\\
\vspace{5mm}
\texttt{srun -N 1 -n 4 -c 4 --cpu\_bind=verbose,sockets ./hi 1}\\
\texttt{cpu\_bind=MASK - b370, task 1 : mask 0xff00 set}\\
\texttt{cpu\_bind=MASK - b370, task 2 : mask 0xff set}\\
\texttt{cpu\_bind=MASK - b370, task 0 : mask 0xff set}\\
\texttt{cpu\_bind=MASK - b370, task 3 : mask 0xff00 set}
\end{block}
\end{frame}
%---------------------
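\begin{frame}{Checking affinity on a node}
\begin{block}{taskset}
One way to inspect the affinity of a running process is the Linux \texttt{taskset} utility - its output has the same form as the masks shown on the earlier CPU bitmasks slide. The node name and PID below are just the illustrative values from those slides:
\end{block}
\begin{block}{Example (sketch)}
\texttt{ssh b370}\\
\texttt{taskset -p 8092~~~\# affinity reported as a hex mask}\\
\texttt{taskset -cp 8092~~\# the same, reported as a list of core numbers}
\end{block}
\end{frame}
%---------------------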
\begin{frame}{Common errors}
\begin{block}{Compiled on a different machine}
\texttt{Please verify that both the operating system and the processor support Intel MOVBE, FMA, BMI, LZCNT and AVX2 instructions.}
\end{block}
\begin{block}{Module not loaded - LD\_LIBRARY\_PATH}
\texttt{./run.x}\\
\texttt{./run.x: error while loading shared libraries: libmkl\_intel\_lp64.so: cannot open shared object file: No such file or directory}
\end{block}
\end{frame}
%---------------------
\begin{frame}{If things don't work}
\begin{block}{Try interactively}
Errors are much more visible this way
\begin{itemize}
\item \texttt{salloc -N 2 -n 32 -t 01:00:00 --partition debug}
\item \texttt{srun mycode.x < inp.in}
\end{itemize}
\end{block}
\begin{block}{Check what's going on with htop and ps}
\begin{itemize}
\item \texttt{ssh b123}
\item \texttt{htop}
\item \texttt{ps auxf}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{If things still don't work}
\begin{block}{Crashes or won't start}
\begin{itemize}
\item Reference input files
\item GDB
\item TotalView Debugger
\end{itemize}
\end{block}
\begin{block}{Crashes after a while}
Memory leak?
\begin{itemize}
\item Check with Ganglia
\item MemoryScape (TotalView)
\item Valgrind
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Going further}
\begin{block}{SCITAS offers courses in}
\begin{itemize}
\item MPI, an introduction to parallel programming
\item MPI, advanced parallel programming
\item Introduction to profiling and software optimisation
\item Computing on GPUs
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Exercise - Build Octopus}
\begin{block}{Download package}
\texttt{http://www.tddft.org/programs/octopus}
\end{block}
\begin{block}{Hints}
\texttt{- load modules:}\\
\texttt{\tiny{ intel/15.0.0 intelmpi/5.0.1 fftw/3.3.4/intel-15.0.0 gsl/1.16/intel-15.0.0}}\\
\texttt{- build libxc first}\\
\texttt{- some configure options to use for // octopus:}\\
\texttt{\tiny{ --enable-openmp --enable-mpi}}\\
\texttt{\tiny{ --disable-zdotc-test}}\\
\texttt{\tiny{ --with-blas="-L\$\{MKL\_ROOT\}/lib/intel64 -lmkl\_intel\_lp64 -lmkl\_core -lmkl\_intel\_thread -lpthread"}}\\
\texttt{\tiny{ --with-fft-lib="-L\$\{FFTW\_LIBRARY\} -lfftw3 -lfftw3\_threads"}}
\end{block}
\end{frame}
%---------------------
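\begin{frame}{Exercise - one possible outline}
\begin{block}{A sketch only}
One possible sequence, following the generic configure/make recipe from earlier. The install locations are arbitrary choices, and the exact libxc/octopus options should be checked with \texttt{./configure --help}:
\end{block}
\begin{block}{Outline (hypothetical paths and directory names)}
\texttt{\tiny{module load intel/15.0.0 intelmpi/5.0.1 fftw/3.3.4/intel-15.0.0 gsl/1.16/intel-15.0.0}}\\
\vspace{2mm}
\texttt{\tiny{\# libxc first; the prefix is an arbitrary choice}}\\
\texttt{\tiny{cd libxc-* \&\& ./configure --prefix=\$HOME/libxc \&\& make \&\& make install}}\\
\vspace{2mm}
\texttt{\tiny{\# then octopus, pointing it at libxc and adding the configure options from the hints}}\\
\texttt{\tiny{cd ../octopus-* \&\& ./configure --prefix=\$HOME/octopus --enable-openmp --enable-mpi ...}}\\
\texttt{\tiny{make \&\& make install}}
\end{block}
\end{frame}
%---------------------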