diff --git a/document.tex b/document.tex index 2423769..2aba89e 100644 --- a/document.tex +++ b/document.tex @@ -1,886 +1,887 @@ \begin{frame} \titlepage \centering \end{frame} %--------------------- \begin{frame}{Welcome} \begin{block}{What you will learn} How to compile and launch MPI codes on the SCITAS clusters along with a bit of the "why" \end{block} \vspace{1cm} \begin{block}{What you will not learn} How to write parallel code and optimise it - there are other courses for that! \end{block} \end{frame} %--------------------- \begin{frame}{Compilation} \begin{block}{From code to binary} Compilation is the process by which code (C, C++, Fortran, etc.) is transformed into a binary that can be run on a CPU. \end{block} \begin{block}{CPUs are not all the same} \begin{itemize} \item CPUs have different features and instruction sets \item The same code will need to be recompiled for different architectures \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{What is MPI?} \begin{block}{What is MPI?} \begin{itemize} \item Message Passing Interface \item Open standard - now at version 3.1 \\ {$\rightarrow$ Check this website: \url{http://mpi-forum.org}} \item De facto standard for distributed memory parallelisation \item Multiple implementations - MVAPICH2, MPICH, IntelMPI ... \item Scales to very large systems \end{itemize} \end{block} \begin{block}{Shared vs Distributed Memory} \begin{itemize} \item Shared - all tasks see all the memory (e.g. OpenMP) \item Distributed - each task only sees a small part of the overall memory \end{itemize} Clusters are distributed memory systems, so MPI is well suited. \end{block} \end{frame} %--------------------- \begin{frame}{MPI Terminology} \begin{block}{Words that you are going to hear} \begin{itemize} \item Rank - the ID that identifies each MPI task \item Rank 0 to N - the "worker" tasks \item Hybrid - a code that combines shared memory parallelisation with MPI \end{itemize} Pure MPI codes generally run one rank per core. \end{block} \end{frame} %--------------------- \begin{frame}{Compilers - Intel vs GCC} \begin{block}{GNU Compiler Collection} \begin{itemize} \item The industry standard and available everywhere \item Quick to support new C++ language features \item Fortran support used to be poor \end{itemize} \end{block} \begin{block}{Intel Composer} \begin{itemize} \item Claims to produce faster code on Intel CPUs \item Better Fortran support \item Generally much stricter by default with bad code! \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{MPI - Intel vs MVAPICH2 vs OpenMPI} \begin{block}{Why are there different flavours?} There are multiple MPI flavours that comply with the specification, and each claims to have some advantage over the others. Some are vendor specific and others are open source. \end{block} \begin{block}{The main contenders} \begin{itemize} \item Intel MPI - commercial MPI with support \item MVAPICH2 - developed by Ohio State University for InfiniBand \item OpenMPI - open source and widely used \end{itemize} At SCITAS we support IntelMPI, MVAPICH2 and OpenMPI\\ \vspace{2mm} We \emph{recommend} IntelMPI or MVAPICH2!
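\\ \vspace{2mm} In practice (a quick sketch using the module names that appear later in this course): \texttt{module load intel intel-mpi} \emph{or} \texttt{module load gcc mvapich2}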
\end{block} \end{frame} %--------------------- \begin{frame}{Compiler and MPI choice} \begin{block}{First choose your compiler} \begin{itemize} \item GCC or Intel \item This might be a technical or philosophical choice \end{itemize} \end{block} \begin{block}{The associated MPI is then} \begin{itemize} \item GCC with MVAPICH2 \item GCC with OpenMPI \emph{if you have a very good reason} \item Intel with IntelMPI \end{itemize} This is a SCITAS restriction to prevent chaos - nothing technically stops one from mixing!\\ \vspace{2mm} Both work well and have good performance. \end{block} \end{frame} %--------------------- %\begin{frame}{Linear Algebra Libraries} % % %\end{frame} %--------------------- \begin{frame}{Linking} \begin{block}{Let someone else do the hard work} For nearly everything that you want to do there's already a library function. \end{block} \begin{block}{How to use libraries} Linking is the mechanism by which you can use libraries with your code. \begin{itemize} \item static - put everything in your executable \item dynamic - keep the libraries outside and load them as needed \end{itemize} \end{block} \begin{block}{Dynamic by default} There are very few reasons to statically link code. \end{block} \end{frame} %--------------------- \begin{frame}[fragile]{What is linked?} \begin{block}{ldd is your friend} \tiny \begin{verbatim} ldd mycode.x libmpifort.so.12 => /ssoft/intelmpi/5.1.1/RH6/all/x86_E5v2/impi/5.1.1.109/lib64/libmpifort.so.12 libmpi.so.12 => /ssoft/intelmpi/5.1.1/RH6/all/x86_E5v2/impi/5.1.1.109/lib64/libmpi.so.12 libdl.so.2 => /lib64/libdl.so.2 librt.so.1 => /lib64/librt.so.1 libpthread.so.0 => /lib64/libpthread.so.0 libm.so.6 => /lib64/libm.so.6 libgcc_s.so.1 => /lib64/libgcc_s.so.1 libc.so.6 => /lib64/libc.so.6 \end{verbatim} \normalsize \end{block} \end{frame} %--------------------- \begin{frame}{The dark art of mangling} \begin{block}{Mangling and decoration} The mechanism that allows multiple functions to share the same name, but as there is no standard ABI things can get tricky \end{block} \begin{block}{C/C++} \begin{itemize} \item GCC - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev} \item Intel - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev} \end{itemize} Result: C/C++ libraries are compatible between GCC and Intel \end{block} \begin{block}{Fortran} \begin{itemize} \item GCC - \texttt{\_\_h5f\_MOD\_h5fget\_access\_plist\_f} \item Intel - \texttt{h5f\_mp\_h5fget\_access\_plist\_f\_} \end{itemize} Result: Fortran libraries are not compatible between GCC and Intel!
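\\ \vspace{2mm} You can check which convention a library was built with by listing its symbols, e.g. \texttt{nm -D libhdf5\_fortran.so | grep -i h5fget} (the library name here is just an example).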
\end{block} \end{frame} %--------------------- \begin{frame}{Example 1 - Build sequential 'Hello World'} \begin{block}{Compile the source files} \texttt{gcc -c output.c}\\ \texttt{gcc -c hello.c}\\ \end{block} \begin{block}{Link} \texttt{gcc -o hello output.o hello.o}\\ \end{block} \begin{block}{Run} \texttt{./hello}\\ \texttt{Hello World!} \end{block} \end{frame} %--------------------- \begin{frame}{Compilation - the general case} \begin{block}{To compile and link we need} \begin{itemize} \item The libraries to link against \item Where to find these libraries \item Where to find their header files \item Your source code \item A nice name for the executable \end{itemize} \end{block} \begin{block}{-l -L and -I} -\texttt{gcc -l libraries -L path\_to\_libraries -I path\_to\_header\_filer -o name\_of\_executable mycode.c}\\ +\texttt{gcc -L path\_to\_libraries -l libraries -I path\_to\_header\_files -o name\_of\_executable mycode.c}\\ \end{block} \end{frame} \begin{frame}[fragile]{Sequential 'Hello World' with shared libraries} \begin{block}{In case you were wondering...} \begin{verbatim} $ gcc -fPIC -c output.c $ gcc -shared -o liboutput.so output.o $ pwd /home/scitas/using-mpi/ex1 -$ gcc hello.c -loutput -L `pwd` -I `pwd` -o hi +$ gcc hello.c -L `pwd` -loutput -I `pwd` -o hi $ export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH $ ./hi Hello World! \end{verbatim} Now try running \texttt{ldd} on the executable \end{block} \end{frame} \begin{frame}[fragile]{Making code run faster} \begin{block}{Compiling is hard work...} By default a compiler will not optimise your code! \begin{verbatim} float matest(float a, float b, float c) { a = a*b + c; return a; } \end{verbatim} \end{block} \begin{block}{For the details see:} \url{http://scitas.epfl.ch/kb/Compiling+codes+on+different+systems} \end{block} \end{frame} \begin{frame}[fragile]{No optimisation} \begin{block}{icc mycode.c} \begin{verbatim} matest(float, float, float): push rbp mov rbp,rsp movss DWORD PTR [rbp-0x4],xmm0 movss DWORD PTR [rbp-0x8],xmm1 movss DWORD PTR [rbp-0xc],xmm2 movss xmm0,DWORD PTR [rbp-0x4] mulss xmm0,DWORD PTR [rbp-0x8] addss xmm0,DWORD PTR [rbp-0xc] movss DWORD PTR [rbp-0x4],xmm0 mov eax,DWORD PTR [rbp-0x4] mov DWORD PTR [rbp-0x10],eax movss xmm0,DWORD PTR [rbp-0x10] pop rbp ret \end{verbatim} \end{block} \end{frame} \begin{frame}[fragile]{With optimisation} \begin{block}{icc -O3 -xCORE-AVX2 mycode.c} \begin{verbatim} matest(float, float, float): vfmadd132ss xmm0,xmm2,xmm1 ret \end{verbatim} \end{block} \end{frame} \begin{frame}[fragile]{Optimisation levels} \begin{block}{O1} Enables optimizations for speed and disables some optimizations that increase code size and affect speed \end{block} \begin{block}{O2} Enables optimizations for speed. This is the generally recommended optimization level. Vectorization is enabled at O2 and higher levels. \end{block} \begin{block}{O3} Performs O2 optimizations and enables more aggressive loop transformations such as Fusion, Block-Unroll-and-Jam, and collapsing IF statements. \end{block} \end{frame} %--------------------- \begin{frame}{Modules} \begin{block}{How software is organised on the clusters} Modules is a utility that allows multiple, often incompatible, tools and libraries to exist on a cluster. We use LMod, which is an extension of the classical modules tool.
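\\ \vspace{2mm} LMod also provides \texttt{module spider \textless{name}\textgreater} to search for a module across the whole hierarchy, even if it is not yet visible to \texttt{module avail}.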
\end{block} \begin{block}{Load modules to see more modules} \begin{itemize} \item module avail \item module load \textless{compiler}\textgreater \\ \texttt{\tiny{Ex: module load intel}} \item module avail \item module load \textless{MPI}\textgreater \\ \texttt{\tiny{Ex: module load intel-mpi}} \item module avail \end{itemize} \vspace{2mm} Note that there is an associated BLAS library (MKL or OpenBLAS) \end{block} \end{frame} %--------------------- \begin{frame}{More Modules} \begin{block}{Commands} \begin{itemize} \item module purge \item module load gcc \item module load mvapich2 \item module load hdf5 \\ \texttt{\tiny{$\rightarrow$ Or simply: module load gcc mvapich2 hdf5}} \item module list \item module help hdf5 \item module show hdf5 \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{LMod features} \begin{block}{One compiler at a time} \begin{itemize} \item module purge \item module load gcc \item module load hdf5 \item module list \item module load intel \end{itemize} Only one flavour of a given module can be loaded at a time \end{block} \end{frame} %--------------------- \begin{frame}{slmodules} \begin{block}{How we manage software} One "release" per year \begin{itemize} \item slmodules -r deprecated \item slmodules \item slmodules -s foo \end{itemize} By default you see the architecture (\texttt{\$SYS\_TYPE}) of the system you are connected to. \\ \vspace{2mm} \texttt{Future} becomes \texttt{stable} and \texttt{stable} becomes \texttt{deprecated} in July. \end{block} \end{frame} %--------------------- \begin{frame}{MPICC and friends} \begin{block}{mpicc / mpiicc / mpicxx / mpif77 / mpif90 / mpiifort} These are wrappers to the underlying compiler that add the correct options to link with the MPI libraries \begin{itemize} \item mpicc - C wrapper \item mpiicc - Intel C wrapper \item mpiifort - Intel Fortran wrapper \end{itemize} Check the MPI flavour documentation for more details \end{block} \begin{block}{mpicc mycode.c} To use the wrappers simply type: \begin{itemize} \item module load mympiflavour/version \item mpicc hello.c -o hi \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{Example 2 - Build // MPI-based 'Hello World'} \begin{block}{Load modules} \texttt{module load intel intel-mpi}\\ \end{block} \begin{block}{Compile-link} \texttt{mpiicc -g -o hello\_mpi hello\_mpi.c}\\ \end{block} \begin{block}{Run two tasks on two different nodes} \texttt{srun -N2 -n2 --partition=debug ./hello\_mpi}\\ \texttt{Hello world: I am task rank 1, running on node 'b292'}\\ \texttt{Hello world: I am task rank 2, running on node 'b293'}\\ \end{block} \end{frame} %--------------------- \begin{frame}{Configure and Make} \begin{block}{The traditional way to build packages} \begin{itemize} \item \texttt{./configure --help} \item \texttt{./configure --prefix=X --option=Y} \item \texttt{make} \item \texttt{make install} \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{cmake} \begin{block}{cmake is a better way to do things!} \begin{itemize} \item \texttt{cmake -DCMAKE\_INSTALL\_PREFIX:PATH=X \\-DOption=Y } \item \texttt{make} \item \texttt{make install} \end{itemize} If you're starting a project from scratch then we recommend using cmake rather than configure. There's also a graphical interface called \texttt{ccmake}.
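\\ \vspace{2mm} For the earlier hello example a minimal \texttt{CMakeLists.txt} might look like this (just a sketch):\\ \texttt{cmake\_minimum\_required(VERSION 3.0)}\\ \texttt{project(hello C)}\\ \texttt{add\_executable(hello hello.c output.c)}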
\end{block} \end{frame} %--------------------- \begin{frame}{MPI and the Batch System} \begin{block}{Telling SLURM what we need} We would like 64 processes over 4 nodes\\ \vspace{2mm} \texttt{\#SBATCH --nodes 4}\\ \texttt{\#SBATCH --ntasks-per-node 16}\\ \texttt{\#SBATCH --cpus-per-task 1}\\ \texttt{\#SBATCH --mem 32000}\\ \vspace{2mm} Remember that the memory is per node! \end{block} \end{frame} \begin{frame}{Alternative formulations} \begin{block}{We would like 64 processes} \texttt{\#SBATCH --ntasks 64}\\ \texttt{\#SBATCH --cpus-per-task 1}\\ \texttt{\#SBATCH --mem 32000}\\ \vspace{2mm} SLURM will find the space for 64 tasks on as few nodes as possible \end{block} \begin{block}{We would like 16 processes, each needing 4 cores} \texttt{\#SBATCH --ntasks 16}\\ \texttt{\#SBATCH --cpus-per-task 4}\\ \texttt{\#SBATCH --mem 32000}\\ \vspace{2mm} SLURM will allocate 64 cores in total\\ \vspace{2mm} Note: SLURM does not set \texttt{OMP\_NUM\_THREADS} for OpenMP! \end{block} \end{frame} %--------------------- \begin{frame}{srun and mpirun} \begin{block}{Launching an MPI job} Now that we have an MPI code we need some way of correctly launching it across multiple nodes \begin{itemize} \item \texttt{srun} - SLURM's built-in job launcher \item \texttt{mpirun} - "traditional" job launcher \end{itemize} To use srun we type\\ \vspace{2mm} \texttt{srun mycode.x}\\ \vspace{2mm} With the directives on the previous slide this will launch 64 processes on 4 nodes \end{block} \end{frame} %--------------------- \begin{frame}{Multiple srun instances on one node} \begin{block}{For code that doesn't scale...} \texttt{\#SBATCH --nodes 1}\\ \texttt{\#SBATCH --ntasks 16}\\ \texttt{\#SBATCH --cpus-per-task 1}\\ \texttt{\#SBATCH --mem 32000}\\ \vspace{2mm} \texttt{srun --mem=16000 -n 8 mytask1 \&}\\ \texttt{srun --mem=16000 -n 8 mytask2 \&}\\ \texttt{wait}\\ \vspace{2mm} \textit{Note: the \texttt{--multi-prog} option for srun can provide a more elegant solution!} \\ \vspace{2mm} For more details, check our documentation on this page: \\ \vspace{2mm} \tiny{\url{https://scitasadm.epfl.ch/confluence/display/DOC/Running+multiple+tasks+on+one+node}} \end{block} \end{frame} %--------------------- \begin{frame}{Intel mpirun} \begin{block}{Using IntelMPI and mpirun} On our clusters IntelMPI is configured to work with srun by default. If you want to use mpirun then do as follows: \begin{itemize} \item \texttt{unset I\_MPI\_PMI\_LIBRARY} \item \texttt{export SLURM\_CPU\_BIND=none} \item \texttt{mpirun ./mycode.x} \end{itemize} We don't advise doing this and strongly recommend using srun! Please note that, behind the scenes, mpirun still uses SLURM. \end{block} \end{frame} %--------------------- \begin{frame}{CPU affinity} \begin{block}{What is it?} CPU affinity is the mechanism by which a process is bound to a specific CPU (core) or a set of cores. \end{block} \begin{block}{Why does it matter?} If a mask is not set the OS might place the task on different cores every 100ms or so. For performance this can be a very bad thing to do.\\ \vspace{2mm} We can also optimise placement of ranks with respect to the underlying hardware. \end{block} \end{frame} \begin{frame}{ccNUMA} \begin{block}{Cache Coherent Non Uniform Memory Architecture} This is what compute nodes with more than one processor look like...
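\\ \vspace{2mm} On a compute node you can inspect this topology yourself, for example with \texttt{lscpu} or \texttt{numactl --hardware} (if \texttt{numactl} is installed).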
\end{block} \includegraphics[width=10cm]{images/ccNUMA.pdf} \end{frame} %--------------------- \begin{frame}{CPU bitmasks} \begin{block}{11000000} When talking about affinity we use the term "mask" or "bit mask" which is a convenient way of representing which cores are part of a CPU set.\newline If we have an 8 core system then the following masks mean: \begin{itemize} \item \texttt{10000000} - core 8 \item \texttt{01000000} - core 7 \item \texttt{00100000} - core 6 \item \texttt{11110000} - cores 5 to 8 \item \texttt{00001111} - cores 1 to 4 \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{CPU bitmasks} \begin{block}{11110000 is f0} These numbers can be conveniently written in hexadecimal so if we query the system regarding CPU masks we will see something like:\\ \vspace{5mm} pid 8092's current affinity mask: 1c0\\ pid 8097's current affinity mask: 1c0000\\ \vspace{5mm} In binary this would translate to\\ \vspace{5mm} pid 8092's current affinity mask: \texttt{000000000000000111000000}\\ pid 8097's current affinity mask: \texttt{000111000000000000000000}\\ \end{block} \end{frame} %--------------------- \begin{frame}{Binding with srun} \begin{block}{Examples} \texttt{srun -N 1 -n 4 -c 1 --cpu\_bind=verbose,rank ./hi 1}\\ %srun -N 1 -n 4 -c 1 --cpu_bind=verbose,rank ./hi 1\\ \texttt{cpu\_bind=RANK - b370, task 0 : mask 0x1 set}\\ \texttt{cpu\_bind=RANK - b370, task 1 : mask 0x2 set}\\ \texttt{cpu\_bind=RANK - b370, task 2 : mask 0x4 set}\\ \texttt{cpu\_bind=RANK - b370, task 3 : mask 0x8 set}\\ \vspace{5mm} \texttt{srun -N 1 -n 4 -c 4 --cpu\_bind=verbose,sockets ./hi 1}\\ \texttt{cpu\_bind=MASK - b370, task 1 : mask 0xff00 set}\\ \texttt{cpu\_bind=MASK - b370, task 2 : mask 0xff set}\\ \texttt{cpu\_bind=MASK - b370, task 0 : mask 0xff set}\\ \texttt{cpu\_bind=MASK - b370, task 3 : mask 0xff00 set} \end{block} \end{frame} %--------------------- \begin{frame}{Common errors} \begin{block}{Compiled on a different machine} \texttt{Please verify that both the operating system and the processor support Intel MOVBE, FMA, BMI, LZCNT and AVX2 instructions.} \end{block} \begin{block}{LD\_LIBRARY\_PATH not correctly set} \texttt{./run.x: error while loading shared libraries: libmkl\_intel\_lp64.so: cannot open shared object file: No such file or directory} \end{block} \end{frame} \begin{frame}{Don't forget the srun} \begin{block}{./mympicode.x instead of srun mympicode.x} \texttt{Fatal error in MPI\_Init: Other MPI error, error stack:}\\ \texttt{.MPIR\_Init\_thread(514): }\\ \texttt{.MPID\_Init(320).......: channel initialization failed}\\ \texttt{.MPID\_Init(716).......: PMI\_Get\_id returned 14}\\ \end{block} \end{frame} %--------------------- \begin{frame}{If things don't work} \begin{block}{Try interactively} Errors are much more visible this way \begin{itemize} \item \texttt{salloc -N 2 -n 32 -t 01:00:00 --partition debug}\\ \item \texttt{srun mycode.x < inp.in}\\ \end{itemize} \end{block} \begin{block}{Check what's going on with htop and ps} \begin{itemize} \item \texttt{ssh b123}\\ \item \texttt{htop}\\ \item \texttt{ps auxf}\\ \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{If things still don't work} \begin{block}{Crashes or won't start} \begin{itemize} \item Reference input files \item GDB \item TotalView Debugger \end{itemize} \end{block} \begin{block}{Crashes after a while} Memory Leak?\\ \begin{itemize} \item Valgrind \item MemoryScape (TotalView) \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{Some useful 
tricks} \begin{block}{Compilers} \begin{itemize} \item icc -xAVX -axCORE-AVX2 \item icc -mkl mycode.c \item mpiicc -show mycode.c \end{itemize} \end{block} \begin{block}{MKL link line advisor} \url{https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor} \end{block} \begin{block}{SCITAS documentation} \url{http://scitas.epfl.ch/documentation/compiling-code-different-systems} \end{block} \end{frame} %--------------------- \begin{frame}{Going further} \begin{block}{SCITAS offers courses in} \begin{itemize} \item MPI, an introduction to parallel programming \item MPI, advanced parallel programming \item Introduction to profiling and software optimisation \item Computing on GPUs \end{itemize} \end{block} \end{frame} %--------------------- \begin{frame}{Exercise - Build Octopus} \begin{block}{Download package} \texttt{\url{http://www.tddft.org/programs/octopus}} \end{block} \begin{block}{Hints} \texttt{- load modules:}\\ \texttt{\tiny{ intel intel-mpi intel-mkl fftw gsl}}\\ \texttt{- first build libxc}\\ \texttt{- some configure options to use for // octopus:}\\ \texttt{\tiny{ --enable-openmp --enable-mpi}}\\ \texttt{\tiny{ --disable-zdotc-test}}\\ \texttt{\tiny{ --with-blas="-L\$\{MKLROOT\}/lib/intel64 -lmkl\_intel\_lp64 -lmkl\_core -lmkl\_intel\_thread \textbackslash}} \\ \texttt{\tiny{ -lpthread -lm"}}\\ \texttt{\tiny{ --with-fftw-prefix="\$\{FFTW\_ROOT\}"}} + \end{block} \end{frame} %--------------------- diff --git a/slides.pdf b/slides.pdf index 2bd2a4f..d140cf3 100644 Binary files a/slides.pdf and b/slides.pdf differ diff --git a/slides.synctex.gz b/slides.synctex.gz index 6f5f1a7..67262ff 100644 Binary files a/slides.synctex.gz and b/slides.synctex.gz differ diff --git a/slides.tex b/slides.tex index 156a448..b85593e 100644 --- a/slides.tex +++ b/slides.tex @@ -1,37 +1,41 @@ \documentclass[xcolor={usenames,dvipsnames}]{beamer} \usetheme{SCITAS} \usepackage{alltt} \hypersetup{ pdftitle={Compiling code and using MPI}, pdfauthor={http://scitas.epfl.ch}, colorlinks=true, urlcolor=blue } \title{Compiling code and using MPI} \author{\url{scitas.epfl.ch}} \date{\today} \usepackage{upquote} +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{babel} + \newcounter{exercise} \resetcounteronoverlays{exercise} \setcounter{exercise}{0} \newenvironment{exo}[1]% {\begin{frame}[fragile,environment=exo]% \refstepcounter{exercise} \frametitle{Exercise~\arabic{exercise}: #1}} {\end{frame}} \newenvironment{questions}% {\begin{block}{Questions:} \begin{itemize}} {\end{itemize} \end{block}} \begin{document} \input{document} \end{document}