diff --git a/document.tex b/document.tex
index 95d49b6..2eb95ca 100644
--- a/document.tex
+++ b/document.tex
@@ -1,809 +1,809 @@
 
 \begin{frame}
 
     \titlepage
     \centering
 
 \end{frame}
 
 
 %---------------------
 
 
 \begin{frame}{Welcome}
   \begin{block}{What you will learn}
 How to compile and launch MPI codes on the SCITAS clusters along with a bit of the "why"
  \end{block}
 \vspace{1cm} 
    \begin{block}{What you will not learn}
 How to write parallel code and optimise it - there are other courses for that!
  \end{block}
 
 \end{frame}
 
 
 %---------------------
 
 
 \begin{frame}{Compilation}
 
   \begin{block}{From code to binary}
 Compilation is the process by which source code (C, C++, Fortran, etc.) is transformed into a binary that can be run on a CPU.
  \end{block}
  
   \begin{block}{CPUs are not all the same}
     \begin{itemize}
     \item CPUs have different features and instruction sets
     \item The same code will need to be recompiled for different architectures
     \end{itemize}
   \end{block}
 
 
 \end{frame}
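 
 %---------------------
 
 \begin{frame}[fragile]{Targeting an architecture}
 
 \begin{block}{A sketch with GCC - the flags and file names are illustrative}
 The same source can be rebuilt for different targets:
 \begin{verbatim}
 # build for the CPU of the machine you compile on
 gcc -O2 -march=native -o mycode mycode.c
 
 # or target a named architecture explicitly
 gcc -O2 -march=haswell -o mycode mycode.c
 \end{verbatim}
 A binary built with instructions the CPU lacks will refuse to run - see
 the "Common errors" slide later.
 \end{block}
 
 \end{frame}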
 
 %---------------------
 
 \begin{frame}{What is MPI?}
 
   \begin{block}{What is MPI?}
     \begin{itemize}
     \item Message Passing Interface
     \item Open standard - now at version 3.1
     \item De facto standard for distributed memory parallelisation
     \item Multiple implementations - MVAPICH2, MPICH, IntelMPI ...
     \item Scales to very large systems 
     \end{itemize}
   \end{block}
 
   \begin{block}{Shared vs Distributed Memory}
     \begin{itemize}
     \item Shared - all tasks see all the memory (e.g. OpenMP)
     \item Distributed - tasks only see a small part of the overall memory
     \end{itemize}
     Clusters are distributed memory systems so MPI is well suited.
   \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 
 \begin{frame}{MPI Terminology}
 
  \begin{block}{Words that you are going to hear}
     \begin{itemize}
     \item Rank - how MPI tasks are organised
     \item Rank 0 to N - the "worker" tasks
     \item Hybrid - a code that combines shared memory parallelisation with MPI
     \end{itemize}
 Pure MPI codes generally run one rank per core.    
     
   \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Compilers - Intel vs GCC}
 
  \begin{block}{GNU Compiler Collection}
     \begin{itemize}
     \item The industry standard and available everywhere
     \item Quick to support new C++ language features
     \item Fortran support used to be poor
     \end{itemize}
  \end{block}
 
  \begin{block}{Intel Composer}
     \begin{itemize}
     \item Claims to produce faster code on Intel CPUs
     \item Better Fortran support
     \item Generally much stricter by default with bad code!
     \end{itemize}
   \end{block}
 
 \end{frame}
 
 %---------------------
 
 \begin{frame}{MPI - Intel vs MVAPICH2 vs OpenMPI}
 
  \begin{block}{Why are there different flavours?}
 There are multiple MPI flavours that comply with the specification and each claims to have some advantage over the others. Some are vendor specific, others are open source.
   \end{block}
   
 \begin{block}{The main contenders}
     \begin{itemize}
     \item Intel MPI - commercial MPI with support
     \item MVAPICH2 - developed by Ohio State University for InfiniBand
     \item OpenMPI - Open source and widely used
     \end{itemize}
 In SCITAS we support IntelMPI, MVAPICH2 and OpenMPI\\
 \vspace{2mm} 
 We \emph{recommend} IntelMPI or MVAPICH2!
 
 \end{block}
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Compiler and MPI choice}
 \begin{block}{First choose your compiler}
     \begin{itemize}
     \item GCC or Intel
     \item This might be a technical or philosophical choice
     \end{itemize}
 \end{block}
 
 \begin{block}{The associated MPI is then}
     \begin{itemize}
     \item GCC with MVAPICH2
     \item GCC with OpenMPI \emph{if you have a very good reason}
     \item Intel with IntelMPI
     \end{itemize}
 This is a SCITAS restriction to prevent chaos - nothing technically stops one from mixing!\\
 \vspace{2mm} 
 Both work well and have good performance.
 \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 %\begin{frame}{Linear Algebra Libraries}
 %
 %
 %\end{frame}
 
 
 
 
 %---------------------
 
 \begin{frame}{Linking}
 
 \begin{block}{Let someone else do the hard work}
 For nearly everything that you want to do there's already a library function.
 \end{block}
 
 \begin{block}{How to use libraries}
 Linking is the mechanism by which you can use libraries with your code.
     \begin{itemize}
     \item static - put everything in your executable
     \item dynamic - keep the libraries outside and load them as needed
     \end{itemize}
 \end{block}
 \begin{block}{Dynamic by default}
 There are very few reasons to statically link code.
 \end{block}
 
 
 \end{frame}
 
 %---------------------
 
 \begin{frame}[fragile]{What is linked?}
 
 \begin{block}{ldd is your friend}
 
 \tiny
 \begin{verbatim}
 ldd mycode.x
 
 libmpifort.so.12 => /ssoft/intelmpi/5.1.1/RH6/all/x86_E5v2/impi/5.1.1.109/lib64/libmpifort.so.12 
 libmpi.so.12 => /ssoft/intelmpi/5.1.1/RH6/all/x86_E5v2/impi/5.1.1.109/lib64/libmpi.so.12 
 libdl.so.2 => /lib64/libdl.so.2 
 librt.so.1 => /lib64/librt.so.1 
 libpthread.so.0 => /lib64/libpthread.so.0 
 libm.so.6 => /lib64/libm.so.6 
 libgcc_s.so.1 => /lib64/libgcc_s.so.1
 libc.so.6 => /lib64/libc.so.6 
 \end{verbatim}
 \normalsize
 
 
 
 \end{block}
 \end{frame}
 
 %---------------------
 
 \begin{frame}{The dark art of mangling}
 
 \begin{block}{Mangling and decoration}
 A mechanism that allows multiple functions to share the same name, but as there is no standard ABI things can get tricky.
 \end{block}
 
 \begin{block}{C/C++}
     \begin{itemize}
     \item GCC - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev}
     \item Intel - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev}
     \end{itemize}
 Result: C/C++ libraries are compatible between GCC and Intel
 \end{block}
 
 \begin{block}{Fortran}
     \begin{itemize}
     \item GCC - \texttt{\_\_h5f\_MOD\_h5fget\_access\_plist\_f}
     \item Intel -  \texttt{h5f\_mp\_h5fget\_access\_plist\_f\_}
     \end{itemize}
 Result: Fortran libraries are not compatible between GCC and Intel!
 \end{block}
 
 \end{frame}
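 
 %---------------------
 
 \begin{frame}[fragile]{Inspecting mangled names}
 
 \begin{block}{A quick check with nm - the library path is illustrative}
 To see what a library actually exports, and how the names are mangled:
 \begin{verbatim}
 # list the dynamic symbols of a shared library
 nm -D /path/to/libfoo.so | less
 
 # demangle C++ names into something readable
 nm -D /path/to/libfoo.so | c++filt | less
 \end{verbatim}
 Comparing these symbols with the ones your code expects is a quick way
 to spot a compiler mismatch.
 \end{block}
 
 \end{frame}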
 
 
 %---------------------
 
 \begin{frame}{Example 1 - Build sequential 'Hello World'}
 
 \begin{block}{Compile the source files}
 \texttt{gcc -c output.c}\\
 \texttt{gcc -c hello.c}\\
 \end{block}
 
 \begin{block}{Link}
 \texttt{gcc -o hello output.o hello.o}\\
 \end{block}
 
 \begin{block}{Run}
 \texttt{./hello}\\
 \texttt{Hello World!}
 \end{block}
 
 
 \end{frame}
 
 %---------------------
 
 \begin{frame}{Compilation - the general case}
 
 \begin{block}{To compile and link we need}
     \begin{itemize}
     \item The libraries to link against
     \item Where to find these libraries
     \item Where to find their header files
     \item Your source code
     \item A nice name for the executable
     \end{itemize}
 \end{block}
 
 \begin{block}{-l -L and -I}
 \texttt{gcc -l libraries -L path\_to\_libraries -I path\_to\_header\_files -o name\_of\_executable mycode.c}\\
 \end{block}
 
 \end{frame}
 
 
 \begin{frame}[fragile]{Sequential 'Hello World' with shared libraries}
 
 \begin{block}{In case you were wondering...}
 
 \begin{verbatim}
 $ gcc -fPIC -c output.c
 $ gcc -shared -o liboutput.so output.o 
 $ pwd
 /home/scitas/using-mpi/ex1
 $ gcc hello.c -loutput -L `pwd` -I `pwd` -o hi
 $ export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
 $ ./hi 
 Hello World!
 \end{verbatim}
 Now try running \texttt{ldd} on the executable - and see the next slide for what happens without \texttt{LD\_LIBRARY\_PATH}
 \end{block}
 \end{frame}
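 
 %---------------------
 
 \begin{frame}[fragile]{What happens without LD\_LIBRARY\_PATH?}
 
 \begin{block}{A quick experiment with the example above}
 \begin{verbatim}
 $ ldd ./hi              # liboutput.so resolves via LD_LIBRARY_PATH
 $ unset LD_LIBRARY_PATH
 $ ldd ./hi              # liboutput.so => not found
 $ ./hi                  # fails to start
 \end{verbatim}
 This is the "cannot open shared object file" error shown later on the
 common errors slide.
 \end{block}
 
 \end{frame}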
 
 
 
 %---------------------
 
 \begin{frame}{Modules}
 
 \begin{block}{How software is organised on the clusters}
 Modules is a utility that allows multiple, often incompatible, tools and libraries to coexist on a cluster. We use Lmod, which is an extension of the classical modules tool. 
 \end{block}
 
 \begin{block}{Load modules to see more modules}
    \begin{itemize}
     \item module avail
     \item module load compiler
     \item module avail
     \item module load MPI
     \item module avail
-    \item there is an associated BLAS library (MKL or OpenBLAS)
     \end{itemize}
-\vspace{2mm}
-
+    \vspace{2mm}
+    Note that there is an associated BLAS library (MKL or OpenBLAS)
+  
    
 \end{block}
 
 
 \end{frame}
 
 
 
 %---------------------
 
 \begin{frame}{More Modules}
 
 \begin{block}{Commands}
    \begin{itemize}
    \item module purge
    \item module load gcc
    \item module load mvapich2
    \item module load hdf5 
    \item module list
    \item module help hdf5
    \item module show hdf5
    \end{itemize}
 \end{block}
 \end{frame}
 
 %---------------------
 
 \begin{frame}{Lmod features}
 \begin{block}{One compiler at a time}
    \begin{itemize}
    \item module purge
    \item module load gcc
    \item module load hdf5 
    \item module list
    \item module load intel
    \end{itemize}
 Similar behaviour for different flavours of the same package   
 \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{slmodules}
 \begin{block}{How we manage software}
 One "release" per year 
    \begin{itemize}
    \item slmodules -r deprecated
    \item slmodules
    \item slmodules -s foo
    \end{itemize}
 By default you see the architecture (\texttt{\$SYS\_TYPE})  of the system you are connected to. \\  
 \vspace{2mm} 
 \texttt{Future} becomes \texttt{stable} and \texttt{stable} becomes \texttt{deprecated} in July.  
 \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{MPICC and friends}
 
 \begin{block}{mpicc / mpiicc / mpicxx / mpif77 / mpif90 / mpiifort}
 These are wrappers to the underlying compiler that add the correct options to link with the MPI libraries
    \begin{itemize}
    \item mpicc - C wrapper
    \item mpiicc - Intel C wrapper
    \item mpiifort - Intel Fortran wrapper
    \end{itemize}
 Check the MPI flavour documentation for more details
 \end{block}
 
 \begin{block}{mpicc mycode.c }
 To use the wrappers simply type:
    \begin{itemize}
    \item module load mympiflavour/version
    \item mpicc hello.c -o hi
     \end{itemize}
 \end{block}    
     
 \end{frame}
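 
 %---------------------
 
 \begin{frame}[fragile]{Seeing what the wrapper does}
 
 \begin{block}{A sketch - the exact output depends on the MPI flavour}
 The wrappers simply call the underlying compiler with extra options:
 \begin{verbatim}
 # print the real compiler command without running it
 mpicc -show hello.c       # MVAPICH2 / IntelMPI wrappers
 mpicc --showme hello.c    # OpenMPI wrapper
 
 # then build as usual
 mpicc hello.c -o hi
 \end{verbatim}
 \end{block}
 
 \end{frame}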
 
 
 %---------------------
 
 \begin{frame}{Example 2 - Build parallel MPI-based 'Hello World'}
 
 \begin{block}{Load modules}
 \texttt{module load intel intelmpi}\\
 \end{block}
 
 \begin{block}{Compile-link}
 \texttt{mpiicc -g -o hello\_mpi hello\_mpi.c}\\
 \end{block}
 
 \begin{block}{Run two tasks on two different nodes}
 \texttt{srun -N2 -n2 --partition=debug ./hello\_mpi}\\
 \texttt{Hello world: I am task rank 0, running on node 'b292'}\\
 \texttt{Hello world: I am task rank 1, running on node 'b293'}\\
 \end{block}
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Configure and Make}
 \begin{block}{The traditional way to build packages}
    \begin{itemize}
    \item \texttt{./configure --help}
    \item \texttt{./configure --prefix=X --option=Y}
    \item \texttt{make}
    \item \texttt{make install}
     \end{itemize}
 
 \end{block}  
 \end{frame}
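 
 %---------------------
 
 \begin{frame}[fragile]{Configure and Make - a worked sketch}
 
 \begin{block}{Package name, version and paths are illustrative}
 \begin{verbatim}
 tar xf mylib-1.0.tar.gz
 cd mylib-1.0
 ./configure --prefix=$HOME/opt/mylib CC=icc
 make -j 4
 make install
 \end{verbatim}
 With \texttt{--prefix} pointing into your home directory no special
 permissions are needed for \texttt{make install}.
 \end{block}
 
 \end{frame}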
 
 %---------------------
 
 \begin{frame}{cmake}
 \begin{block}{cmake is a better way to do things!}
    \begin{itemize}
    \item \texttt{cmake -DCMAKE\_INSTALL\_PREFIX:PATH=X \\-DOption=Y <sources>}
    \item \texttt{make}
    \item \texttt{make install}
     \end{itemize}
 If you're starting a project from scratch then we recommend using cmake rather than configure. There's also a curses-based interactive front end called \texttt{ccmake}.
 
 \end{block}  
 \end{frame}
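 
 %---------------------
 
 \begin{frame}[fragile]{cmake - a worked sketch}
 
 \begin{block}{An out-of-source build - paths are illustrative}
 \begin{verbatim}
 cd mylib-1.0
 mkdir build && cd build
 cmake -DCMAKE_INSTALL_PREFIX:PATH=$HOME/opt/mylib ..
 make
 make install
 \end{verbatim}
 Keeping the build directory separate from the sources makes it easy to
 start again from scratch.
 \end{block}
 
 \end{frame}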
 
 
 %---------------------
 \begin{frame}{MPI and the Batch System}
 
 \begin{block}{Telling SLURM what we need}
 We would like 64 processes over 4 nodes\\
 \vspace{2mm}
 \texttt{\#SBATCH --nodes 4}\\
 \texttt{\#SBATCH --ntasks-per-node 16}\\
 \texttt{\#SBATCH --cpus-per-task 1}\\
 \texttt{\#SBATCH --mem 32000}\\
 \vspace{2mm}
 Remember that the memory is per node!
 \end{block}
 
 \end{frame}
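 
 %---------------------
 
 \begin{frame}[fragile]{A complete job script}
 
 \begin{block}{A minimal sketch - module names and the executable are illustrative}
 \begin{verbatim}
 #!/bin/bash
 #SBATCH --nodes 4
 #SBATCH --ntasks-per-node 16
 #SBATCH --cpus-per-task 1
 #SBATCH --mem 32000
 
 module purge
 module load intel intelmpi
 
 srun ./mycode.x
 \end{verbatim}
 Submit it with \texttt{sbatch myjob.run}
 \end{block}
 
 \end{frame}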
 
 
 \begin{frame}{Alternative formulations}
 
 \begin{block}{We would like 64 processes }
 \texttt{\#SBATCH --ntasks 64}\\
 \texttt{\#SBATCH --cpus-per-task 1}\\
 \texttt{\#SBATCH --mem 32000}\\
 \vspace{2mm}
 SLURM will find the space for 64 tasks on as few nodes as possible
 \end{block}
 
 \begin{block}{We would like 16 processes each one needing 4 cores}
 \texttt{\#SBATCH --ntasks 16}\\
 \texttt{\#SBATCH --cpus-per-task 4}\\
 \texttt{\#SBATCH --mem 32000}\\
 \vspace{2mm}
 SLURM will allocate 64 cores in total\\
 \vspace{2mm}
 Note: SLURM does not set \texttt{OMP\_NUM\_THREADS} for OpenMP! (see the sketch on the next slide)
 \end{block}
 
 
 \end{frame}
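 
 %---------------------
 
 \begin{frame}[fragile]{Hybrid jobs - set OMP\_NUM\_THREADS yourself}
 
 \begin{block}{A sketch for 16 tasks with 4 threads each - the executable is illustrative}
 \begin{verbatim}
 #SBATCH --ntasks 16
 #SBATCH --cpus-per-task 4
 #SBATCH --mem 32000
 
 # SLURM exports SLURM_CPUS_PER_TASK but not OMP_NUM_THREADS
 export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
 srun ./myhybridcode.x
 \end{verbatim}
 \end{block}
 
 \end{frame}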
 
 
 %---------------------
 
 \begin{frame}{srun and mpirun}
 
 \begin{block}{Launching an MPI job}
 Now that we have an MPI code we need some way of correctly launching it across multiple nodes
    \begin{itemize}
    \item \texttt{srun} - SLURM's built-in job launcher
    \item \texttt{mpirun} - the "traditional" job launcher 
    \end{itemize}
 To launch with \texttt{srun} we type\\
 \vspace{2mm}
 \texttt{srun mycode.x}\\
 \vspace{2mm}
 With the directives on the previous slide this will launch 64 processes on 4 nodes
 \end{block}    
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Multiple srun instances on one node}
 
 \begin{block}{For code that doesn't scale...}
 \texttt{\#SBATCH --nodes 1}\\
 \texttt{\#SBATCH --ntasks 16}\\
 \texttt{\#SBATCH --cpus-per-task 1}\\
 \texttt{\#SBATCH --mem 32000}\\
 \vspace{2mm}
 \texttt{export I\_MPI\_FABRICS=shm  \# if IntelMPI}\\
 \texttt{srun --mem=16000 -n 8 mytask1 \&}\\
 \texttt{srun --mem=16000 -n 8 mytask2 \&}\\
 \texttt{wait}\\
 \vspace{2mm}
 Note: the \texttt{--multi-prog} option for srun can provide a more elegant solution!
 
 \end{block}    
 
 
 \end{frame}
 
 
 
 %---------------------
 
 
 
 \begin{frame}{Intel mpirun}
 \begin{block}{Using IntelMPI and mpirun}
 On our clusters IntelMPI is configured to work with srun by default. If you want to use mpirun then do as follows:
    \begin{itemize}
    \item \texttt{unset I\_MPI\_PMI\_LIBRARY}
    \item \texttt{export SLURM\_CPU\_BIND=none}
    \item \texttt{mpirun ./mycode.x}
    \end{itemize}
        We don't advise doing this and strongly recommend using srun! Please note that, behind the scenes, mpirun still uses SLURM.
 
 \end{block} 
 \end{frame}   
 %---------------------
 
 \begin{frame}{CPU affinity}
 
 \begin{block}{What is it?}
 CPU affinity is the mechanism by which a process is bound to a specific CPU (core) or to a set of cores.
 \end{block}
 
 \begin{block}{Why?}
 If no mask is set the OS may move the task between cores every 100ms or so, which can be very bad for performance.\\
 \vspace{2mm}
 We can also optimise placement of ranks with respect to the underlying hardware.
 \end{block}
 
 \end{frame}
 
 \begin{frame}{ccNUMA}
 
 
 \begin{block}{Cache Coherent Non Uniform Memory Architecture}
 This is what compute nodes with more than one processor look like... 
 \end{block}
 
 \includegraphics[width=10cm]{images/ccNUMA.pdf}
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{CPU bitmasks}
 
 \begin{block}{11000000}
 When talking about affinity we use the term "mask" or "bit mask", which is a convenient way of representing which cores are part of a CPU set.\newline If we have an 8-core system then the following masks mean:
    \begin{itemize}
    \item \texttt{10000000} - core 8
    \item \texttt{01000000} - core 7
    \item \texttt{00100000} - core 6
    \item \texttt{11110000} - cores 5 to 8
    \item \texttt{00001111} - cores 1 to 4
    \end{itemize}
 \end{block}
 
 
 
 
 \end{frame}
 
 %---------------------
 
 \begin{frame}{CPU bitmasks}
 
 
 \begin{block}{11110000 is f0}
 These numbers can be conveniently written in hexadecimal, so if we query the system for CPU masks (with \texttt{taskset}, for example - see the next slide) we will see something like:\\
 \vspace{5mm}
 pid 8092's current affinity mask: 1c0\\
 pid 8097's current affinity mask: 1c0000\\
 \vspace{5mm}
 In binary this would translate to\\
 \vspace{5mm}
 pid 8092's current affinity mask: \texttt{000000000000000111000000}\\
 pid 8097's current affinity mask: \texttt{000111000000000000000000}\\
 \end{block}
 
 \end{frame}
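 
 %---------------------
 
 \begin{frame}[fragile]{Querying and setting masks by hand}
 
 \begin{block}{taskset - the pid and core list are illustrative}
 \begin{verbatim}
 # query the affinity mask of a running process
 $ taskset -p 8092
 pid 8092's current affinity mask: 1c0
 
 # launch a process restricted to cores 0-3
 $ taskset -c 0-3 ./mycode.x
 \end{verbatim}
 Inside a job it is usually better to let srun set the masks, as shown
 on the next slide.
 \end{block}
 
 \end{frame}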
 
 %---------------------
 
 \begin{frame}{Binding with srun}
 
 \begin{block}{Examples}
 \texttt{srun  -N 1 -n 4 -c 1 --cpu\_bind=verbose,rank  ./hi 1}\\
 %srun -N 1 -n 4 -c 1 --cpu_bind=verbose,rank ./hi 1\\
 \texttt{cpu\_bind=RANK - b370, task  0 : mask 0x1 set}\\
 \texttt{cpu\_bind=RANK - b370, task  1 : mask 0x2 set}\\
 \texttt{cpu\_bind=RANK - b370, task  2 : mask 0x4 set}\\
 \texttt{cpu\_bind=RANK - b370, task  3 : mask 0x8 set}\\
 \vspace{5mm}
 \texttt{srun -N 1 -n 4 -c 4 --cpu\_bind=verbose,sockets ./hi 1}\\
 \texttt{cpu\_bind=MASK - b370, task  1 : mask 0xff00 set}\\
 \texttt{cpu\_bind=MASK - b370, task  2 : mask 0xff set}\\
 \texttt{cpu\_bind=MASK - b370, task  0 : mask 0xff set}\\
 \texttt{cpu\_bind=MASK - b370, task  3 : mask 0xff00 set}
 \end{block}
 
 \end{frame}
 
 
 
 %---------------------
 
 \begin{frame}{Common errors}
 
 \begin{block}{Compiled on a different machine}
 \texttt{Please verify that both the operating system and the processor support Intel  MOVBE, FMA, BMI, LZCNT and AVX2 instructions.}
 \end{block}
 
 \begin{block}{LD\_LIBRARY\_PATH not correctly set}
 
 \texttt{./run.x: error while loading shared libraries: libmkl\_intel\_lp64.so: cannot open shared object file: No such file or directory}
 
 \end{block}
 
 \end{frame}
 
 
 
 \begin{frame}{Don't forget the srun}
 
 \begin{block}{./mympicode.x instead of srun mympicode.x}
 
 \texttt{Fatal error in MPI\_Init: Other MPI error, error stack:}\\
 \texttt{.MPIR\_Init\_thread(514): }\\
 \texttt{.MPID\_Init(320).......: channel initialization failed}\\
 \texttt{.MPID\_Init(716).......: PMI\_Get\_id returned 14}\\
 
 \end{block}
 
 \end{frame}
 
 
 
 
 %---------------------
 
 \begin{frame}{If things don't work}
 \begin{block}{Try interactively}
 Errors are much more visible this way
    \begin{itemize}
    \item \texttt{salloc -N 2 -n 32 -t 01:00:00 --partition debug}\\
    \item \texttt{srun mycode.x < inp.in}\\
    \end{itemize}
 
 \end{block}
 
 \begin{block}{Check what's going on with htop and ps}
    \begin{itemize}
    \item \texttt{ssh b123}\\
    \item \texttt{htop}\\
    \item \texttt{ps auxf}\\
    \end{itemize}
 \end{block}
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{If things still don't work}
 
 \begin{block}{Crashes or won't start}
    \begin{itemize}
    \item Reference input files
    \item GDB
    \item TotalView Debugger
    \end{itemize}
 
 \end{block}
 
 
 \begin{block}{Crashes after a while}
 Memory Leak?\\
    \begin{itemize}
    \item Valgrind
    \item MemoryScape (TotalView)
    \end{itemize}
 
 \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Some useful tricks}
 
 \begin{block}{Compilers}
 
    \begin{itemize}
    \item icc -xAVX -axCORE-AVX2 
    \item icc -mkl mycode.c
    \item mpiicc -show mycode.c
    \end{itemize}
 
 \end{block}
 
 \begin{block}{MKL link line advisor}
 
 \url{https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor}
 
 \end{block}
 
 \begin{block}{SCITAS documentation}
 
 \url{http://scitas.epfl.ch/documentation/compiling-code-different-systems}
 
 \end{block}
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Going further}
 
 \begin{block}{SCITAS offers courses in}
 
    \begin{itemize}
    \item MPI, an introduction to parallel programming
    \item MPI, advanced parallel programming
    \item Introduction to profiling and software optimisation
    \item Computing on GPUs
    \end{itemize}
 
 \end{block}
 
 
 \end{frame}
 
 
 %---------------------
 
 \begin{frame}{Exercise - Build Octopus}
 
 \begin{block}{Download package}
 \texttt{http://www.tddft.org/programs/octopus}
 \end{block}
 
 \begin{block}{Hints}
 \texttt{- load modules:}\\
 \texttt{\tiny{  intel intelmpi fftw gsl}}\\
 \texttt{- first build libxc}\\
 \texttt{- some configure options to use for parallel octopus:}\\
 \texttt{\tiny{  --enable-openmp --enable-mpi}}\\
 \texttt{\tiny{  --disable-zdotc-test}}\\
 \texttt{\tiny{  --with-blas="-L\$\{MKL\_ROOT\}/lib/intel64 -lmkl\_intel\_lp64 -lmkl\_core -lmkl\_intel\_thread -lpthread"}}\\
 \texttt{\tiny{  --with-fft-lib="-L\$\{FFTW\_LIBRARY\} -lfftw3 -lfftw3\_threads"}}
 \end{block}
 
 \end{frame}
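 
 %---------------------
 
 \begin{frame}[fragile]{Exercise - a possible starting point}
 
 \begin{block}{A sketch only - versions, paths and extra options will differ}
 \tiny
 \begin{verbatim}
 module purge
 module load intel intelmpi fftw gsl
 
 # libxc first, installed into your home directory
 cd libxc-<version>
 ./configure --prefix=$HOME/opt/libxc CC=icc FC=ifort
 make && make install
 
 # then octopus, with the options from the hints
 cd ../octopus-<version>
 ./configure --prefix=$HOME/opt/octopus \
             --enable-openmp --enable-mpi --disable-zdotc-test
 make && make install
 \end{verbatim}
 \normalsize
 Check \texttt{./configure --help} for how to point octopus at libxc and at
 the BLAS/FFT link lines given in the hints.
 \end{block}
 
 \end{frame}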
 
 %---------------------
 
diff --git a/slides.pdf b/slides.pdf
index 14ad497..35f5dff 100644
Binary files a/slides.pdf and b/slides.pdf differ