\begin{frame}
\centering
\titlepage
\end{frame}
%---------------------
\begin{frame}{Welcome}
\begin{block}{What you will learn}
How to compile and launch MPI codes on the SCITAS clusters, along with a bit of the "why"
\end{block}
\vspace{1cm}
\begin{block}{What you will not learn}
How to write parallel code and optimise it - there are other courses for that!
\end{block}
\end{frame}
%---------------------
\begin{frame}{Compilation}
\begin{block}{From code to binary}
Compilation is the process by which code (C, C++, Fortran, etc.) is transformed into a binary that can be run on a CPU.
\end{block}
\begin{block}{CPUs are not all the same}
\begin{itemize}
\item CPUs have different features and instruction sets
\item The same code needs to be recompiled for different architectures
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{What is MPI?}
\begin{block}{What is MPI?}
\begin{itemize}
\item Message Passing Interface
\item De facto standard for distributed memory parallelisation
\item Open standard with multiple implementations - now at version 3.0
\item Scales to very large systems
\end{itemize}
\end{block}
\begin{block}{Shared vs Distributed Memory}
\begin{itemize}
\item Shared - all tasks see all the memory (e.g. OpenMP)
\item Distributed - each task only sees a small part of the overall memory
\end{itemize}
Clusters are distributed memory systems, so MPI is well suited to them.
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPI Terminology}
\begin{block}{Words that you are going to hear}
\begin{itemize}
\item Rank - the ID by which MPI tasks are organised
\item Rank 0 to N - the "worker" tasks
\item Hybrid - a code that combines shared memory parallelisation with MPI
\end{itemize}
Pure MPI codes generally run one rank per core.
\end{block}
\end{frame}
%---------------------
\begin{frame}{Compilers - Intel vs GCC}
\begin{block}{GNU Compiler Collection}
\begin{itemize}
\item The industry standard and available everywhere
\item Quick to support new C++ language features
\item Fortran support used to be poor
\end{itemize}
\end{block}
\begin{block}{Intel Composer}
\begin{itemize}
\item Claims to produce faster code on Intel CPUs
\item Better Fortran support
\item Generally much stricter with bad code!
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPI - Intel vs MVAPICH2 vs OpenMPI}
\begin{block}{Why are there different flavours?}
There are multiple MPI flavours that comply with the specification, and each claims to have some advantage over the others. Some are vendor specific and others are open source.
\end{block}
\begin{block}{The main contenders}
\begin{itemize}
\item Intel MPI - commercial MPI with support
\item MVAPICH2 - developed by Ohio State University for InfiniBand
\item OpenMPI - open source and widely used
\end{itemize}
At SCITAS we support Intel MPI and MVAPICH2
\end{block}
\end{frame}
%---------------------
\begin{frame}{Compiler and MPI choice}
\begin{block}{First choose your compiler}
\begin{itemize}
\item GCC or Intel
\item This might be a technical or philosophical choice
\end{itemize}
\end{block}
\begin{block}{The associated MPI is then}
\begin{itemize}
\item GCC with MVAPICH2
\item Intel with Intel MPI
\end{itemize}
This is a SCITAS restriction to prevent chaos - nothing technically stops one from mixing!\\
\vspace{2mm}
Both work well and have good performance.
\end{block}
\end{frame}
%---------------------
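\begin{frame}{A minimal MPI code (sketch)}
\begin{block}{What such a code looks like}
A rough sketch of a minimal MPI "Hello World" in C - the actual hello\_mpi.c built later in Example 2 may differ, but the calls below are the standard MPI ones for start-up, rank/size queries and shutdown.
\end{block}
\begin{block}{hello\_mpi.c (illustrative only)}
\texttt{\tiny{\#include <mpi.h>}}\\
\texttt{\tiny{\#include <stdio.h>}}\\
\texttt{\tiny{int main(int argc, char *argv[]) \{}}\\
\texttt{\tiny{~~int rank, size, len;}}\\
\texttt{\tiny{~~char name[MPI\_MAX\_PROCESSOR\_NAME];}}\\
\texttt{\tiny{~~MPI\_Init(\&argc, \&argv);~~~~~~~~~~~~~~~~/* start MPI */}}\\
\texttt{\tiny{~~MPI\_Comm\_rank(MPI\_COMM\_WORLD, \&rank);~~/* my rank */}}\\
\texttt{\tiny{~~MPI\_Comm\_size(MPI\_COMM\_WORLD, \&size);~~/* total number of ranks */}}\\
\texttt{\tiny{~~MPI\_Get\_processor\_name(name, \&len);~~~~/* which node am I on? */}}\\
\texttt{\tiny{~~printf("Hello world: I am task rank \%d of \%d, running on node '\%s'\textbackslash n",}}\\
\texttt{\tiny{~~~~~~~~~rank, size, name);}}\\
\texttt{\tiny{~~MPI\_Finalize();}}\\
\texttt{\tiny{~~return 0;}}\\
\texttt{\tiny{\}}}
\end{block}
\end{frame}
%---------------------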
%\begin{frame}{Linear Algebra Libraries}
%
%
%\end{frame}
%---------------------
\begin{frame}{The dark art of mangling}
\begin{block}{Mangling?}
The mechanism by which compilers encode symbol names, allowing multiple functions with the same name
\end{block}
\begin{block}{C/C++}
\begin{itemize}
\item GCC - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev}
\item Intel - \texttt{\_ZN5NOMAD10Eval\_PointD2Ev}
\end{itemize}
Result: C/C++ libraries are compatible between GCC and Intel
\end{block}
\begin{block}{Fortran}
\begin{itemize}
\item GCC - \texttt{\_\_h5f\_MOD\_h5fget\_access\_plist\_f}
\item Intel - \texttt{h5f\_mp\_h5fget\_access\_plist\_f\_}
\end{itemize}
Result: Fortran libraries are not compatible between GCC and Intel!
\end{block}
\end{frame}
%---------------------
\begin{frame}{Linking}
\begin{block}{How to use libraries}
Linking is the mechanism by which you can use libraries with your code.
\begin{itemize}
\item static - put everything in your executable
\item dynamic - keep the libraries outside and load them as needed
\end{itemize}
\end{block}
\begin{block}{Dynamic by default}
There are very few reasons to statically link code.
\end{block}
\end{frame}
%---------------------
\begin{frame}{What is linked?}
\begin{block}{ldd is your friend}
\texttt{ldd run.x}\\
\texttt{\tiny{linux-vdso.so.1 => (0x00007fffbfcf5000)}}\\
\texttt{\tiny{libmkl\_intel\_lp64.so => /ssoft/intel/15.0.0/RH6/all/x86\_E5v2/composer\_xe\_2015.2.164/mkl/lib/intel64/libmkl\_intel\_lp64.so}}\\
\texttt{\tiny{libmkl\_intel\_thread.so => /ssoft/intel/15.0.0/RH6/all/x86\_E5v2/composer\_xe\_2015.2.164/mkl/lib/intel64/libmkl\_intel\_thread.so}}\\
% libmkl_core.so => /ssoft/intel/15.0.0/RH6/all/x86_E5v2/composer_xe_2015.2.164/mkl/lib/intel64/libmkl_core.so (0x00007f2246954000)
% libiomp5.so => /ssoft/intel/15.0.0/RH6/all/x86_E5v2/composer_xe_2015.2.164/compiler/lib/intel64/libiomp5.so (0x00007f2246618000)
% libm.so.6 => /lib64/libm.so.6 (0x0000003acaa00000)
% libgcc_s.so.1 => /lib64/libgcc_s.so.1 (0x00000039e2400000)
\texttt{\tiny{...}}\\
\texttt{\tiny{libpthread.so.0 => /lib64/libpthread.so.0 (0x0000003acb200000)}}\\
\texttt{\tiny{libc.so.6 => /lib64/libc.so.6 (0x0000003acae00000)}}\\
\texttt{\tiny{libdl.so.2 => /lib64/libdl.so.2 (0x0000003acb600000)}}\\
\texttt{\tiny{/lib64/ld-linux-x86-64.so.2 (0x0000003aca600000)}}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Example 1 - Build sequential 'Hello World'}
\begin{block}{Compile the source files}
\texttt{gcc -c output.c}\\
\texttt{gcc -c hello.c}
\end{block}
\begin{block}{Link}
\texttt{gcc -o hello output.o hello.o}
\end{block}
\begin{block}{Run}
\texttt{./hello}\\
\texttt{Hello World!}
\end{block}
\end{frame}
%---------------------
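\begin{frame}{Example 1 - what the sources might contain}
\begin{block}{Two translation units (a sketch)}
The real hello.c and output.c are not reproduced in these slides; the hypothetical split below simply illustrates why there are two compile steps and a separate link step.
\end{block}
\begin{block}{output.c}
\texttt{\tiny{\#include <stdio.h>}}\\
\texttt{\tiny{/* hypothetical helper, compiled into output.o */}}\\
\texttt{\tiny{void output(const char *msg) \{ printf("\%s\textbackslash n", msg); \}}}
\end{block}
\begin{block}{hello.c}
\texttt{\tiny{void output(const char *msg);~/* resolved at link time from output.o */}}\\
\texttt{\tiny{int main(void) \{ output("Hello World!"); return 0; \}}}
\end{block}
\end{frame}
%---------------------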
\begin{frame}{Modules}
\begin{block}{How software is organised on the clusters}
Modules is a utility that allows multiple, often incompatible, tools and libraries to exist on a cluster.
\end{block}
\begin{block}{Naming convention}
\begin{itemize}
\item name / version / compiler
\item hdf5/1.8.14/gcc-4.4.7
\item The MPI flavour is implicit!
\end{itemize}
Note - compilers are backwards compatible so there is no need to have hdf5/1.8.14/gcc-4.8.3!
\end{block}
\end{frame}
%---------------------
\begin{frame}{More Modules}
\begin{block}{Commands}
\begin{itemize}
\item \texttt{module purge}
\item \texttt{module load gcc/4.8.3}
\item \texttt{module load mvapich2/2.0.1/gcc-4.4.7}
\item \texttt{module load hdf5/1.8.14/gcc-4.4.7}
\item \texttt{module list}
\item \texttt{module show gcc/4.8.3}
\end{itemize}
At present Modules will not prevent you from loading incompatible modules!
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPICC and friends}
\begin{block}{mpicc / mpiicc / mpicxx / mpif77 / mpif90 / mpiifort}
These are wrappers around the underlying compiler that add the correct options to link with the MPI libraries
\begin{itemize}
\item mpicc - C wrapper
\item mpiicc - Intel C wrapper
\item mpiifort - Intel Fortran wrapper
\end{itemize}
Check the MPI flavour documentation for more details
\end{block}
\begin{block}{mpicc mycode.c}
To use the wrappers simply type:
\begin{itemize}
\item \texttt{module load mympiflavour/version}
\item \texttt{mpicc hello.c -o hi}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Example 2 - Build // MPI-based 'Hello World'}
\begin{block}{Load modules}
\texttt{module load intel intelmpi}
\end{block}
\begin{block}{Compile-link}
\texttt{mpiicc -g -o hello\_mpi hello\_mpi.c}
\end{block}
\begin{block}{Run two tasks on two different nodes}
\texttt{srun -N2 -n2 --partition=debug ./hello\_mpi}\\
\texttt{Hello world: I am task rank 1, running on node 'b292'}\\
\texttt{Hello world: I am task rank 2, running on node 'b293'}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Configure and Make}
\begin{block}{The traditional way to build packages}
\begin{itemize}
\item \texttt{./configure --help}
\item \texttt{./configure --prefix=X --option=Y}
\item \texttt{make}
\item \texttt{make install}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{MPI and the Batch System}
\begin{block}{Telling SLURM what we need}
We would like 64 processes over 4 nodes\\
\vspace{2mm}
\texttt{\#SBATCH --nodes 4}\\
\texttt{\#SBATCH --ntasks-per-node 16}\\
\texttt{\#SBATCH --cpus-per-task 1}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
Remember that the memory is per node!
\end{block}
\end{frame}
%---------------------
\begin{frame}{Alternative formulations}
\begin{block}{We would like 64 processes}
\texttt{\#SBATCH --ntasks 64}\\
\texttt{\#SBATCH --cpus-per-task 1}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
SLURM will find the space for 64 tasks on as few nodes as possible
\end{block}
\begin{block}{We would like 16 processes, each one needing 4 cores}
\texttt{\#SBATCH --ntasks 16}\\
\texttt{\#SBATCH --cpus-per-task 4}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
SLURM will allocate 64 cores in total
\end{block}
\end{frame}
%---------------------
\begin{frame}{srun and mpirun}
\begin{block}{Launching an MPI job}
Now that we have an MPI code we need some way of correctly launching it across multiple nodes
\begin{itemize}
\item \texttt{srun} - SLURM's built-in job launcher
\item \texttt{mpirun} - the "traditional" job launcher
\end{itemize}
To use srun we type\\
\vspace{2mm}
\texttt{srun mycode.x}\\
\vspace{2mm}
With the directives shown earlier this will launch 64 processes on 4 nodes. A complete example job script is shown shortly.
\end{block}
\end{frame}
%---------------------
\begin{frame}{Intel mpirun}
\begin{block}{Using Intel MPI and mpirun}
On our clusters Intel MPI is configured to work with srun by default. If you want to use mpirun instead then proceed as follows:
\begin{itemize}
\item \texttt{unset I\_MPI\_PMI\_LIBRARY}
\item \texttt{mpirun ./mycode.x}
\end{itemize}
We don't advise doing this and strongly recommend using srun!
\end{block}
\end{frame}
%---------------------
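\begin{frame}{Putting it together - an example job script}
\begin{block}{A sketch, not a template to copy blindly}
One way of combining the directives and commands from the previous slides into a single submission script - the script name, module choice and executable name are just examples:
\end{block}
\begin{block}{job.run (hypothetical)}
\texttt{\#!/bin/bash}\\
\texttt{\#SBATCH --nodes 4}\\
\texttt{\#SBATCH --ntasks-per-node 16}\\
\texttt{\#SBATCH --cpus-per-task 1}\\
\texttt{\#SBATCH --mem 32000}\\
\vspace{2mm}
\texttt{module purge}\\
\texttt{module load intel intelmpi}\\
\vspace{2mm}
\texttt{srun ./mycode.x}
\end{block}
\begin{block}{Submit it}
\texttt{sbatch job.run}
\end{block}
\end{frame}
%---------------------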
\begin{frame}{CPU affinity}
\begin{block}{Kesako?}
CPU affinity is the name for the mechanism by which a process is bound to a specific CPU (core) or to a set of cores.
\end{block}
\begin{block}{Pourquoi?}
If a mask is not set the OS might place the task on different cores every 100ms or so. Depending on the cache structure this can be a very bad thing to do.\\
\vspace{2mm}
We can also optimise the placement of ranks with respect to the underlying hardware.
\end{block}
\end{frame}
%---------------------
\begin{frame}{CPU bitmasks}
\begin{block}{11000000}
When talking about affinity we use the term "mask" or "bit mask", which is a convenient way of representing which cores are part of a CPU set.\newline
If we have an 8 core system then the following masks mean:
\begin{itemize}
\item \texttt{10000000} - core 8
\item \texttt{01000000} - core 7
\item \texttt{00100000} - core 6
\item \texttt{11110000} - cores 5 to 8
\item \texttt{00001111} - cores 1 to 4
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{CPU bitmasks}
\begin{block}{11110000 is f0}
These numbers can be conveniently written in hexadecimal, so if we query the system regarding CPU masks we will see something like:\\
\vspace{5mm}
pid 8092's current affinity mask: 1c0\\
pid 8097's current affinity mask: 1c0000\\
\vspace{5mm}
In binary this translates to\\
\vspace{5mm}
pid 8092's current affinity mask: \texttt{000000000000000111000000}\\
pid 8097's current affinity mask: \texttt{000111000000000000000000}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Binding with srun}
\begin{block}{Examples}
\texttt{srun -N 1 -n 4 -c 1 --cpu\_bind=verbose,rank ./hi 1}\\
\texttt{cpu\_bind=RANK - b370, task 0 : mask 0x1 set}\\
\texttt{cpu\_bind=RANK - b370, task 1 : mask 0x2 set}\\
\texttt{cpu\_bind=RANK - b370, task 2 : mask 0x4 set}\\
\texttt{cpu\_bind=RANK - b370, task 3 : mask 0x8 set}\\
\vspace{5mm}
\texttt{srun -N 1 -n 4 -c 4 --cpu\_bind=verbose,sockets ./hi 1}\\
\texttt{cpu\_bind=MASK - b370, task 1 : mask 0xff00 set}\\
\texttt{cpu\_bind=MASK - b370, task 2 : mask 0xff set}\\
\texttt{cpu\_bind=MASK - b370, task 0 : mask 0xff set}\\
\texttt{cpu\_bind=MASK - b370, task 3 : mask 0xff00 set}
\end{block}
\end{frame}
%---------------------
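\begin{frame}{Checking affinity on a node}
\begin{block}{taskset}
One way to inspect the affinity of a running process is the Linux \texttt{taskset} utility - its output has the same form as the masks shown on the earlier CPU bitmasks slide. The node name and PID below are just the illustrative values from those slides:
\end{block}
\begin{block}{Example (sketch)}
\texttt{ssh b370}\\
\texttt{taskset -p 8092~~~\# affinity reported as a hex mask}\\
\texttt{taskset -cp 8092~~\# the same, reported as a list of core numbers}
\end{block}
\end{frame}
%---------------------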
\begin{frame}{Common errors}
\begin{block}{Compiled on a different machine}
\texttt{Please verify that both the operating system and the processor support Intel MOVBE, FMA, BMI, LZCNT and AVX2 instructions.}
\end{block}
\begin{block}{Module not loaded - LD\_LIBRARY\_PATH}
\texttt{./run.x}\\
\texttt{./run.x: error while loading shared libraries: libmkl\_intel\_lp64.so: cannot open shared object file: No such file or directory}
\end{block}
\end{frame}
%---------------------
\begin{frame}{If things don't work}
\begin{block}{Try interactively}
Errors are much more visible this way
\begin{itemize}
\item \texttt{salloc -N 2 -n 32 -t 01:00:00 --partition debug}
\item \texttt{srun mycode.x < inp.in}
\end{itemize}
\end{block}
\begin{block}{Check what's going on with htop and ps}
\begin{itemize}
\item \texttt{ssh b123}
\item \texttt{htop}
\item \texttt{ps auxf}
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{If things still don't work}
\begin{block}{Crashes or won't start}
\begin{itemize}
\item Reference input files
\item GDB
\item TotalView Debugger
\end{itemize}
\end{block}
\begin{block}{Crashes after a while}
Memory leak?
\begin{itemize}
\item Check with Ganglia
\item MemoryScape (TotalView)
\item Valgrind
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Going further}
\begin{block}{SCITAS offers courses in}
\begin{itemize}
\item MPI, an introduction to parallel programming
\item MPI, advanced parallel programming
\item Introduction to profiling and software optimisation
\item Computing on GPUs
\end{itemize}
\end{block}
\end{frame}
%---------------------
\begin{frame}{Exercise - Build Octopus}
\begin{block}{Download package}
\texttt{http://www.tddft.org/programs/octopus}
\end{block}
\begin{block}{Hints}
\texttt{- load modules:}\\
\texttt{\tiny{ intel/15.0.0 intelmpi/5.0.1 fftw/3.3.4/intel-15.0.0 gsl/1.16/intel-15.0.0}}\\
\texttt{- build libxc first}\\
\texttt{- some configure options to use for // octopus:}\\
\texttt{\tiny{ --enable-openmp --enable-mpi}}\\
\texttt{\tiny{ --disable-zdotc-test}}\\
\texttt{\tiny{ --with-blas="-L\$\{MKL\_ROOT\}/lib/intel64 -lmkl\_intel\_lp64 -lmkl\_core -lmkl\_intel\_thread -lpthread"}}\\
\texttt{\tiny{ --with-fft-lib="-L\$\{FFTW\_LIBRARY\} -lfftw3 -lfftw3\_threads"}}
\end{block}
\end{frame}
%---------------------
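\begin{frame}{Exercise - one possible outline}
\begin{block}{A sketch only}
One possible sequence, following the generic configure/make recipe from earlier. The install locations are arbitrary choices, and the exact libxc/octopus options should be checked with \texttt{./configure --help}:
\end{block}
\begin{block}{Outline (hypothetical paths and directory names)}
\texttt{\tiny{module load intel/15.0.0 intelmpi/5.0.1 fftw/3.3.4/intel-15.0.0 gsl/1.16/intel-15.0.0}}\\
\vspace{2mm}
\texttt{\tiny{\# libxc first; the prefix is an arbitrary choice}}\\
\texttt{\tiny{cd libxc-* \&\& ./configure --prefix=\$HOME/libxc \&\& make \&\& make install}}\\
\vspace{2mm}
\texttt{\tiny{\# then octopus, pointing it at libxc and adding the configure options from the hints}}\\
\texttt{\tiny{cd ../octopus-* \&\& ./configure --prefix=\$HOME/octopus --enable-openmp --enable-mpi ...}}\\
\texttt{\tiny{make \&\& make install}}
\end{block}
\end{frame}
%---------------------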