diff --git a/day1/optim.tex b/day1/optim.tex index f21c892..cfd2cdc 100644 --- a/day1/optim.tex +++ b/day1/optim.tex @@ -1,528 +1,528 @@ \subsection{Debugging-Profiling-Optimization-Parallelization} \begin{frame}[containsverbatim] \frametitle{SDLC} \begin{figure}[ht!] \centering - \includegraphics[width=85mm]{day1/images/SDLC.jpg} + \includegraphics[width=7cm]{day1/images/SDLC.jpg} \end{figure} \end{frame} \begin{frame} \frametitle{Before you start your parallel implementation} \begin{itemize} \item {\bf You have no serial code : } design your application in a parallel way from scratch \item {\bf You have a serial code :} follow a Debugging-Profiling-Optimization cycle before any parallelization \end{itemize} \end{frame} \subsubsection{Debugging} \begin{frame} \frametitle{Debugging ?} \begin{itemize} \item Find and correct bugs within an application \item Bugs can be of various kinds : division by zero, buffer overflow, null pointer, infinite loops, etc. \item The compiler is (very) rarely able to recognize a bug at compilation time and the error message is (very) rarely explicit about the bug (``syntax error'') \item Use standard tools like {\tt gdb} \item A multi-threaded code can be tricky to debug (race conditions, deadlocks, etc.) \item (Complex) tools exist for parallel debugging : {\tt TotalView}, {\tt Allinea DDT} or recently {\tt Eclipse PTP} \end{itemize} \end{frame} \subsubsection{Profiling} \begin{frame} \frametitle{Profiling ?} Where do I spend most of the time ? \begin{itemize} \item (good) using tools like {\tt gprof} or {\tt Intel VTune Amplifier} \item (bad) ``by hand'' using timings and {\tt printf}'s \end{itemize} \end{frame} \begin{frame} \frametitle{Profiling ?} What should be profiled ? \begin{itemize} \item TTS (Time To Solution) \item best usage of resources (storage, memory, etc.) \item scaling behavior of the application \item \dots 
\end{itemize} \end{frame} \begin{frame}[containsverbatim] \frametitle{Profiling : an example with gprof} \begin{itemize} \item {{\bf MiniFE as test application} \begin{itemize} \item 3D implicit finite-elements on an unstructured mesh \item mini-application written in C++ \item \url{http://www.mantevo.org} \end{itemize} } \item compile with {\tt -pg -g -O3 -ftree-vectorize} \item run it. It should produce a {\tt gmon.out} file \item then profile it with {\tt gprof miniFE.x} \end{itemize} \end{frame} \begin{frame}[containsverbatim] \frametitle{Profiling : an example with gprof} Size : ($128 \times 128 \times 128$) \begin{Verbatim}[fontsize=\tiny] Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 62.15 2.61 2.61 1 2.61 2.61 void miniFE::cg_solve 8.57 2.97 0.36 2 0.18 0.18 void miniFE::impose_dirichlet 5.71 3.21 0.24 7471812 0.00 0.00 int miniFE::find_row_for_id 5.71 3.45 0.24 274625 0.00 0.00 void miniFE::Hex8::diffusionMatrix_symm 4.76 3.65 0.20 274625 0.00 0.00 void miniFE::sum_in_symm_elem_matrix 2.62 3.76 0.11 2197000 0.00 0.00 void miniFE::Hex8::gradients_and_invJ_and_detJ 1.90 3.84 0.08 274625 0.00 0.00 void miniFE::get_elem_nodes_and_coords 1.90 3.92 0.08 1 0.08 0.08 int miniFE::verify_solution 1.67 3.99 0.07 2197000 0.00 0.00 void miniFE::Hex8::gradients_and_detJ 0.95 4.03 0.04 274625 0.00 0.00 void miniFE::Hex8::sourceVector 0.95 4.07 0.04 1 0.04 0.04 void miniFE::make_local_matrix 0.71 4.10 0.03 1 0.03 0.03 std::vector::_M_fill_insert 0.71 4.13 0.03 1649773 0.00 0.00 miniFE::mytimer() 0.71 4.16 0.03 1 0.03 0.31 int miniFE::generate_matrix_structure 0.48 4.18 0.02 1 0.02 0.03 void miniFE::create_map_id_to_row 0.24 4.19 0.01 8 0.00 0.00 void miniFE::get_ids 0.24 4.20 0.01 1 0.01 0.27 void miniFE::init_matrix 0.00 4.20 0.00 270400 0.00 0.00 void sort_if_needed 0.00 4.20 0.00 33282 0.00 0.00 std::_Rb_tree ... 
\end{Verbatim} \end{frame} \begin{frame} \frametitle{Profiling : an example with gprof} What do we learn ? \begin{itemize} \item 62.15 \% of the time is spent in the solver (Conjugate Gradient) \item 8.57 \% is spent in imposing the boundary conditions \item etc. \item These figures hold for that specific problem size ($128 \times 128 \times 128$). Is the profile similar for a larger or smaller one ? \end{itemize} \end{frame} \begin{frame}[containsverbatim] \frametitle{Profiling : an example with gprof} Smaller ($16 \times 16 \times 16$) \begin{Verbatim}[fontsize=\tiny] Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls ms/call ms/call name 100.01 0.01 0.01 1 10.00 10.00 void miniFE::cg_solve 0.00 0.01 0.00 18605 0.00 0.00 int miniFE::find_row_for_id 0.00 0.01 0.00 5832 0.00 0.00 void miniFE::Hex8::gradients_and_detJ 0.00 0.01 0.00 5832 0.00 0.00 void miniFE::Hex8::gradients_and_invJ_and_detJ 0.00 0.01 0.00 4907 0.00 0.00 miniFE::mytimer() 0.00 0.01 0.00 729 0.00 0.00 void sort_if_needed 0.00 0.01 0.00 729 0.00 0.00 void miniFE::sum_in_symm_elem_matrix 0.00 0.01 0.00 729 0.00 0.00 void miniFE::get_elem_nodes_and_coords 0.00 0.01 0.00 729 0.00 0.00 void miniFE::Hex8::sourceVector 0.00 0.01 0.00 729 0.00 0.00 void miniFE::Hex8::diffusionMatrix_symm 0.00 0.01 0.00 578 0.00 0.00 std::_Rb_tree