%
% @file mg_gbs.tex
%
% @brief
%
% @copyright
% Copyright (©) 2021 EPFL (Ecole Polytechnique Fédérale de Lausanne)
% SPC (Swiss Plasma Center)
%
% spclibs is free software: you can redistribute it and/or modify it under
% the terms of the GNU Lesser General Public License as published by the Free
% Software Foundation, either version 3 of the License, or (at your option)
% any later version.
%
% spclibs is distributed in the hope that it will be useful, but WITHOUT ANY
% WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
% FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
%
% You should have received a copy of the GNU Lesser General Public License
% along with this program. If not, see <https://www.gnu.org/licenses/>.
%
% @authors
% (in alphabetical order)
% @author Trach-Minh Tran <trach-minh.tran@epfl.ch>
%
\documentclass[a4paper]{article}
\usepackage{linuxdoc-sgml}
\usepackage{graphicx}
\usepackage{hyperref}
%\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{placeins}
\usepackage{multirow}
\usepackage{latexsym}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{rotating}
\def\RepFigures{FIGURES_mg_gbs}
\title{\tt Multigrid Solver for GBS}
\author{Trach-Minh Tran, Federico Halpern\\CRPP/EPFL}
\date{v1.0, June 2015}
\begin{document}
\maketitle
\tableofcontents
\section{The PDE}
The PDE considered is
\begin{equation}
\label{eq:pde}
\left[\frac{\partial^2}{\partial x^2} + \tau\frac{\partial^2}{\partial
x\partial y} + \frac{\partial^2}{\partial y^2} - a(x,y)\right] u(x,y) =
f(x,y), \qquad 0\le x \le L_x, \; 0\le y \le L_y.
\end{equation}
On each of the four boundaries, either a homogeneous Dirichlet boundary
condition $u=0$ or a homogeneous Neumann boundary condition
$\partial u/\partial n=0$ can be applied.
\section{Discretization}
The grid points $(x_i,y_j)$ are defined by
\begin{equation}
\begin{split}
x_i &= ih_x = i\frac{L_x}{N_x}, \quad i=0,\ldots, N_x \\
y_j &= jh_y = j\frac{L_y}{N_y}, \quad j=0,\ldots, N_y \\
\end{split}
\end{equation}
Second-order Finite Difference discretization of Eq.~\ref{eq:pde} leads
to the following 9-point stencil:
\begin{equation}
\label{eq:stencil}
S_{ij} = \frac{1}{h_x^2}
\begin{bmatrix}
-\tau\alpha/4 & \alpha^2 & \tau\alpha/4 \\
1 & -2(1+\alpha^2)-h_x^2a_{ij} & 1 \\
\tau\alpha/4 & \alpha^2 &-\tau\alpha/4 \\
\end{bmatrix}
, \qquad \mbox{where $\alpha=h_x/h_y$}.
\end{equation}
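The $\pm\tau\alpha/4$ corner entries simply follow from the standard
central-difference approximation of the mixed derivative,
\begin{equation*}
\left.\frac{\partial^2 u}{\partial x\partial y}\right|_{ij} \simeq
\frac{u_{i+1,j+1}-u_{i+1,j-1}-u_{i-1,j+1}+u_{i-1,j-1}}{4h_xh_y}
= \frac{\alpha}{4h_x^2}\left(u_{i+1,j+1}-u_{i+1,j-1}-u_{i-1,j+1}+u_{i-1,j-1}\right),
\end{equation*}
multiplied by the coefficient $\tau$.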
Note that a mesh aspect ratio $\alpha\neq 1$ results in the same stencil
as the \emph{anisotropic} Poisson equation discretized with $h_x=h_y$:
\begin{equation}
\frac{\partial^2 u}{\partial x^2} + \alpha^2
\frac{\partial^2 u}{\partial y^2} = f.
\end{equation}
It is shown in \cite[p.~119]{Briggs} that this anisotropy can strongly
degrade the performance of multigrid when standard relaxations such as
Gauss-Seidel or damped Jacobi are used.
With the \emph{lexicographic} numbering
\begin{equation}
I = j(N_x+1) + i+1,
\end{equation}
for the $(N_x+1)(N_y+1)$ nodes, the discretized problem can be
expressed as a matrix problem
\begin{equation}
\label{eq:matrix}
\mathbf{Au} = \mathbf{f},
\end{equation}
where $\mathbf{A}$ is a 9-diagonal matrix, assembled using the stencil defined
above. A homogeneous Dirichlet boundary condition can be imposed, for example, on
the face $j=0$ simply by \emph{clearing} the matrix rows and columns
$1,2,\ldots, N_x+1$ and setting the diagonal terms to 1.
A Neumann boundary condition $\partial u/\partial x=0$ at
the face $i=0$ can be implemented simply by imposing
$u_{-1,j}=u_{1,j}$. The stencil for the boundary nodes $(0,j)$
is thus modified to
\begin{equation}
S_{0j} = \frac{1}{h_x^2}
\begin{bmatrix}
0 &\alpha^2 & 0 \\
0 &-2(1+\alpha^2)-h_x^2a_{0j} & 2 \\
0 &\alpha^2 & 0 \\
\end{bmatrix}
.
\end{equation}
Two model problems are considered in this report:
\begin{description}
\item[\texttt{\textbf{DDDD}} problem:] Homogeneous Dirichlet BC
at all the 4 boundaries. The \emph{analytic solution} is
\begin{equation}
u(x,y) = \sin\frac{2\pi k_xx}{L_x}\sin\frac{2\pi k_yy}{L_y},
\qquad \mbox{where $k_x$, $k_y$ are positive integers}.
\end{equation}
\item[\texttt{\textbf{NNDD}} problem:] Neumann boundary conditions
$\partial u/\partial x=0$ at $x=0$ and $x=L_x$, homogeneous
Dirichlet BC at $y=0$ and $y=L_y$. The \emph{analytic solution} is
\begin{equation}
u(x,y) = \cos\frac{2\pi k_xx}{L_x}\sin\frac{2\pi k_yy}{L_y},
\qquad \mbox{where $k_x$, $k_y$ are positive integers}.
\end{equation}
\end{description}
In both problems, $a$ depends only on $x$:
\begin{equation}
\label{eq:density}
a(x,y)= \exp\left[-\frac{(x-L_x/3)^2}{(L_x/2)^2}\right].
\end{equation}
The sparse direct solver MUMPS \cite{MUMPS} is used to solve (\ref{eq:matrix}) in
order to check the convergence of the scheme described
above. Fig.~\ref{fig:convergence} shows the expected \emph{quadratic}
convergence of the error with respect to $h_x$ at fixed $\alpha=h_x/h_y=0.5$ for
both problems, when the grid size is varied from $16\times 64$ to $512\times 2048$.
\begin{figure}[hbt]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/convergence}
\caption{Convergence of the error
$\| u_{calc}- u_{anal}\|_\infty$ with respect to the number of intervals
in the $x$ direction $N_x$ for $L_x=100$, $L_y=800$, $k_x=k_y=4$, $\tau=1$ and $N_y=4N_x$.}
\label{fig:convergence}
\end{figure}
\section{Multigrid $V$-cycle}
\label{sec-mgProc}
Given an approximate $\mathbf{u}^h$ and right hand side $\mathbf{f}^h$
defined at some grid level represented by the grid spacing $h$, the
following MG $V$-cycle procedure
\begin{equation*}
\boxed{\mathbf{u}^h \leftarrow MG^h(\mathbf{u}^h,\mathbf{f}^h)}
\end{equation*}
will compute a \emph{new} $\mathbf{u}^h$. It is defined recursively by the
following steps:
\begin{enumerate}
\item If $h$ is the coarsest mesh size,
\begin{itemize}
\item Direct solve $\mathbf{A}^h\mathbf{u}^h=\mathbf{f}^h$
\item Goto 3.
\end{itemize}
\item Else
\begin{itemize}
\item Relax $\mathbf{u}^h$ $\nu_1$ times.
\item $\mathbf{f}^{2h} \leftarrow {\mathbf{R}}(\mathbf{f}^h-\mathbf{A}^h\mathbf{u}^h)$.
\item $\mathbf{u}^{2h} \leftarrow MG^{2h}(\mathbf{u}^{2h},\mathbf{f}^{2h})$ $\mu$ times.
\item $\mathbf{u}^h\leftarrow
\mathbf{u}^h+{\mathbf{P}}\mathbf{u}^{2h}$.
\item Relax $\mathbf{u}^h$ $\nu_2$ times.
\end{itemize}
\item Return
\end{enumerate}
In the procedure above, the operators $\mathbf{R}$ and $\mathbf{P}$
denote respectively the \emph{restriction} (from \emph{fine} to
\emph{coarse} grid) and the \emph{prolongation} (from \emph{coarse} to
\emph{fine} grid). Notice that in this multigrid procedure,
$\mathbf{R}$ applies only to the \emph{right hand side} while $\mathbf{P}$
applies only to the \emph{solution}. The standard $V(\nu_1,\nu_2)$ cycle is obtained by
calling this $MG^h$ procedure with $\mathbf{f}^h$ defined at the
\emph{finest} grid level, a guess $\mathbf{u}^h=0$ and $\mu=1$, while
$\mu=2$ results in the $W(\nu_1,\nu_2)$ cycle.
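In two-grid form, one such cycle acts on the error
$\mathbf{e}^h$ through the standard coarse-grid correction operator
(see e.g.~\cite{Briggs}),
\begin{equation*}
\mathbf{e}^h \leftarrow \mathbf{S}^{\nu_2}
\left[\mathbf{I}-\mathbf{P}\,(\mathbf{A}^{2h})^{-1}\,\mathbf{R}\,\mathbf{A}^h\right]
\mathbf{S}^{\nu_1}\,\mathbf{e}^h,
\end{equation*}
where $\mathbf{S}$ denotes the error propagation operator of one relaxation
sweep; the recursion simply replaces the exact coarse-grid solve
$(\mathbf{A}^{2h})^{-1}$ by $\mu$ applications of $MG^{2h}$.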
Details on the grid coarsening, the inter-grid transfers and methods
of relaxation are given in the following.
\subsection{Grid coarsening}
Let us start with the one-dimensional \emph{fine} grid defined
by $x_i,\, i=0,\ldots,N$, assuming that $N$ is even. The next coarse grid
(with $N/2$ intervals) is obtained by simply discarding the grid
points with \emph{odd} indices.
In order to get the \emph{smallest} possible \emph{coarsest} grid (so that the
problem can be solved \emph{cheaply} with a \emph{direct} method), $N$ should be
of the form $N=N_c2^{L-1}$, where $L$ is the total number of grid levels and $N_c$ is either
2 or a \emph{small odd} integer. As an example, the fine grid with $N=768$ can have
up to 9 grid levels, and a coarsest grid with 3 intervals, see
Table~\ref{tab:level}.
\begin{table}[hbt]
\centering
\begin{tabular}{|l||r|r|r|r|r|}\hline
$L$ & \multicolumn{5}{c|}{$N$} \\ \hline
1 & 2 & 3 & 5 & 7 & 9\\
2 & 4 & 6 & 10 & 14 & 18\\
3 & 8 & 12 & 20 & 28 & 36\\
4 & 16 & 24 & 40 & 56 & 72\\
5 & 32 & 48 & 80 & 112 & 144\\
6 & 64 & 96 & 160 & 224 & 288\\
7 & 128 & 192 & 320 & 448 & 576\\
8 & 256 & 384 & 640 & 896 & 1152\\
9 & 512 & 768 & 1280 & 1792 & 2304\\
10 & 1024 & 1536 & 2560 & 3584 & 4608\\
\hline
\end{tabular}
\caption{Set of values of the \emph{fine} grid number of intervals $N$
to obtain a \emph{coarsest} grid size at most equal to $9$ with
at most $10$ grid levels.}
\label{tab:level}
\end{table}
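As an illustration, the following small standalone Fortran program (not part
of the solver; names are illustrative) counts the grid levels available for a
given $N$ by repeated halving, stopping when the next coarse grid would have
fewer than 2 intervals; for $N=768$ it reproduces the 9 levels and the
coarsest grid of 3 intervals quoted above.
\begin{lstlisting}[language=Fortran,frame=single]
program coarsening_levels
  implicit none
  integer :: n, nc, levels
  n = 768                 ! number of intervals of the finest grid
  nc = n
  levels = 1
  do while (mod(nc,2) == 0 .and. nc/2 >= 2)   ! keep at least 2 intervals
     nc = nc/2            ! discard the odd-indexed grid points
     levels = levels + 1
  end do
  print '(a,i0,a,i0,a,i0)', 'N=', n, ': ', levels, ' levels, coarsest N=', nc
end program coarsening_levels
\end{lstlisting}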
For a two-dimensional grid, the same procedure is applied to both
dimensions. The result of such a procedure is illustrated in
Fig.~\ref{fig:2d_coarsening} for an $8\times4$ fine grid.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.6\hsize]{grid}
\caption{A \emph{coarse} $4\times 2$ grid ($\Box$) obtained from an
$8\times4$ fine grid ($\bullet$).}
\label{fig:2d_coarsening}
\end{figure}
\subsection{Inter-grid transfers}
The one-dimensional \emph{prolongation} operator for the second-order FD
discretization is chosen to be the same as the one obtained with
\emph{linear Finite Elements} \cite{MG1D}. For an $N=8$ grid, it can be
represented as the $9\times 5$ matrix given by
\begin{equation}
\label{eq:1dprolongation}
\mathbf{P} =
\left(
\begin{matrix}
1 & 0 & 0 & 0 & 0 \\
1/2 & 1/2 & 0 & 0 & 0 \\
0 & 1 & 0 & 0 & 0 \\
0 & 1/2 & 1/2 & 0 & 0 \\
0 & 0 & 1 & 0 & 0 \\
0 & 0 & 1/2 & 1/2 & 0 \\
0 & 0 & 0 & 1 & 0 \\
0 & 0 & 0 & 1/2 & 1/2\\
0 & 0 & 0 & 0 & 1 \\
\end{matrix}\right)
\end{equation}
The \emph{restriction} matrix $\mathbf{R}$ is simply related to $\mathbf{P}$ by
\begin{equation}
\label{eq:1drestriction}
\mathbf{R} = \frac{1}{2}\mathbf{P}^{T}=\frac{1}{2}\left(
\begin{matrix}
1&1/2&0&0&0&0&0&0&0\\
0&1/2&1&1/2&0&0&0&0&0\\
0&0&0&1/2&1&1/2&0&0&0\\
0&0&0&0&0&1/2&1&1/2&0\\
0&0&0&0&0&0&0&1/2&1\\
\end{matrix}
\right).
\end{equation}
For Dirichlet BC imposed on the \emph{left} boundary one has to set
${P}_{21}=R_{12}=0$, while for Dirichlet BC imposed on the \emph{right}
boundary, ${P}_{N,N/2+1}=R_{N/2+1,N}=0$. Notice that these inter-grid operators are
identical to the standard \emph{linear interpolation} and \emph{full
weighting} operators.
For a two-dimensional problem, using the property that the grid is a
\emph{tensor product} of two one-dimensional grids, the
restriction of the right hand side $f^{h}_{ij}$ and the prolongation of
the solution $u^{2h}_{ij}$ can be computed as
\begin{equation}
\label{eq:2dintergrid}
\begin{split}
\mathbf{f}^{2h} &= \mathbf{R}^x \cdot \mathbf{f}^{h}\cdot (\mathbf{R}^y)^T \\
\mathbf{u}^{h} &= \mathbf{\mathbf{P}}^x \cdot \mathbf{u}^{2h}\cdot
(\mathbf{\mathbf{P}}^y)^T \\
\end{split}
\end{equation}
\subsection{Relaxations}
Gauss-Seidel and damped Jacobi iterations are used as relaxation
methods in the multigrid $V$ cycle. In general, Gauss-Seidel
method is more efficient but much more difficult to
\emph{parallelize} than the Jacobi method.
It should be noted that if $a(x,y)$ in Eq.~\ref{eq:pde} is
negative, both relaxations diverge! This can be seen by
considering the following one-dimensional FD equation with uniform
$a$:
\begin{equation}
u_{j-1} -(2+ah^2)u_j + u_{j+1} = h^2f_j.
\end{equation}
Using the damped Jacobi relaxation, the error
$\epsilon^{(m)}_j\equiv u_{anal}(x_j)-u_j^{(m)}$ at iteration $m+1$ is given by
\begin{equation}
\epsilon^{(m+1)}_j =
\frac{\omega}{2+h^2a}(\epsilon^{(m)}_{j-1}+\epsilon^{(m)}_{j+1})
+(1-\omega)\epsilon^{(m)}_j .
\end{equation}
Performing a \emph{local mode analysis} (or Fourier analysis) (see
\cite[p.~48]{Briggs}), assuming that
$\epsilon^{(m)}_j=A(m)e^{ij\theta}$, where $\theta$ is related to the
mode number $k$ by $\theta=2\pi k/N$, the
\emph{amplification factor} $G(\theta)$ is obtained as
\begin{equation}
\begin{split}
G(\theta) &= \frac{A(m+1)}{A(m)} =
\frac{2\omega}{2+h^2a}\cos\theta + (1-\omega) \\
&= G_0(\theta) -\frac{ \omega h^2a}{2+h^2a}\cos\theta
\simeq G_0(\theta) -\frac{ \omega h^2a}{2}\cos\theta, \\
G_0(\theta) &=1-2\omega\sin^2\frac{\theta}{2}, \\
\end{split}
\end{equation}
where $G_0(\theta)$ is the amplification factor for
$a=0$. Note that $|G_0(\theta)|< 1$ for all $\theta\ne 0$ and $0<\omega< 1$,
but $\displaystyle{\max_{|\theta|<\pi}|G(\theta)|>1}$ if $a<0$.
In the Gauss-Seidel relaxation method, the errors evolve as:
\begin{equation}
\epsilon^{(m+1)}_j = \frac{\epsilon^{(m+1)}_{j-1}+\epsilon^{(m)}_{j+1}}{2+h^2a}.
\end{equation}
Applying again the same Fourier analysis yields the
following complex amplification factor:
\begin{equation}
\begin{split}
G(\theta) &\simeq G_0(\theta)\left(1-\frac{h^2a}{2-e^{-i\theta}}\right) \\
G_0(\theta) &=\frac{e^{i\theta}}{2-e^{-i\theta}}, \quad
|G_0(\theta)| < 1
\end{split}
\end{equation}
which shows that the Gauss-Seidel relaxation \emph{diverges} if $a<0$.
Notice that when $a>0$, the effect of $a$ on the amplification is
negligible and is thus ignored in the following two-dimensional
analysis. Applying the damped Jacobi scheme on the
stencil~(\ref{eq:stencil}), the error at the iteration $m+1$ is given
by:
\begin{equation}
\begin{split}
\epsilon^{(m+1)}_{ij} = & \frac{\omega}{2(1+\alpha^2)} \left[
\epsilon^{(m)}_{i-1,j} + \epsilon^{(m)}_{i+1,j} + \alpha^2(
\epsilon^{(m)}_{i,j-1} + \epsilon^{(m)}_{i,j+1}) + \frac{\tau\alpha}{4}(
\epsilon^{(m)}_{i+1,j+1}+\epsilon^{(m)}_{i-1,j-1} -
\epsilon^{(m)}_{i-1,j+1}-\epsilon^{(m)}_{i+1,j-1})
\right] \\
& + (1-\omega)\epsilon^{(m)}_{ij}. \\
\end{split}
\end{equation}
Using the two-dimensional Fourier mode expression
\begin{equation}
\epsilon^{(m)}_{ij} = A(m)e^{i(i\theta_1+j\theta_2)}, \quad -\pi <\theta_1,
\theta_2 \le \pi,
\end{equation}
the amplification factor $G=A(m+1)/A(m)$ is given
by
\begin{equation}
\label{eq:amp_jac}
G(\theta_1,\theta_2;\omega,\alpha,\tau)=1 - \frac{2\omega}{1+\alpha^2} \left(
\sin^2\frac{\theta_1}{2} + \alpha^2\sin^2\frac{\theta_2}{2} +
\frac{\tau\alpha}{4}\sin\theta_1\,\sin\theta_2
\right).
\end{equation}
The errors in Gauss-Seidel method, assuming a \emph{lexicographic} ordering
for the unknowns (increasing first $i$ then $j$), are updated according
to
\begin{equation}
\epsilon^{(m+1)}_{ij} = \frac{1}{2(1+\alpha^2)} \left[
\epsilon^{(m+1)}_{i-1,j} + \epsilon^{(m)}_{i+1,j} + \alpha^2(
\epsilon^{(m+1)}_{i,j-1} + \epsilon^{(m)}_{i,j+1}) + \frac{\tau\alpha}{4}(
\epsilon^{(m)}_{i+1,j+1}+\epsilon^{(m+1)}_{i-1,j-1} -
\epsilon^{(m)}_{i-1,j+1}-\epsilon^{(m+1)}_{i+1,j-1})
\right].
\end{equation}
The Fourier mode analysis then leads to the following complex
amplification factor
\begin{equation}
\label{eq:amp_gs}
G(\theta_1,\theta_2;\alpha,\tau) =
\frac{e^{i\theta_1} +
\left(\alpha^2+i\dfrac{\tau\alpha}{2}\sin\theta_1\right)e^{i\theta_2}}
{2(1+\alpha^2) - \left[e^{-i\theta_1}+\left(\alpha^2 -
i\dfrac{\tau\alpha}{2}\sin\theta_1\right)e^{-i\theta_2}\right]}.
\end{equation}
Curves of $G$ for \emph{fixed} $\theta_2$ are plotted in
Fig.~\ref{fig:fourier_jac} showing \emph{convergence} ($\max|G|<1$) for
$\tau=-1,0,1,2$, using the damped Jacobi method. The
same conclusions are obtained for Gauss-Seidel relaxations as shown
in Fig.~\ref{fig:fourier_gs} where the absolute values
of the complex amplification factor $G$ are plotted. However, for
$|\tau|>2$, both methods diverge, as can be seen in
Fig.~\ref{fig:relax_diverge}. Notice however that the PDE
(\ref{eq:pde}) is \emph{elliptic} only when $|\tau|<2$ is satisfied!
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.85\hsize]{\RepFigures/fourier_jac}
\caption{Amplification factor for damped Jacobi relaxations with
$\omega=0.8$ and $\alpha=h_x/h_y=1$ and $\tau=-1,0,1,2$ displayed as curves of
constant $\theta_2$. $\theta_2=0$ on the \emph{green} curve and $\pi$ on
the \emph{red} curve.}
\label{fig:fourier_jac}
\end{figure}
\begin{figure}[hbt]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/fourier_gs}
\caption{Absolute value of the amplification factor for Gauss-Seidel
relaxations with $\alpha=h_x/h_y=1$ and $\tau=-1,0,1,2$, displayed as curves of
constant $\theta_2$. $\theta_2=0$ on the \emph{green} curve and $\pi$ on
the \emph{red} curve.}
\label{fig:fourier_gs}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/relax_diverge}
\caption{Amplification factor for Jacobi (left) and Gauss-Seidel (right)
relaxations for $|\tau|=3,5$, $\alpha=h_x/h_y=1$, displayed as curves of
constant $\theta_2$. $\theta_2=0$ on the \emph{green} curve and $\pi$ on
the \emph{red} curve.}
\label{fig:relax_diverge}
\end{figure}
In summary, the local mode analysis predicts that
\begin{itemize}
\item Negative values of the coefficient $a$ and a large
mixed-derivative term ($|\tau| > 2$) can make both damped Jacobi and
Gauss-Seidel relaxations diverge.
\item Positive values of $a$ can decrease the amplification factor
(thus improving the convergence rate), but
their contribution $h^2a$ decreases with increasing grid resolution.
\end{itemize}
These predictions will be checked against numerical experiments in the
next section.
\FloatBarrier
\section{Numerical Experiments}
\label{sec:NumExp1}
In the following numerical experiments, we look at the convergence
rate of the residual norm and the error norm which are defined at the
iteration $m$ by
\begin{equation}
\begin{split}
r^{(m)} &= \|\mathbf{f}-\mathbf{A}\mathbf{u}^{(m)}\|_\infty, \\
e^{(m)} &= \|\mathbf{u}^{(m)}-\mathbf{u}_{anal}\|_\infty.\\
\end{split}
\end{equation}
The iterations are stopped when the number of iterations reaches a
user-supplied \emph{maximum} or when the residual norm
is smaller than either a given \emph{relative} tolerance {\tt rtol}
\cite[p.~51]{TEMPL} or an \emph{absolute} tolerance {\tt atol}:
\begin{equation}
\begin{split}
r^{(m)} &< \mbox{\tt rtol}
\cdot(\|\mathbf{A}\|_\infty\cdot\|\mathbf{u}^{(m)}\|_\infty +
\|\mathbf{f}\|_\infty), \\
r^{(m)} &< \mbox{\tt atol}.\\
\end{split}
\end{equation}
An additional stopping criterion consists of stopping the
iterations when the change of the discretization error between
successive iterations is small enough:
\begin{equation}
\frac{e^{(m)}-e^{(m-1)}}{e^{(m-1)}}< \mbox{\tt etol}.
\end{equation}
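For illustration, the combined stopping test described above could be sketched
as the following Fortran function (a minimal sketch; all names are illustrative
and the error-based test is taken here in absolute value):
\begin{lstlisting}[language=Fortran,frame=single]
! Illustrative sketch: returns .true. when any of the three criteria is met.
logical function stop_iter(res, mat_norm, sol_norm, rhs_norm, &
     &                     err, err_prev, rtol, atol, etol)
  implicit none
  double precision, intent(in) :: res, mat_norm, sol_norm, rhs_norm
  double precision, intent(in) :: err, err_prev, rtol, atol, etol
  stop_iter = res < rtol*(mat_norm*sol_norm + rhs_norm) &  ! relative tolerance
       .or.  res < atol                                 &  ! absolute tolerance
       .or.  abs(err - err_prev)/err_prev < etol           ! error stagnates
end function stop_iter
\end{lstlisting}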
\subsection{$V$-cycle performances}
Table~\ref{tab:iternum} shows the numbers of $V$-cycles required to
reach the \emph{relative tolerance}
$\mbox{\tt rtol}=10^{-8}$. In these runs, where $\alpha=0.5$, $\tau=1$
and $a(x,y)$ is given by Eq.~\ref{eq:density}, we observe that the
biggest improvement is obtained at $\nu_1=\nu_2=2$. For larger
$\nu_1,\nu_2$, the number of required iterations is relatively insensitive
to the grid sizes. As can be seen in Fig.~\ref{fig:mg_iterations},
which plots the evolution of the error $e^{(m)}$, it
is clear that the level of the discretization error has essentially been
reached. Finally, the times used by these runs are shown in
Fig.~\ref{fig:dddd_pc220} and Fig.~\ref{fig:nndd_pc220}, where the
times spent in the direct solver using MUMPS \cite{MUMPS} are included
for comparison. For the $512\times 2048$ grid (the largest
case using the direct solver), the multigrid $V(3,3)$ is about $30$
times faster!
\begin{table}[htb]
\centering
\begin{tabular}{|l||c|c|c|c||c|c|c|c|}\hline
& \multicolumn{4}{c||}{\texttt{\textbf{DDDD}} problem}
& \multicolumn{4}{c|}{\texttt{\textbf{NNDD}} problem} \\ \cline{2-9}
Grid size & $V(1,1)$ & $V(2,2)$ & $V(3,3)$ & $V(4,4)$
& $V(1,1)$ & $V(2,2)$ & $V(3,3)$ & $V(4,4)$ \\ \hline
$16\times 64$ & 3 & 2 & 2 & 1 & 4 & 2 & 2 & 1\\
$32\times 128$ & 5 & 3 & 2 & 2 & 5 & 3 & 2 & 2\\
$64\times 256$ & 7 & 4 & 3 & 3 & 7 & 4 & 3 & 3\\
$128\times 512$ & 10 & 6 & 4 & 4 & 10 & 6 & 5 & 4\\
$256\times 1024$ & 11 & 6 & 5 & 4 & 11 & 6 & 5 & 4\\
$512\times 2048$ & 11 & 6 & 5 & 4 & 11 & 6 & 5 & 4\\
$1024\times 4096$ & 10 & 6 & 4 & 4 & 10 & 6 & 4 & 4\\
$1536\times 6144$ & 9 & 6 & 4 & 4 & 9 & 5 & 4 & 3\\
\hline
\end{tabular}
\caption{Multigrid $V$-cycle results for the {\tt DDDD} and {\tt NNDD}
model problems with $k_x=k_y=4$, $L_x=100$, $L_y=800$, $\tau=1$ and
$a(x,y)$ given by Eq.~\ref{eq:density}. Shown are the numbers of
multigrid $V$-cycles required to reduce the \emph{relative}
residual norm to less than $10^{-8}$ for different
grid sizes and numbers of pre and post relaxation
sweeps. Gauss-Seidel relaxation is used. The coarsest grid size of the
$1536\times 6144$ case is $3\times 12$ while all the others have a coarsest
grid of size $2\times 8$.}
\label{tab:iternum}
\end{table}
\begin{figure}[htb!]
\centering
\includegraphics[width=\textwidth]{\RepFigures/mg_iterations}
\caption{Performance of the $V(2,2)$-cycle using the Gauss-Seidel
relaxation scheme for the {\tt DDDD} (upper curve) and {\tt NNDD}
(lower curve) problems. The relative tolerance {\tt rtol} is
set to $10^{-8}$ and the coarsest grid size for all problem sizes
is fixed to $2\times 8$.}
\label{fig:mg_iterations}
\end{figure}
Fits of the obtained data show that the multigrid
$V$-cycle cost scales almost \emph{linearly} with the number of
unknowns $N=(N_x+1)(N_y+1)$ (as does the backsolve stage of MUMPS),
while the \emph{total} direct solve time scales as $N^{1.4}$.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/dddd_pc220}
\caption{Times used by the multigrid $V$ cycles for the runs reported
in Table~\ref{tab:iternum} for the \texttt{\textbf{DDDD}}
problem. The last 6 $V(3,3)$ data points are used for the multigrid fit. The
MUMPS direct solver's cost is included for comparison. All the
timing results are obtained on an Intel Nehalem i7 processor, using the
Intel compiler version 13.0.1 and MUMPS-4.10.0. }
\label{fig:dddd_pc220}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/nndd_pc220}
\caption{As in Fig.~\ref{fig:dddd_pc220} for the
\texttt{\textbf{NNDD}} problem.}
\label{fig:nndd_pc220}
\end{figure}
\FloatBarrier
\subsection{Effects of the mesh aspect ratio $\alpha$}
From Table~\ref{tab:anisotropy}, one can observe that the required number of
$V(2,2)$ cycles increases quickly when $\alpha<0.5$ and $\alpha>2$. Advanced
\emph{relaxation} methods and \emph{coarsening} strategies
\cite[chap.~7]{Wesseling} can remedy this performance degradation
but are generally more difficult to parallelize.
\begin{table}[htb]
\centering
\begin{tabular}{|l||c|c|}\hline
$\alpha$ & \texttt{\textbf{DDDD}} & \texttt{\textbf{NNDD}} \\ \hline
0.125 & 19 & 22 \\
0.25 & 12 & 12 \\
0.5 & 6 & 6 \\
1.0 & 5 & 5 \\
2.0 & 7 & 7 \\
4.0 & 20 & 19 \\
\hline
\end{tabular}
\caption{Effects of the \emph{mesh aspect ratio} $\alpha=h_x/h_y$ on the
number of $V(2,2)$ cycles required to reach $\mbox{\tt
rtol}=10^{-8}$ for {\tt DDDD} and {\tt NNDD}
model problems. The listed $\alpha$'s are obtained by fixing
$N_x=256$, $N_y=1024$, $L_x=100$ and varying $L_y$.}
\label{tab:anisotropy}
\end{table}
\subsection{Effects of the mixed partial derivative}
When $|\tau|>2$, Table~\ref{tab:mixedterm} shows that the multigrid
$V$-cycle diverges, as predicted from the local mode analysis based on
the amplification factor given in Eq.~\ref{eq:amp_gs}. Although a
positive coefficient $a$ has a \emph{stabilizing} effect, the
latter already disappears for the $256\times 1024$ grid.
\begin{table}[htb]
\centering
\begin{tabular}{|l|l|c|c|c|c|c|c|c|}\hline
&Grid size & $\tau=-3$ & $\tau=-2$ & $\tau=-1$ & $\tau=0$ & $\tau=1$
& $\tau=2$ & $\tau=3$ \\ \hline
\multirow{4}{*}{\texttt{\textbf{DDDD}}}
&$128\times 512(a=0)$ & - & 39 & 7 & 5 & 7 & 38 & - \\
& $128\times 512$ &16 & 6 & 5 & 4 & 4 & 6 & 17 \\
& $256\times 1024$ &- & 8 & 5 & 4 & 5 & 7 & - \\
& $512\times 2048$ &- & 9 & 5 & 4 & 5 & 9 & - \\
\hline\hline
\multirow{4}{*}{\texttt{\textbf{NNDD}}}
&$128\times 512(a=0)$ & - & 42 & 7 & 5 & 7 & 41 & - \\
& $128\times 512$ &13 & 6 & 5 & 4 & 5 & 5 & 13 \\
& $256\times 1024$ &- & 7 & 5 & 4 & 5 & 7 & - \\
& $512\times 2048$ &- & 7 & 5 & 4 & 5 & 7 & - \\
\hline
\end{tabular}
\caption{Effects of the mixed derivative term $\tau$ on the
performance of the $V(3,3)$ cycle. The dashes
indicate that the $V$-cycle diverges. In these runs, $a(x,y)$ is given by
Eq.~\ref{eq:density}, except for the cases where it is set to
0. Notice the \emph{stabilizing} effect of $a\ne 0$ for the
$128\times 512$ grid at $\tau = \pm 3$.}
\label{tab:mixedterm}
\end{table}
\subsection{Using the damped Jacobi relaxation}
The optimum Jacobi damping factor $\omega$ can be determined by minimizing
the \emph{smoothing factor}, defined as the maximum of the amplification
factor (\ref{eq:amp_jac}) restricted to the \emph{oscillatory modes}:
\begin{equation}
\label{eq:mu_jac}
\mu(\omega,\alpha,\tau) = \max_{(\theta_1,\theta_2)\in\Omega}
|G(\theta_1,\theta_2,\omega,\alpha,\tau)|, \qquad \Omega =
[|\theta_1|>\pi/2]\,\bigcup\,[|\theta_2|>\pi/2].
\end{equation}
Results from the numerical computation of (\ref{eq:mu_jac}) are shown
in Fig.~\ref{fig:jac_opt}. An analytic expression for $\tau=0$,
assuming $\alpha\le 1$, is derived in \cite[p.~119]{salpha}:
\begin{gather}
\mu(\omega,\alpha,\tau=0) =
\max\left(\left|1-2\omega\right|,\;\left|1-\frac{\alpha^2}{1+\alpha^2}\omega\right|\right),
\nonumber\\
\mu_{\mbox{opt}} = \frac{2+\alpha^2}{2+3\alpha^2} \quad
\mbox{at} \quad \omega_{\mbox{opt}} = \frac{2+2\alpha^2}{2+3\alpha^2}.
\end{gather}
Notice that the smoothing factor increases as $\alpha$ departs from 1
and for increasing $\tau$.
For Gauss-Seidel relaxation, the same numerical procedure applied to
(\ref{eq:amp_gs}) yields smoothing factors $\mu$ equal to
$0.5$, $0.68$ and $0.70$, respectively, for the three cases shown in
Fig.~\ref{fig:jac_opt}, which results in better smoothing properties
than the damped Jacobi relaxation.
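As an illustration, the smoothing factor (\ref{eq:mu_jac}) can be evaluated
numerically with a small standalone Fortran program of the following kind (a
minimal sketch; the $(\theta_1,\theta_2)$ sampling and the parameter values
are arbitrary):
\begin{lstlisting}[language=Fortran,frame=single]
program jacobi_smoothing_factor
  implicit none
  integer, parameter :: n = 256     ! number of sampling points per direction
  double precision :: pi, omega, alpha, tau, t1, t2, g, mu
  integer :: i1, i2
  pi = acos(-1.d0)
  omega = 0.8d0;  alpha = 1.d0;  tau = 1.d0
  mu = 0.d0
  do i2 = -n, n
     do i1 = -n, n
        t1 = i1*pi/n
        t2 = i2*pi/n
        if (abs(t1) <= 0.5d0*pi .and. abs(t2) <= 0.5d0*pi) cycle  ! skip smooth modes
        g = 1.d0 - 2.d0*omega/(1.d0+alpha**2)*( sin(0.5d0*t1)**2  &
             + alpha**2*sin(0.5d0*t2)**2 + 0.25d0*tau*alpha*sin(t1)*sin(t2) )
        mu = max(mu, abs(g))        ! maximum over the oscillatory modes
     end do
  end do
  print '(a,f6.3)', 'smoothing factor mu = ', mu
end program jacobi_smoothing_factor
\end{lstlisting}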
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/jac_opt}
\caption{The smoothing factor for damped Jacobi relaxation
for different values of $\alpha$ and $\tau$.}
\label{fig:jac_opt}
\end{figure}
Numerical experiments with the reference case ($\alpha=0.5$, $\tau=1$,
$a(x,y)$ given by Eq.~\ref{eq:density}) and the $128\times 512$ grid,
using damped Jacobi relaxation, are shown in Table~\ref{tab:jac_opt};
they confirm that $\omega=0.9$ is the optimum damping factor and that
damped Jacobi is less efficient than the Gauss-Seidel relaxation, in agreement
with the Fourier analysis.
\begin{table}[htb]
\centering
\begin{tabular}{l c c c c c c}\hline
&$\omega=0.5$ &$\omega=0.6$ &$\omega=0.7$ &$\omega=0.8$ &$\omega=0.9$
&$\omega=1.0$ \\ \cline{2-7}
\texttt{\textbf{DDDD}}& 12 & 10 & 9 & 8 & 7 & 15 \\
\texttt{\textbf{NNDD}}& 12 & 11 & 9 & 8 & 7 & 18 \\
\hline
\end{tabular}
\caption{The number of $V(3,3)$ cycles required to reach
$\mbox{\texttt{rtol}}=10^{-8}$ versus the Jacobi \emph{damping
factor} $\omega$. The grid size is $128\times 512$ with
$\alpha=0.5$, $\tau=1$ and $a(x,y)$ given by Eq.~\ref{eq:density}.}
\label{tab:jac_opt}
\end{table}
\subsection{Matrix storage}
Initially, the \emph{Compressed Sparse Row} storage format (CSR or CRS) (see
\cite[p.~58--59]{TEMPL}) was used to store the discretized finite
difference matrix. With this choice, the CPU time used by the matrix
construction (and boundary condition setting) is found to be always larger
than the multigrid solver time, as shown in
Fig.~\ref{fig:matcon_time}. Fortunately, switching to the \emph{Compressed
Diagonal Storage} (CDS) format, where the 9-diagonal structure of the
matrix is fully exploited, reduces the matrix construction time
considerably, as shown in the same figure. On the other hand, no
difference in the multigrid solver performance is noticeable
between the two matrix storage formats.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/matcon_time}
\caption{CPU time used by the matrix construction for CSR and CDS
matrix storage compared to the multigrid $V(3,3)$ cycle time for
the \textbf{DDDD} and \textbf{NNDD} model problems. The timing is
obtained using the same conditions as in Fig.~\ref{fig:dddd_pc220}.}
\label{fig:matcon_time}
\end{figure}
\FloatBarrier
\section{Modified PDE}
Here, the following modified PDE is considered:
\begin{equation}
\label{eq:new_pde}
\left[\frac{\partial^2}{\partial x^2} + \tau\frac{\partial^2}{\partial
x\partial y} + (1+\tau^2/4)\frac{\partial^2}{\partial y^2} - a(x,y)\right] u(x,y) =
f(x,y), \qquad 0\le x \le L_x, \; 0\le y \le L_y.
\end{equation}
This PDE, which is obtained from the $\hat s-\alpha$ model
\cite[Eq.~10]{salpha}, is
\emph{elliptic} for any value of $\tau$. The resulting stencil is
changed from the previous stencil (\ref{eq:stencil}) to
\begin{equation}
\label{eq:new_stencil}
S_{ij} = \frac{1}{h_x^2}
\begin{bmatrix}
-\tau\alpha/4 & \alpha^2(1+\tau^2/4) & \tau\alpha/4 \\
1 & -2\left[1+\alpha^2(1+\tau^2/4)\right]-h_x^2a_{ij} & 1 \\
\tau\alpha/4 & \alpha^2(1+\tau^2/4) &-\tau\alpha/4 \\
\end{bmatrix}
.
\end{equation}
Note that the
\emph{anisotropy} of the resulting Finite Difference discretization
is now $\alpha^2(1+\tau^2/4)$ and can be controlled by adjusting both
the mesh aspect ratio $\alpha$ and the \emph{shear} term $\tau$.
Numerical calculations show that the multigrid $V$-cycles always
converge, as shown in Table~\ref{tab:new_mixedterm}.
\begin{table}[htb]
\centering
\begin{tabular}{|l|l|c|c|c|c|c|c|}\hline
&Grid size & $\tau=0$ & $\tau=1$ & $\tau=2$ & $\tau=4$ & $\tau=8$
& $\tau=16$ \\ \hline
\multirow{3}{*}{\texttt{\textbf{DDDD}}}
& $128\times 512$ &4 & 4 & 5 & 6 & 9 & 20 (6) \\
& $256\times 1024$ &4 & 4 & 5 & 6 & 11 & 25 (8)\\
& $512\times 2048$ &4 & 4 & 5 & 7 & 12 & 29 (8)\\
\hline\hline
\multirow{3}{*}{\texttt{\textbf{NNDD}}}
& $128\times 512$ &4 & 4 & 4 & 5 & 8 & 17 (5) \\
& $256\times 1024$ &4 & 5 & 5 & 5 & 8 & 19 (6) \\
& $512\times 2048$ &4 & 4 & 4 & 5 & 7 & 18 (6) \\
\hline
\end{tabular}
\caption{Effects of the mixed derivative term $\tau$ on the
performance of the $V(3,3)$ cycle. In these runs, $a(x,y)$ is
given by Eq.~\ref{eq:density} and the mesh aspect ratio $\alpha=0.5$
is used. In the last column, the numbers in parentheses are the
numbers of $V(3,3)$ cycles obtained when $\alpha$ is reduced to 0.125 by
increasing the length $L_y$ while keeping all the other values fixed.}
\label{tab:new_mixedterm}
\end{table}
\section{Parallel Multigrid}
In order to maximize the parallel efficiency and the flexibility of
utilization, a two-dimensional domain partition scheme is chosen to
parallelize the multigrid solver. As shown below, generalization of this
procedure for higher dimensions is straightforward.
\subsection{Distributed grid coarsening}
The coarsening algorithm can be summarized as follows:
\begin{itemize}
\item Partition the grid points on each dimension at the
\emph{finest} grid level, as evenly as possible.
\item The range for each sub-grid, using \emph{global indexing} is
thus specified by $[s,e]$, with $s=0$ for the first sub-grid and
$e=N$ for the last sub-grid, $N$ being the number of grid intervals.
\item The next coarse sub-grid is thus obtained by discarding all the
\emph{odd} indexed grid points, as in the serial case.
\item This process can continue (as long as the total number of
intervals is even) until a prescribed \emph{minimum}
number of grid points on any sub-grid is reached.
\end{itemize}
\subsection{Matrix-free formulation}
Using a standard \emph{matrix} representation of the discretized 2D (or higher
dimensional) operators implies a \emph{one-dimensional numbering} of
the grid nodes. For example, on a 2D $N_x\times N_y$ grid, the 1D numbering
of the node $(x_{i_1},y_{i_2})$ could be defined as
\[
k=i_1+i_2\times N_x, \quad i_1=0:N_x,\;i_2=0:N_y.
\]
However, using the 2D domain partition defined by
\begin{equation}
\label{eq:2dnumber}
i_1=s_1:e_1,\;i_2=s_2:e_2,
\end{equation}
with $s=(s_1,s_2)$ and $e=(e_1,e_2)$ denoting respectively the
\emph{starting} and \emph{ending} indices of a rectangular sub-domain,
results in a \emph{non-contiguous} set of indices ${k}$ and in a
complicated structure of the partitioned matrix for the linear
operator.
On the other hand, using the \emph{stencil notation}
introduced in \cite[chap. 5.2]{Wesseling} based on the
\emph{multidimensional} node labeling as defined by
(\ref{eq:2dnumber}) for a 2D problem, one can define a simple data
structure for the partitioned operator, $A(i,\delta)$,
where the $d$-tuple $i=(i_1,\ldots,i_d)$ represents a node of the
$d$-dimensional grid and the $d$-tuple
$\delta=(\delta_1,\ldots,\delta_d)$, the
\emph{distance} between the connected nodes. The result of
$\mathbf{A}u$ can thus be defined as
\begin{equation}
\label{eq:vmx}
(\mathbf{A}u)_i = \sum_{\delta\in\mathbb{Z}^d}
A(i,\delta)u_{i+\delta}, \quad i=s:e.
\end{equation}
In (\ref{eq:vmx}), the sum is performed over all
indices $\delta$ such that $A(i,\delta)$ is non-zero. For the 2D
nine-point stencil defined in (\ref{eq:stencil}), the 2-tuple
$\delta$ can be specified as the 9 columns of the following
\emph{structure} matrix
\begin{equation}
\label{5points}
S_\delta = \left(
\begin{array}{rrrrrrrrr}
0 & -1 & 0 & 1 & -1 & 1 & -1& 0 & 1 \\
0 & -1 & -1&-1 & 0 & 0 & 1 & 1 & 1 \\
\end{array} \right).
\end{equation}
In the general case of a $d$-dimensional grid and an $\mathcal{N}$-point stencil,
$S_\delta$ is a $d\times\mathcal{N}$
matrix. By noting that the subscript $i+\delta$ of $u$ on the right hand side of
(\ref{eq:vmx}) should be in the range $[0,N]$ \emph{only} for sub-domains which are
\emph{adjacent} to the boundary, one can deduce that for
a \emph{fixed} $\delta$, the lower and upper bounds of the indices $i$
should be
\begin{equation}
\begin{split}
i_{\mbox{min}} &= \max (0, -\delta, s), \\
i_{\mbox{max}} &= \min (N, N-\delta, e) \\
\end{split}
\end{equation}
where $N=(N_1,N_2,\ldots,N_d)$ specifies the numbers of intervals,
since, for sub-domains \emph{not adjacent} to the boundary, $u$ should
include values at the \emph{ghost} cells $s-g$ and $e+g$, where $g$ is
given by
\begin{equation}
g = \max|S_\delta|
\end{equation}
with the $\max$ operator taken along the \emph{rows} of the matrix $S_\delta$.
The formula defined in (\ref{eq:vmx}) can then be implemented as in
the \emph{pseudo} Fortran code
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{lstlisting}[mathescape]
do k=1,SIZE($S_{\delta}$,2)   ! loop over the stencil points
   $\delta$ = $S_{\delta}$(:,k)
   lb = MAX(0,-$\delta$,$s$)
   ub = MIN($N$,$N-\delta$,$e$)
   do i=lb,ub
      Au(i) = Au(i) + A(i,$\delta$)*u(i+$\delta$)
   enddo
enddo
\end{lstlisting}
\nopagebreak\hrule
\addvspace{\medskipamount}
On the other hand, if the values of $u$ at the ghost cells of the
sub-domains \emph{adjacent} to the boundary are set to 0
\begin{equation*}
u_{-g} = u_{N+g} = 0,
\end{equation*}
the lower and upper bounds of the
inner loop can be simply set to $lb=s$ and $ub=e$. Note that the inner
loop should be interpreted as $d$ nested loops over the $d$-tuple
$i=(i_1,\ldots,i_d)$ for a $d$-dimensional problem.
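For a 2D grid, a possible concrete (serial) instance of this matrix-free
product, with the ghost cells assumed to be already filled and the stencil
stored with the column ordering of $S_\delta$ above, is sketched below; the
storage layout $A(i_1,i_2,k)$, the array names and the argument list are
illustrative only:
\begin{lstlisting}[language=Fortran,frame=single]
subroutine stencil_apply_2d(a, u, au, s, e)
  implicit none
  integer, intent(in) :: s(2), e(2)                       ! local index ranges
  double precision, intent(in)  :: a(s(1):e(1), s(2):e(2), 9)       ! A(i,delta)
  double precision, intent(in)  :: u(s(1)-1:e(1)+1, s(2)-1:e(2)+1)  ! with ghost cells
  double precision, intent(out) :: au(s(1):e(1), s(2):e(2))
  integer, parameter :: sd(2,9) = reshape( &              ! columns of S_delta
       [0,0, -1,-1, 0,-1, 1,-1, -1,0, 1,0, -1,1, 0,1, 1,1], [2,9])
  integer :: i1, i2, k
  au = 0.d0
  do k = 1, 9                                  ! loop over the stencil points
     do i2 = s(2), e(2)
        do i1 = s(1), e(1)
           au(i1,i2) = au(i1,i2) + a(i1,i2,k)*u(i1+sd(1,k), i2+sd(2,k))
        end do
     end do
  end do
end subroutine stencil_apply_2d
\end{lstlisting}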
\subsection{Inter-grid transfers}
\subsubsection{Restriction}
Using the definition in the first equation of (\ref{eq:2dintergrid})
together with (\ref{eq:1drestriction}), the 2D restriction operator
can be represented by the following 9-point stencil:
\begin{equation}
\label{eq:2drestriction}
\mathbf{R}_i = \frac{1}{16}
\begin{pmatrix}
1 & 2 & 1 \\
2 & 4 & 2 \\
1 & 2 & 1 \\
\end{pmatrix},
\end{equation}
and the restriction of $f$ can be computed as
\begin{equation}
\bar{f}_i = (\mathbf{R}f)_i = \sum_{\delta\in\mathbb{Z}^2}
R(i,\delta)f_{2i+\delta}, \quad i=\bar{s}:\bar{e},
\end{equation}
where $\bar{s},\bar{e}$ denote the boundary indices of the partitioned domain on the
\emph{coarse} grid, obtained with the same coarsening algorithm described previously.
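A minimal serial sketch of this full-weighting restriction for the interior
coarse nodes (the boundary stencils of the next section and all parallel
aspects are omitted; array names and bounds are illustrative) could read:
\begin{lstlisting}[language=Fortran,frame=single]
subroutine restrict_fw(fh, f2h, nxc, nyc)
  implicit none
  integer, intent(in) :: nxc, nyc                         ! coarse grid intervals
  double precision, intent(in)  :: fh(0:2*nxc, 0:2*nyc)   ! fine grid RHS
  double precision, intent(out) :: f2h(0:nxc, 0:nyc)      ! coarse grid RHS
  integer :: i, j
  do j = 1, nyc-1
     do i = 1, nxc-1                     ! 9-point full weighting stencil
        f2h(i,j) = ( 4.d0*fh(2*i,2*j)                            &
             + 2.d0*( fh(2*i-1,2*j) + fh(2*i+1,2*j)              &
                    + fh(2*i,2*j-1) + fh(2*i,2*j+1) )            &
             + fh(2*i-1,2*j-1) + fh(2*i+1,2*j-1)                 &
             + fh(2*i-1,2*j+1) + fh(2*i+1,2*j+1) ) / 16.d0
     end do
  end do
end subroutine restrict_fw
\end{lstlisting}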
\subsubsection{BC for the restriction operator}
\label{sec:restrict_bc}
Dirichlet boundary conditions can be imposed by modifying the
\emph{restriction stencil} on each of the four boundaries as follows:
\begin{equation}
\mathbf{R}_{0,.} = \frac{1}{16}\begin{pmatrix}
1 & 2 & 0 \\
2 & 4 & 0 \\
1 & 2 & 0 \\
\end{pmatrix},\quad
\mathbf{R}_{N_x,.} = \frac{1}{16}\begin{pmatrix}
0 & 2 & 1 \\
0 & 4 & 2 \\
0 & 2 & 1 \\
\end{pmatrix},\quad
\mathbf{R}_{.,0} = \frac{1}{16}\begin{pmatrix}
0 & 0 & 0 \\
2 & 4 & 2 \\
1 & 2 & 1 \\
\end{pmatrix},\quad
\mathbf{R}_{.,N_y} = \frac{1}{16}\begin{pmatrix}
1 & 2 & 1 \\
2 & 4 & 2 \\
0 & 0 & 0 \\
\end{pmatrix}.
\end{equation}
With the natural Neumann BC, no change of the restriction operator is needed.
\subsubsection{Prolongation}
Stencil notation for \emph{prolongation} operators is less obvious to
formulate, see \cite[chap. 5.2]{Wesseling}. A more straightforward
implementation is obtained in the 2D case, by simply applying
\emph{bilinear interpolation} on the \emph{coarse grid}:
\begin{equation}
\label{eq:2dprolongation}
\begin{split}
(\mathbf{P}\bar{u})_{2i} &= \bar{u}_{i}, \\
(\mathbf{P}\bar{u})_{2i+e_1} &= (\bar{u}_{i} + \bar{u}_{i+e_1})/2, \quad
(\mathbf{P}\bar{u})_{2i+e_2} = (\bar{u}_{i} + \bar{u}_{i+e_2})/2, \\
(\mathbf{P}\bar{u})_{2i+e_1+e_2} &= (\bar{u}_{i} +
\bar{u}_{i+e_1} + \bar{u}_{i+e_2} + \bar{u}_{i+e_1+e_2})/4, \\
\end{split}
\end{equation}
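Here $e_1$ and $e_2$ are understood as the unit index offsets $(1,0)$ and
$(0,1)$. A minimal serial sketch of this bilinear prolongation (ghost cells and
all parallel aspects are omitted; array names and bounds are illustrative)
could read:
\begin{lstlisting}[language=Fortran,frame=single]
subroutine prolong_bilin(uc, uf, nxc, nyc)
  implicit none
  integer, intent(in) :: nxc, nyc                        ! coarse grid intervals
  double precision, intent(in)  :: uc(0:nxc, 0:nyc)      ! coarse grid solution
  double precision, intent(out) :: uf(0:2*nxc, 0:2*nyc)  ! interpolated fine grid values
  integer :: i, j
  do j = 0, nyc
     do i = 0, nxc
        uf(2*i,2*j) = uc(i,j)                                     ! coincident nodes
        if (i < nxc) uf(2*i+1,2*j) = 0.5d0*(uc(i,j) + uc(i+1,j))  ! x-edge midpoints
        if (j < nyc) uf(2*i,2*j+1) = 0.5d0*(uc(i,j) + uc(i,j+1))  ! y-edge midpoints
        if (i < nxc .and. j < nyc)                                &
             uf(2*i+1,2*j+1) = 0.25d0*( uc(i,j) + uc(i+1,j)       &
                                      + uc(i,j+1) + uc(i+1,j+1) ) ! cell centers
     end do
  end do
end subroutine prolong_bilin
\end{lstlisting}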
\subsection{Relaxations}
While the Gauss-Seidel method proves to be more efficient, the damped Jacobi
method is used, at least for this first version of the parallel multigrid
solver, because it is straightforward to \emph{parallelize}. The same
Jacobi method, undamped ($\omega=1$) and with a \emph{few}
iterations, is also used to solve the linear system at the coarsest
mesh, as prescribed by the multigrid $V$-cycle procedure defined in
section \ref{sec-mgProc}.
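For illustration, one damped Jacobi sweep on the 9-point stencil, reusing the
$A(i,\delta)$ layout sketched earlier (the convention that $k=1$ holds the
diagonal entry $\delta=(0,0)$ follows the first column of $S_\delta$; names and
the argument list are again illustrative), could read:
\begin{lstlisting}[language=Fortran,frame=single]
subroutine jacobi_sweep(a, f, u, unew, s, e, omega)
  implicit none
  integer, intent(in) :: s(2), e(2)
  double precision, intent(in)    :: omega                ! damping factor
  double precision, intent(in)    :: a(s(1):e(1), s(2):e(2), 9)
  double precision, intent(in)    :: f(s(1):e(1), s(2):e(2))
  double precision, intent(in)    :: u(s(1)-1:e(1)+1, s(2)-1:e(2)+1)
  double precision, intent(inout) :: unew(s(1)-1:e(1)+1, s(2)-1:e(2)+1)
  integer, parameter :: sd(2,9) = reshape( &              ! columns of S_delta
       [0,0, -1,-1, 0,-1, 1,-1, -1,0, 1,0, -1,1, 0,1, 1,1], [2,9])
  integer :: i1, i2, k
  double precision :: off
  do i2 = s(2), e(2)
     do i1 = s(1), e(1)
        off = 0.d0
        do k = 2, 9                          ! off-diagonal stencil entries
           off = off + a(i1,i2,k)*u(i1+sd(1,k), i2+sd(2,k))
        end do
        unew(i1,i2) = (1.d0-omega)*u(i1,i2) + omega*(f(i1,i2)-off)/a(i1,i2,1)
     end do
  end do
end subroutine jacobi_sweep
\end{lstlisting}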
\subsection{Local vectors and stencils}
All local vectors (used to represent solutions or
right-hand-sides) contain \emph{ghost cells} and are implemented
using 2D arrays, for example
\[\mbox{\texttt{sol(s(1)-1:e(1)+1,s(2)-1:e(2)+1)}}\]
for the solution vector.
The partitioned stencils are defined
only for the \emph{local} grid points, without the ghost cells.
Thus, before each operation on the local vectors, an exchange (or
update) of the values on the ghost cells is performed.
As a result, all the memory required by the solver is completely
partitioned, except for the space used by the ghost cells.
\subsection{Numerical Experiments}
In this section, all the numerical experiments are conducted on
\texttt{helios.iferc-csc.org}, using the Intel compiler version 13.1.3
and bullxpmi-1.2.4.3. The \emph{stopping criterion} for the $V$-cycles
is based on the absolute and relative residual norms as well as on the
discretization error norm, as defined in section \ref{sec:NumExp1}. In
cases where the analytic solution is not known, the latter can be
replaced by some norm of the solution.
\subsubsection{Strong scaling}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/strong_256x1024_DDD}
\caption{DDDD problem of size $256\times 1024$, using multigrid
$V(3,3)$ cycles. Different times for a given number of
processes are obtained with different combinations of processes in
each dimension. The number of grid levels is fixed to 6. Five Jacobi
iterations are used at the coarsest grid.}
\label{fig:strong_scal_small}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/strong_512x2048_DDD}
\caption{DDDD problem of size $512\times 2048$, using multigrid
$V(3,3)$ cycles. The \textcolor{red}{red marker} on the left shows
the time for the serial multigrid solver. Different times for a
given number of processes are obtained with different combinations
of processes in each dimension. The number of grid levels is fixed
to 6. Five Jacobi iterations are used at the coarsest grid.}
\label{fig:strong_scal}
\end{figure}
Here 2 \emph{fixed} problem sizes are considered:
\begin{itemize}
\item A small size with the (fine) grid of $256\times 1024$
shown in Fig.~\ref{fig:strong_scal_small} and
\item a larger size of $512\times 2048$ in Fig.~\ref{fig:strong_scal}.
\end{itemize}
In both cases $\mbox{\tt rtol}=10^{-8}$ and $\mbox{\tt
etol}=10^{-3}$. It was checked that the results do not change when more
than 5 Jacobi iterations are used at the coarsest mesh. Notice that
for the small problem, the parallel efficiency starts to degrade at 32
MPI processes while for the larger case, this happens after 64 MPI
processes. This can be explained by
the ghost cell exchange communication overhead: denoting by $N_1$ and
$N_2$ the numbers of grid points in each direction and by $P_1$ and $P_2$
the numbers of MPI processes in each direction, the ratio $S/V$ between the
number of ghost points and the number of interior grid points for each local
subdomain can be estimated as
\begin{equation}
\label{eq:comm_overhead}
S/V\simeq\frac{2(N_1/P_1+N_2/P_2)}{N_1N_2/P_1P_2} = 2\left(P_1/N_1+P_2/N_2\right).
\end{equation}
This ratio increases as the number of MPI processes increases while
keeping the problem size fixed. On very coarse grids, this communication
cost can become prohibitive. For this reason, in all the runs shown
here, the number of grid points in each direction for the coarsest
grid is limited to 2.
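For instance, for the $512\times 2048$ grid at the finest level,
Eq.~\ref{eq:comm_overhead} gives $S/V\simeq 2\,(8/512+8/2048)\approx 0.04$ on an
$8\times 8=64$ process grid and $\approx 0.08$ on a $16\times 16=256$ process
grid; on the coarser levels the ratio is correspondingly larger.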
\subsubsection{Weak Scaling}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/weak_DDDD}
\caption{Weak scaling for a DDDD problem, using multigrid $V(3,3)$
cycles. The number of grid levels is fixed to 7. The solver for the
coarsest grid uses 5 Jacobi iterations, except for the 2 largest cases
which require respectively 20 and 100 iterations to converge. The 2
sets of curves on the right figure show respectively the timings with
and without the calculation of the residual norm and of the discretization
error, which both require a \emph{global reduction}.}
\label{fig:weak_scal_DDDD}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/weak_NNDD}
\caption{Weak scaling for a NNDD problem, using multigrid $V(3,3)$
cycles. The number of grid levels is fixed to 7. The solver for the
coarsest grid uses 5 Jacobi iterations, except for the 2 largest cases
which require respectively 20 and 100 iterations to converge. The 2
sets of curves on the right figure show respectively the timings with
and without the calculation of the residual norm and of the discretization
error, which both require a \emph{global reduction}.}
\label{fig:weak_scal_NNDD}
\end{figure}
According to Eq.~\ref{eq:comm_overhead}, varying the problem size
together with the number of MPI processes while keeping $N_1/P_1$ and
$N_2/P_2$ constant should yield a \emph{constant} execution time,
provided that the convergence rate does not depend on the problem size.
The results for the \texttt{DDDD} and \texttt{NNDD} problems are shown
in Fig.~\ref{fig:weak_scal_DDDD} and
Fig.~\ref{fig:weak_scal_NNDD}. The left part of the figures shows that
the convergence rate depends only weakly on the problem size, which
indeed leads to an (almost) constant time for numbers of MPI
processes $P$ between 16 and 1024. The reason for the good timings
for smaller $P$ is simply that there are only 2 ghost cell exchanges
for $P=2\times 2$ (instead of 4 for $P\ge 16$) and that there is no
exchange at all for $P=1$.
\section{Non-homogeneous Boundary Conditions}
\subsection{Non-homogeneous Dirichlet Conditions}
Non-homogeneous Dirichlet boundary conditions can be imposed on all the
Dirichlet faces simply by \emph{clearing}, as in the
\emph{homogeneous case}, the matrix rows and columns and setting the
diagonal terms to 1. Moreover, the corresponding
\emph{right-hand-side} should be set to:
\begin{equation}
\begin{split}
f_{0,j} &= D^W(y_j), \quad f_{N_x,j}=D^E(y_j), \qquad j=0,\ldots,N_y, \\
f_{i,0} &= D^S(x_i), \quad f_{i,N_y}=D^N(x_i), \qquad i=0,\ldots,N_x, \\
\end{split}
\end{equation}
where $D^W, D^E, D^S, D^N$ are the values of $u$ at the 4 Dirichlet
faces. As for the homogeneous Dirichlet BC, the \emph{restriction}
operator should be changed as described in section
\ref{sec:restrict_bc} while the \emph{prolongation} defined
in (\ref{eq:2dprolongation}) remains unchanged.
\subsection{Non-homogeneous Neumann Conditions}
The non-homogeneous Neumann conditions on the 4 faces can be
defined as
\begin{equation}
\begin{split}
\left.\frac{\partial u}{\partial x}\right|_{x=0} &= N^W(y), \quad
\left.\frac{\partial u}{\partial x}\right|_{x=L_x} = N^E(y), \\
\left.\frac{\partial u}{\partial y}\right|_{y=0} &= N^S(x), \quad
\left.\frac{\partial u}{\partial y}\right|_{y=L_y} = N^N(x). \\
\end{split}
\end{equation}
Discretization of the BC defined above using \emph{central
differences} yields, on the 4 faces,
\begin{equation}
\begin{split}
u_{-1,j} &= u_{1,j} -2h_xN^W(y_j), \quad
u_{N_x+1,j} = u_{N_x-1,j} + 2h_xN^E(y_j), \qquad j=0,\ldots,N_y, \\
u_{i,-1} &= u_{i,1} -2h_yN^S(x_i), \quad
u_{i,N_y+1} = u_{i,N_y-1} + 2h_yN^N(x_i), \qquad i=0,\ldots,N_x. \\
\end{split}
\end{equation}
With these relations, the stencil (\ref{eq:new_stencil}) on the 4
boundaries is modified as follows
\begin{equation}
\begin{split}
S^W &= \frac{1}{h_x^2}
\begin{bmatrix}
0 &\alpha^2(1+\tau^2/4) & 0 \\
0 &-2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{0,j} & 2 \\
0 &\alpha^2(1+\tau^2/4) & 0 \\
\end{bmatrix}
,\quad
S^E = \frac{1}{h_x^2}
\begin{bmatrix}
0 &\alpha^2(1+\tau^2/4) & 0 \\
2 &-2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{N_x,j} & 0 \\
0 &\alpha^2(1+\tau^2/4) & 0 \\
\end{bmatrix}
, \\
S^S &= \frac{1}{h_x^2}
\begin{bmatrix}
0 & 2\alpha^2(1+\tau^2/4) & 0 \\
1 &-2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{i,0} & 1 \\
0 & 0 & 0 \\
\end{bmatrix}
,\quad
S^N = \frac{1}{h_x^2}
\begin{bmatrix}
0 & 0 & 0 \\
1 &-2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{i,N_y} & 1 \\
0 & 2\alpha^2(1+\tau^2/4) & 0 \\
\end{bmatrix}
, \\
\end{split}
\end{equation}
while the right-hand-side should be changed according to
\begin{equation}
\begin{split}
f_{0,j} &\longleftarrow f_{0,j} + \frac{2}{h_x}\left[\frac{\tau\alpha}{4}\,N^W(y_{j-1})
+ N^W(y_j) - \frac{\tau\alpha}{4}\,N^W(y_{j+1})\right], \\
f_{N_x,j} &\longleftarrow f_{N_x,j} + \frac{2}{h_x}\left[\frac{\tau\alpha}{4}\,N^E(y_{j-1})
- N^E(y_j) - \frac{\tau\alpha}{4}\,N^E(y_{j+1})\right], \\
f_{i,0} &\longleftarrow f_{i,0} + \frac{2}{h_y}\left[\frac{\tau}{4\alpha}\,N^S(x_{i-1})
+ (1+\tau^2/4)N^S(x_i) - \frac{\tau}{4\alpha}\,N^S(x_{i+1})\right], \\
f_{i,N_y} &\longleftarrow f_{i,N_y} + \frac{2}{h_y}\left[\frac{\tau}{4\alpha}\,N^N(x_{i-1})
- (1+\tau^2/4)N^N(x_i) - \frac{\tau}{4\alpha}\,N^N(x_{i+1})\right]. \\
\end{split}
\end{equation}
\subsection{The NNDD test problem}
In order to test the discretization of the non-homogeneous boundary
conditions as formulated above, a test problem with the prescribed
\emph{exact} solution
\begin{equation}
u(x,y) = 1 + \sin\frac{2\pi k_xx}{L_x}\sin\frac{2\pi k_yy}{L_y},
\qquad \mbox{where $k_x$, $k_y$ are positive integers}
\end{equation}
and the following non-homogeneous boundary conditions
\begin{equation}
\begin{split}
\left.\frac{\partial u}{\partial x}\right|_{x=0} =
\left.\frac{\partial u}{\partial x}\right|_{x=L_x} &=
\frac{2\pi k_x}{L_x}\sin\frac{2\pi k_yy}{L_y}, \\
u(x,0) = u(x,L_y) &= 1, \\
\end{split}
\end{equation}
is solved with varying grid spacing. The discretization errors versus
the number of grid intervals $N_x$, displayed in Fig.~\ref{fig:conv_nh_bc}, show
a \emph{quadratic} convergence, as expected from the second-order
finite differences used in both the PDE and the Neumann boundary condition
discretization.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/conv_nh_bc}
\caption{Convergence of the error
$\| u_{calc}- u_{anal}\|_\infty$ with respect to the number of intervals in
the $x$ direction $N_x$ for the non-homogeneous NNDD
problem. Here, $L_x=100$, $L_y=800$, $k_x=k_y=4$, $\tau=1$ and $N_y=4N_x$.}
\label{fig:conv_nh_bc}
\end{figure}
As shown in Fig.~\ref{fig:nndd_nh}, the multigrid $V$-cycles for the
\emph{non-homogeneous} problem converge with a slightly lower efficiency
than for the \emph{homogeneous} problem shown in Fig.~\ref{fig:weak_scal_NNDD}.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/nndd_nh}
\caption{Performance of the $V(3,3)$-cycle for the non-homogeneous
NNDD problem. The same parameters as in Fig.~\ref{fig:conv_nh_bc}
are used here.}
\label{fig:nndd_nh}
\end{figure}
\FloatBarrier
\subsection{Local relaxation methods}
In addition to the damped Jacobi, three relaxation methods are
added to this parallel multigrid solver:
\begin{enumerate}
\item The 4-color Gauss-Seidel method (RBGS).
\item The Gauss-Seidel method (GS).
\item The successive over-relaxation method (SOR).
\end{enumerate}
In order to apply the parallel 4-color Gauss-Seidel correctly, a
complicated ghost cell exchange has to be performed for each sweep
of each color. Here, we simply apply the method \emph{locally} on each
subdomain, with only one ghost cell exchange performed at the beginning of
each relaxation.
The same procedure is also used for the other 2 methods, which are
inherently \emph{serial}. These 3 relaxations are
thus exact only if there is a single subdomain. As a
consequence, while the damped Jacobi does not depend on the
partition into subdomains, results from these 3 methods do depend on
how the domain is partitioned.
Table~\ref{tab:advrelax} shows,
however, that all 3 \emph{approximate} relaxation methods produce
a much \emph{faster} convergence rate than the damped Jacobi relaxation for the NNDD test
problem considered here. The performance of the implemented solver
using the 4 relaxation methods on HELIOS is compared in
Fig.~\ref{fig:weakhelios}. The poor performance of the 4-color
Gauss-Seidel relaxation (RBGS) can be explained by the 4 nested loops
required to sweep each of the 4 colors.
\begin{table}[hbt]
\centering
\begin{tabular}{|l||r|r|r|r|r|r|}\hline
Grid Sizes & $256\times 1024$ & $512\times 2048$ & $1024\times 4096$ & $2048\times 8192$ & $4096\times 16384$ & $8192\times 32768$ \\
\hline
Process topology &
\multicolumn{1}{c|} {$1\times 1$} &
\multicolumn{1}{c|} {$2\times 2$} &
\multicolumn{1}{c|} {$4\times 4$} &
\multicolumn{1}{c|} {$8\times 8$} &
\multicolumn{1}{c|} {$16\times 16$} &
\multicolumn{1}{c|} {$32\times 32$} \\
\hline
Jacobi $\omega=0.9$ & 0.22 & 0.24 & 0.24 & 0.24 & 0.24 & 0.25 \\
RBGS & 0.05 & 0.07 & 0.10 & 0.10 & 0.12 & 0.12 \\
GS & 0.07 & 0.08 & 0.10 & 0.11 & 0.11 & 0.12 \\
SOR $\omega=1.2$ & 0.04 & 0.05 & 0.07 & 0.07 & 0.07 & 0.08 \\
\hline
\end{tabular}
\caption{Reduction factor for the residuals (obtained as the \emph{geometric mean} of
all its values except the first 2) for the non-homogeneous
NNDD test problem. The same parameters as in Fig.~\ref{fig:conv_nh_bc} are used here.}
\label{tab:advrelax}
\end{table}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.9\hsize]{\RepFigures/weak_helios}
\caption{Performance of the 4 relaxations on the non-homogeneous
NNDD problem. The same parameters as in Fig.~\ref{fig:conv_nh_bc}
are used here. The grid sizes used in this \emph{weak scaling}
run are shown in Table~\ref{tab:advrelax}.}
\label{fig:weakhelios}
\end{figure}
\FloatBarrier
\section{Performance of the Stencil Kernel on Different Platforms}
To get a feeling for the performance obtained on the different
platforms and how well the compilers (with their
auto-vectorization capabilities) support these platforms,
the following Fortran \emph{9-point stencil} kernel has been used. The
OpenMP directives are used for parallelization on both the Xeon and the Xeon
Phi, while offload to the GPU card is done via the high-level OpenACC
directives. \emph{First touch} is applied in the initialization of
\texttt{x} and \texttt{mat}.
\begin{lstlisting}[language=Fortran,numbers=left,commentstyle=\color{blue},keywordstyle=\color{red},frame=single]
!$omp parallel do private(ix,iy)
!$acc parallel loop present(mat,x,y) private(ix,iy)
DO iy=0,ny
DO ix=0,nx
y(ix,iy) = mat(ix,iy,1)*x(ix-1,iy-1) &
& + mat(ix,iy,2)*x(ix, iy-1) &
& + mat(ix,iy,3)*x(ix+1,iy-1) &
& + mat(ix,iy,4)*x(ix-1,iy) &
& + mat(ix,iy,0)*x(ix,iy) &
& + mat(ix,iy,5)*x(ix+1,iy) &
& + mat(ix,iy,6)*x(ix-1,iy+1) &
& + mat(ix,iy,7)*x(ix, iy+1) &
& + mat(ix,iy,8)*x(ix+1,iy+1)
END DO
END DO
!$acc end parallel loop
!$omp end parallel do
\end{lstlisting}
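For reference, the \emph{first touch} initialization mentioned above can be
sketched as the following fragment (illustrative only): the arrays are written
for the first time inside an OpenMP parallel loop with the same loop ordering
as the kernel, so that the memory pages end up on the socket of the threads
that later use them.
\begin{lstlisting}[language=Fortran,frame=single]
!$omp parallel do private(ix,iy)
DO iy=-1,ny+1              ! include the halo points accessed by the kernel
   DO ix=-1,nx+1
      x(ix,iy) = 0.0d0
   END DO
END DO
!$omp end parallel do
!$omp parallel do private(ix,iy)
DO iy=0,ny
   DO ix=0,nx
      mat(ix,iy,:) = 0.0d0
   END DO
END DO
!$omp end parallel do
\end{lstlisting}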
The performances on a Helios dual processor node and its attached Xeon
Phi co-processor are shown in Fig.~\ref{fig:cpu_mic} while the
performances on a Cray XC30 CPU and its attached NVIDIA graphics card are
shown in Fig.~\ref{fig:cpu_gpu}. In these figures, Intel optimization
flag \texttt{-O3} and default Cray optimization were applied. In
Fig.~\ref{fig:cpu_mic_O1_O3}, the speedup by vectorization is shown by
comparing performances obtained with \texttt{-O3} and \texttt{-O1}.
Several observations can be drawn from these results.
\begin{itemize}
\item The parallel scaling using OpenMP is linear for both the Intel and
Cray compilers, when the problem sizes fit into the 20MB cache of
the Sandybridge processor. For grid sizes smaller than $32\times8$, the
overhead of thread creation dominates. When the memory footprint is
larger than the cache, 4 threads per socket already saturate the memory
bandwidth.
\item On the MIC, the parallel speedup scales linearly up to 60
cores with 1 thread per core. Using 2 or 3 threads per core does not help,
while with 4 threads the performance even degrades.
\item The MIC, using the Intel \emph{mic native} mode, does not
perform better than 8 cores of the Sandybridge processor.
\item Since the benefit from \emph{vectorization} is quite large for
the MIC (see Fig.~\ref{fig:cpu_mic_O1_O3}), the poor parallel
scalability may be explained by the low flop intensity per thread
coupled with the high overhead of the (many) thread creation and
thread synchronization.
\item The NVIDIA card, using the high-level OpenACC programming
style, is more than 3 times faster than 8 Sandybridge cores for grid
sizes larger than $1024\times256$. For smaller sizes, there are not
enough flops to keep the GPU threads busy.
\end{itemize}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/cpu_mic}
\caption{Performance on the Helios dual processor (left) using the
\texttt{-O3} compiler option and on the MIC (right),
using the native mode \texttt{-mmic}.}
\label{fig:cpu_mic}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/cpu_gpu}
\caption{Performance on a Cray XC30 single 8 core processor node
(left) and the NVIDIA card (right) using OpenACC. Default Cray
Fortran compiler optimization has been used on both runs.}
\label{fig:cpu_gpu}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/cpu_mic_O1_O3}
\caption{Performance comparison between using \texttt{-O3} and
\texttt{-O1}, on the Helios dual processor (left) and on the MIC (right).}
\label{fig:cpu_mic_O1_O3}
\end{figure}
\FloatBarrier
\section{Hybrid MPI+OpenMP \texttt{PARMG} (r599)}
In this version, a \emph{straightforward} parallelization is
done in the subroutines \texttt{jacobi, residue, prolong, restrict} and
\texttt{norm\_vec}, using the OpenMP work sharing directives. The
ghost cell exchange is executed by the \emph{master} thread.
All the 2D arrays (solutions, RHS, etc.) are allocated and initialized
\emph{once} by the \emph{master} thread. Dynamic array allocations
during the multigrid $V$-cycles are thus avoided.
To help further optimization, timings are introduced for each of the 4
multigrid components \texttt{jacobi, residue, prolong, restrict} and
the ghost cell \texttt{exchange} as well as on the \emph{recursive}
subroutine \texttt{mg}. Since the timings of the 4 MG components
already include the calls to \texttt{exchange}, the time obtained for
\texttt{mg} should be equal to the sum of the 4 MG components plus
an \emph{extras} time which includes the operations performed in \texttt{mg} but
not in the 4 components:
\begin{equation}
\label{eq:timings}
t_\text{mg} = t_\text{jacobi} + t_\text{residue} + t_\text{prolong}
+ t_\text{restrict} + t_\text{extras}.
\end{equation}
We will see in the following sections that, in addition to these 5
contributions to $t_\text{mg}$,
there is an \emph{overhead}, probably due to the \emph{recursive}
calls of \texttt{mg}.
\subsection{Parallel efficiency on single node}
The comparison in Fig.~\ref{fig:single_node} shows that the pure
OpenMP version is at most $30\%$ slower than the pure MPI version when all the
16 cores are used, but less than $10\%$ slower when only one socket is
used. The degradation of the OpenMP version can be explained by
\emph{numa} effects when 2 sockets are used. It is also observed
that the performance levels off at 4 cores, due to the
saturation of the socket memory bandwidth.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/single_node}
\caption{Parallel performance of the 7 level $V(3,3)$-cycle on a
dual socket Helios node ($2\times 8$ cores) for pure OpenMP and pure
MPI. The non-homogeneous NNDD problem with the same parameters as in
Fig.~\ref{fig:conv_nh_bc} is considered here. The OpenMP threads and MPI
tasks are placed first on the first socket before filling the
second socket, using the \texttt{srun} option
``\texttt{--cpu\_bind=cores -m block:block}'' and the environment
variable \texttt{OMP\_PROC\_BIND=true}.}
\label{fig:single_node}
\end{figure}
\subsection{Hybrid efficiency on multi-nodes}
In the following multi-node experiments, all the 16 cores of each
Helios node are
utilized. The number of OpenMP threads \emph{per} MPI \emph{process} NT, the
number of MPI processes \emph{per node} NP, the number of nodes NNODES
and the \emph{total} number of MPI processes $\text{NP}_{tot}$ thus satisfy
the following relations:
\begin{equation}
\begin{gathered}
1\le\text{NT} \le 16, \qquad 1\le\text{NP} \le 16 \\
\text{NT}\times\text{NP} = 16 \\
\text{NP}_{tot} = 16\times\text{NNODES}/\text{NT} \\
\end{gathered}
\end{equation}
The times of the different MG components and the relative
contributions for the \emph{strong scaling}
experiments using a $1024\times 4096$ grid size, are shown in
Fig.~\ref{fig:hybrid_strong} and Fig.~\ref{fig:hybrid_strong_contrib}
respectively. The following observations can be made:
\begin{enumerate}
\item The \texttt{exchange} time increases strongly with increasing
NNODES, due to smaller partitioned subdomains and thus their larger
surface/volume ratio.
\item Compared to the pure MPI case (NT=1), the \texttt{exchange} time is
on the other hand reduced when
$\text{NT}>1$, since the local partitioned grid becomes larger.
\item The less efficient OpenMP parallelization (numa effects,
Amdahl's law) however tends to limit this advantage.
\item As a result, there is an optimal NT for a given NNODES:
2 for 4 and 16 nodes, 8 for 64 nodes.
\item The \texttt{jacobi} and \texttt{residue} contributions dominate
largely with $0.63\le t_\text{jacobi}/t_\text{mg}\le 0.83$ and
$0.09\le t_\text{residue}/t_\text{mg}\le 0.18$.
\item The \emph{overhead} (see Eq.~\ref{eq:timings}) times increase
with NNODES but decrease slightly for increasing NT.
\end{enumerate}
The times of the different MG components and the relative
contributions for the \emph{weak scaling}
experiments are shown in Fig.~\ref{fig:hybrid_weak} and
Fig.~\ref{fig:hybrid_weak_contrib} respectively. The following
observations can be made:
\begin{enumerate}
\item A steady increase of MG times with the number of nodes can
be attributed to the increase of the ghost cell \texttt{exchange} time,
even though the amount of communication between nodes does not
change.
\item The MG performance is improved slightly when NT=2 but drops
drastically for NT=16. This seems to indicate that
\emph{numa} effects are important here, since the array initialization
is not done locally on each thread.
\item The \emph{overhead} (see Eq.~\ref{eq:timings}) times are much
smaller than in the \emph{strong scaling} runs.
\end{enumerate}
\begin{sidewaysfigure}
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_strong}
\caption{Detailed timings for strong scaling experiments using the same
problem parameters as in Fig.~\ref{fig:single_node}, except that 5
levels are chosen to be able to perform the runs with 64 nodes.}
\label{fig:hybrid_strong}
\end{sidewaysfigure}
\begin{sidewaysfigure}
%\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_strong_contrib}
\caption{Relative contributions of each of the MG components for the
strong scaling experiments using the same
problem parameters as in Fig.~\ref{fig:single_node}.}
\label{fig:hybrid_strong_contrib}
%\end{figure}
\end{sidewaysfigure}
\begin{sidewaysfigure}
%\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_weak}
\caption{Detailed timings for weak scaling experiments using the same
problem parameters as in Fig.~\ref{fig:single_node}.}
\label{fig:hybrid_weak}
%\end{figure}
\end{sidewaysfigure}
\begin{sidewaysfigure}
%\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_weak_contrib}
\caption{Relative contributions of each of the MG components for the
weak scaling experiments using the same
problem parameters as in Fig.~\ref{fig:single_node}.}
\label{fig:hybrid_weak_contrib}
%\end{figure}
\end{sidewaysfigure}
Finally, Table~\ref{tab:memory} shows that using $\text{NT}>1$
decreases the memory needed by the multigrid procedure for both strong
and weak scaling runs.
\begin{table}[htb]
\begin{center}
\begin{tabular}{|l|c||r|r|r|r|r|}
\hline
&\texttt{\textbf{NNODES}}& NT=1 & NT=2 & NT=4 & NT=8 & NT=16 \\
\hline
\multirow{4}{*}{\texttt{\textbf{Strong Scaling}}}
&1 & 57.01 & 53.70 & 52.04 & 51.06 & 48.57 \\
&4 & 21.87 & 21.49 & 15.75 & 13.87 & 13.11 \\
&16 & 13.82 & 8.52 & 5.75 & 5.69 & 4.00 \\
&64 & 13.93 & 6.80 & 3.73 & 2.32 & 1.48 \\
\hline\hline
\multirow{4}{*}{\texttt{\textbf{Weak Scaling}}}
&1 & 57.03 & 53.71 & 52.04 & 51.08 & 48.61 \\
&4 & 59.24 & 59.18 & 53.39 & 51.52 & 48.69 \\
&16 & 60.48 & 55.33 & 52.69 & 52.74 & 49.06 \\
&64 & 63.30 & 56.13 & 53.30 & 51.58 & 48.79 \\
\hline
\end{tabular}
\end{center}
\caption{Memory footprint \emph{per core} (MB/core) for the strong scaling and weak
scaling experiments.}
\label{tab:memory}
\end{table}
\subsection{Summary and conclusions}
The \emph{strong scaling} and \emph{weak scaling} with respect to NT and NNODES
are summarized in Fig.~\ref{fig:scaling}. The speedup for the strong
scaling experiments shows a good efficiency up to 16 nodes
for all NT but degrades at 64 nodes (1024 cores), due to the partitioned
grid becoming too small. A good \emph{weak scaling} is also
obtained, with an increase in $t_\text{mg}$ of
less than $10\%$ when NNODES varies from 4 to 64. However, for NT=16, the
efficiency drops significantly, due to the non-local memory access
when the OpenMP threads are placed on both sockets (\emph{numa} effect).
In order to improve the hybrid MPI+OpenMP multigrid, especially for a large number
of threads per MPI process NT, the following optimizations should be done:
\begin{itemize}
\item \emph{First touch} array initialization in order to avoid
\emph{numa} effects.
\item OpenMP parallelization of some remaining \emph{serial} loops.
\item Better vectorization of inner loops.
\end{itemize}
The outcome of these optimization steps is important in order to run
efficiently on upcoming \emph{multicore} processors and
\emph{manycore} (MIC) devices.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.9\hsize]{\RepFigures/scaling}
\caption{Strong scaling with a $1024\times 4096$ grid size (left) and
weak scaling (right) with grid sizes $1024\times 4096$, $2048\times 8192$,
$4096\times 16384$ and $8192\times32768$, respectively, for 1, 4, 16
and 64 nodes.}
\label{fig:scaling}
\end{figure}
\FloatBarrier
\pagebreak
\begin{thebibliography}{99}
\bibitem{Briggs} {W.L.~Briggs, V.E.~Henson and S.F.~McCormick, A
Multigrid Tutorial, Second Edition, SIAM (2000)}.
\bibitem{MUMPS} \url{http://graal.ens-lyon.fr/MUMPS/}.
\bibitem{MG1D} {\tt Multigrid Formulation for Finite Elements},\\
\url{https://crppsvn.epfl.ch/repos/bsplines/trunk/multigrid/docs/multigrid.pdf}
\bibitem{TEMPL} {R. Barrett, M. Berry, T. F. Chan,
J. Demmel, J. Donato, J. Dongarra, V. Eijkhout,
R. Pozo, C. Romine and H. Van der Vorst,
Templates for the Solution of Linear Systems: Building Blocks for
Iterative Methods, 2nd Edition, SIAM (1994)}.
\bibitem{Wesseling} {P.~Wesseling, An Introduction to Multigrid
Methods, Edwards, 2004}.
\bibitem{salpha} {X. Lapillonne, S. Brunner, T. Dannert, S. Jolliet,
A. Marinoni et al., Phys. Plasmas 16, 032308 (2009)}.
\end{thebibliography}
\end{document}
