%
% @file mg_gbs.tex
%
% @brief
%
% @copyright
% Copyright (©) 2021 EPFL (Ecole Polytechnique Fédérale de Lausanne)
% SPC (Swiss Plasma Center)
%
% spclibs is free software: you can redistribute it and/or modify it under
% the terms of the GNU Lesser General Public License as published by the Free
% Software Foundation, either version 3 of the License, or (at your option)
% any later version.
%
% spclibs is distributed in the hope that it will be useful, but WITHOUT ANY
% WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
% FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
%
% You should have received a copy of the GNU Lesser General Public License
% along with this program. If not, see <https://www.gnu.org/licenses/>.
%
% @authors
% (in alphabetical order)
% @author Trach-Minh Tran <trach-minh.tran@epfl.ch>
%
\documentclass[a4paper]{article}
\usepackage{linuxdoc-sgml}
\usepackage{graphicx}
\usepackage{hyperref}
%\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{placeins}
\usepackage{multirow}
\usepackage{latexsym}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{rotating}

\def\RepFigures{FIGURES_mg_gbs}

\title{\tt Multigrid Solver for GBS}
\author{Trach-Minh Tran, Federico Halpern\\
  CRPP/EPFL}
\date{v1.0, June 2015}

\begin{document}
\maketitle
\tableofcontents
\section{The PDE}
The PDE considered is
\begin{equation}
  \label{eq:pde}
  \left[\frac{\partial^2}{\partial x^2} + \tau\frac{\partial^2}{\partial x\partial y}
    + \frac{\partial^2}{\partial y^2} - a(x,y)\right] u(x,y) = f(x,y),
  \qquad 0\le x\le L_x,\; 0\le y\le L_y.
\end{equation}
On the four boundaries, either the homogeneous Dirichlet boundary condition
$u=0$ or the Neumann boundary condition $\partial u/\partial n=0$ can be applied.
\section{Discretization}
The grid points $(x_i,y_j)$ are defined by
\begin{equation}
  \begin{split}
    x_i &= ih_x = i\frac{L_x}{N_x}, \quad i=0,\ldots,N_x \\
    y_j &= jh_y = j\frac{L_y}{N_y}, \quad j=0,\ldots,N_y \\
  \end{split}
\end{equation}
Second order Finite Difference discretization of Eq.~\ref{eq:pde} leads
to the following 9-point stencil
\begin{equation}
  \label{eq:stencil}
  S_{ij} = \frac{1}{h_x^2}
  \begin{bmatrix}
    -\tau\alpha/4 & \alpha^2 & \tau\alpha/4 \\
    1 & -2(1+\alpha^2)-h_x^2a_{ij} & 1 \\
    \tau\alpha/4 & \alpha^2 & -\tau\alpha/4 \\
  \end{bmatrix},
  \qquad \mbox{where $\alpha = h_x/h_y$}.
\end{equation}
Note that the mesh aspect ratio $\alpha$ results in the same stencil
for the \emph{anisotropic} Poisson equation with $h_x=h_y$:
\begin{equation}
  \frac{\partial^2 u}{\partial x^2} + \alpha^2\frac{\partial^2 u}{\partial y^2} = f.
\end{equation}
It is shown in \cite[p.~119]{Briggs} that this anisotropy can strongly degrade
the performance of multigrid with standard relaxations such as
Gauss-Seidel or damped Jacobi.
With the \emph{lexicographic} numbering
\begin{equation}
  I = j(N_x+1) + i+1,
\end{equation}
for the $(N_x+1)(N_y+1)$ nodes, the discretized problem can be
expressed as a matrix problem
\begin{equation}
  \label{eq:matrix}
  \mathbf{Au} = \mathbf{f},
\end{equation}
where $\mathbf{A}$ is a 9-diagonal matrix, assembled using the stencil defined
above. The homogeneous Dirichlet boundary condition can be imposed, for example,
on the face $j=0$ simply by \emph{clearing} the matrix rows and columns
$1,2,\ldots,N_x+1$, and setting the diagonal terms to 1.
The Neumann boundary condition $\partial u/\partial x = 0$ at
the face $i=0$ can simply be implemented by imposing $u_{-1,j} = u_{1,j}$.
The stencil for the boundary nodes $(0,j)$ can thus be modified as
\begin{equation}
  S_{0j} = \frac{1}{h_x^2}
  \begin{bmatrix}
    0 & \alpha^2 & 0 \\
    0 & -2(1+\alpha^2)-h_x^2a_{0j} & 2 \\
    0 & \alpha^2 & 0 \\
  \end{bmatrix}.
\end{equation}
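For illustration, the interior stencil~(\ref{eq:stencil}) and its Neumann-modified
version at the face $i=0$ can be filled as in the following minimal Fortran
sketch. The routine name, the argument list and the $(\delta_x,\delta_y)$ array
layout are assumptions made for this example only; they are not taken from the
solver sources.
\begin{lstlisting}[language=Fortran]
! Minimal sketch: 3x3 stencil coefficients indexed by the offsets (dx,dy).
! s_int corresponds to Eq. (eq:stencil), s_west to the Neumann-modified
! stencil at the face i=0.  All names are illustrative.
subroutine fill_stencils(alpha, tau, hx, aij, s_int, s_west)
  double precision, intent(in)  :: alpha, tau, hx, aij
  double precision, intent(out) :: s_int(-1:1,-1:1), s_west(-1:1,-1:1)
  s_int = 0.d0
  s_int(-1,-1) =  tau*alpha/4.d0;  s_int( 1, 1) =  tau*alpha/4.d0
  s_int(-1, 1) = -tau*alpha/4.d0;  s_int( 1,-1) = -tau*alpha/4.d0
  s_int( 0,-1) =  alpha**2;        s_int( 0, 1) =  alpha**2
  s_int(-1, 0) =  1.d0;            s_int( 1, 0) =  1.d0
  s_int( 0, 0) = -2.d0*(1.d0+alpha**2) - hx**2*aij
  s_int = s_int/hx**2
  ! Face i=0: the mirror condition u(-1,j)=u(1,j) doubles the east weight
  ! and cancels the cross terms.
  s_west = 0.d0
  s_west(0,-1) = alpha**2
  s_west(0, 1) = alpha**2
  s_west(1, 0) = 2.d0
  s_west(0, 0) = -2.d0*(1.d0+alpha**2) - hx**2*aij
  s_west = s_west/hx**2
end subroutine fill_stencils
\end{lstlisting}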
Two model problems are considered in this report:
\begin{description}
\item[\texttt{\textbf{DDDD}} problem:] Homogeneous Dirichlet BC
  at all the 4 boundaries. The \emph{analytic solution} is
  \begin{equation}
    u(x,y) = \sin\frac{2\pi k_x x}{L_x}\sin\frac{2\pi k_y y}{L_y},
    \qquad \mbox{where $k_x$, $k_y$ are positive integers}.
  \end{equation}
\item[\texttt{\textbf{NNDD}} problem:] Neumann boundary conditions
  $\partial u/\partial x = 0$ at $x=0$ and $x=L_x$, homogeneous
  Dirichlet BC at $y=0$ and $y=L_y$. The \emph{analytic solution} is
  \begin{equation}
    u(x,y) = \cos\frac{2\pi k_x x}{L_x}\sin\frac{2\pi k_y y}{L_y},
    \qquad \mbox{where $k_x$, $k_y$ are positive integers}.
  \end{equation}
\end{description}
In both problems, $a$ depends only on $x$:
\begin{equation}
  \label{eq:density}
  a(x,y)=\exp\left[-\frac{(x-L_x/3)^2}{(L_x/2)^2}\right].
\end{equation}
The sparse direct solver MUMPS \cite{MUMPS} is used to solve (\ref{eq:matrix}) in
order to check the convergence of the scheme described
above. Fig.~\ref{fig:convergence} shows the expected \emph{quadratic}
convergence of the error with respect to $h_x$ with fixed
$\alpha=h_x/h_y=0.5$ for both problems, when the grid size is varied from
$16\times 64$ to $512\times 2048$.
\begin{figure}[hbt]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/convergence}
  \caption{Convergence of the error $\|u_{calc}-u_{anal}\|_\infty$ wrt the number
    of intervals in the $x$ direction $N_x$ for $L_x=100$, $L_y=800$,
    $k_x=k_y=4$, $\tau=1$ and $N_y=4N_x$.}
  \label{fig:convergence}
\end{figure}
\section{Multigrid $V$-cycle}
\label{sec-mgProc}
Given an approximate $\mathbf{u}^h$ and right hand side $\mathbf{f}^h$
defined at some grid level represented by the grid spacing $h$, the
following MG $V$-cycle procedure
\begin{equation*}
  \boxed{\mathbf{u}^h \leftarrow MG^h(\mathbf{u}^h, \mathbf{f}^h)}
\end{equation*}
will compute a \emph{new} $\mathbf{u}^h$. It is defined recursively by the
following steps:
\begin{enumerate}
\item If $h$ is the coarsest mesh size,
  \begin{itemize}
  \item Direct solve $\mathbf{A}^h\mathbf{u}^h = \mathbf{f}^h$
  \item Goto 3.
  \end{itemize}
\item Else
  \begin{itemize}
  \item Relax $\mathbf{u}^h$ $\nu_1$ times.
  \item $\mathbf{f}^{2h} \leftarrow {\mathbf{R}}(\mathbf{f}^h - \mathbf{A}^h\mathbf{u}^h)$.
  \item $\mathbf{u}^{2h} \leftarrow MG^{2h}(\mathbf{u}^{2h}, \mathbf{f}^{2h})$ $\mu$ times.
  \item $\mathbf{u}^h \leftarrow \mathbf{u}^h + {\mathbf{P}}\mathbf{u}^{2h}$.
  \item Relax $\mathbf{u}^h$ $\nu_2$ times.
  \end{itemize}
\item Return
\end{enumerate}
In the procedure above, the operators $\mathbf{R}$ and $\mathbf{P}$
denote respectively the \emph{restriction} (from \emph{fine} to
\emph{coarse} grid) and the \emph{prolongation} (from \emph{coarse} to
\emph{fine} grid). Notice that in this multigrid procedure, $\mathbf{R}$
applies only to the \emph{right hand side} while $\mathbf{P}$
applies only to the \emph{solution}. The standard $V(\nu_1,\nu_2)$ cycle is
obtained by calling this $MG^h$ procedure with $\mathbf{f}^h$ defined at the
\emph{finest} grid level, a guess $\mathbf{u}^h=0$ and $\mu=1$, while
$\mu=2$ results in the $W(\nu_1,\nu_2)$ cycle.
Details on the grid coarsening, the inter-grid transfers and methods
of relaxation are given in the following.
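The recursive structure of the procedure translates almost directly into code.
The following Fortran sketch only illustrates the control flow of one
$V(\nu_1,\nu_2)$ cycle with $\mu=1$; the module \texttt{mg\_kernels} and the
routines \texttt{relax}, \texttt{residual}, \texttt{restrict}, \texttt{prolong}
and \texttt{direct\_solve} are assumed interfaces, not those of the actual
implementation.
\begin{lstlisting}[language=Fortran]
! Control-flow sketch of one MG V-cycle (mu=1); the module mg_kernels and
! all routine interfaces are assumed, they are not those of the actual solver.
recursive subroutine mg_vcycle(lev, nlevels, u, f, nu1, nu2)
  use mg_kernels, only: relax, residual, restrict, prolong, direct_solve
  integer, intent(in) :: lev, nlevels, nu1, nu2
  double precision, intent(inout) :: u(:,:)   ! current solution u^h
  double precision, intent(in)    :: f(:,:)   ! right hand side f^h
  double precision, allocatable   :: r(:,:), f2h(:,:), u2h(:,:)
  if (lev == nlevels) then
     call direct_solve(lev, u, f)             ! coarsest level: direct solve
     return
  end if
  call relax(lev, u, f, nu1)                  ! nu1 pre-relaxation sweeps
  allocate(r, mold=f)
  call residual(lev, r, u, f)                 ! r = f^h - A^h u^h
  call restrict(lev, r, f2h)                  ! f^{2h} = R r (f2h allocated inside)
  allocate(u2h, mold=f2h)
  u2h = 0.d0                                  ! zero initial guess on coarse grid
  call mg_vcycle(lev+1, nlevels, u2h, f2h, nu1, nu2)
  call prolong(lev, u2h, u)                   ! u^h = u^h + P u^{2h}
  call relax(lev, u, f, nu2)                  ! nu2 post-relaxation sweeps
end subroutine mg_vcycle
\end{lstlisting}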
\subsection{Grid coarsening}
Let us start with the one-dimensional \emph{fine} grid defined
by $x_i,\, i=0,\ldots,N$, assuming that $N$ is even. The next coarse grid
(with $N/2$ intervals) is obtained by simply discarding the grid
points with \emph{odd} indices.
In order to get the smallest possible \emph{coarsest} grid (so that the
problem can be solved \emph{cheaply} with a \emph{direct} method), $N$
should be $N = N_c2^{L-1}$ where $L$ is the total number of grid levels and
$N_c$ is either 2 or a \emph{small odd} integer. As an example, the fine grid
with $N=768$ can have up to 9 grid levels, and a coarsest grid with 3 intervals,
see Table~\ref{tab:level}.
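The number of usable grid levels for a given $N$ is simply the number of times
$N$ can be halved while remaining even. The short Fortran helper below is an
illustrative sketch only (not part of the solver); for $N=768$ it returns the
9 levels quoted above.
\begin{lstlisting}[language=Fortran]
! Illustrative helper: number of grid levels obtainable from N intervals,
! halting when the interval count becomes odd (or reaches 2).
function max_levels(n) result(nlev)
  integer, intent(in) :: n
  integer :: nlev, m
  m = n
  nlev = 1
  do while (mod(m,2) == 0 .and. m > 2)
     m = m/2
     nlev = nlev + 1
  end do
end function max_levels
\end{lstlisting}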
\begin{table}[hbt]
  \centering
  \begin{tabular}{|l||r|r|r|r|r|}
    \hline
    $L$ & \multicolumn{5}{c|}{$N$} \\
    \hline
    1  & 2    & 3    & 5    & 7    & 9    \\
    2  & 4    & 6    & 10   & 14   & 18   \\
    3  & 8    & 12   & 20   & 28   & 36   \\
    4  & 16   & 24   & 40   & 56   & 72   \\
    5  & 32   & 48   & 80   & 112  & 144  \\
    6  & 64   & 96   & 160  & 224  & 288  \\
    7  & 128  & 192  & 320  & 448  & 576  \\
    8  & 256  & 384  & 640  & 896  & 1152 \\
    9  & 512  & 768  & 1280 & 1792 & 2304 \\
    10 & 1024 & 1536 & 2560 & 3584 & 4608 \\
    \hline
  \end{tabular}
  \caption{Set of values of the \emph{fine} grid number of intervals $N$
    to obtain a \emph{coarsest} grid size at most equal to $9$ with
    at most $10$ grid levels.}
  \label{tab:level}
\end{table}
For a two-dimensional grid, the same procedure is applied to both
dimensions. The result of such a procedure is illustrated in
Fig.~\ref{fig:2d_coarsening}, for an $8\times 4$ fine grid.
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.6\hsize]{grid}
  \caption{A \emph{coarse} $4\times 2$ grid ($\Box$) obtained from an
    $8\times 4$ fine grid ($\bullet$).}
  \label{fig:2d_coarsening}
\end{figure}
\subsection{Inter-grid transfers}
The one-dimensional \emph{prolongation} operator for the second-order FD
discretization is chosen to be the same as the one obtained with
\emph{linear Finite Elements} \cite{MG1D}. For an $N=8$ grid, it can be
represented as a $9\times 5$ matrix given by
\begin{equation}
  \label{eq:1dprolongation}
  \mathbf{P} = \left(
    \begin{matrix}
      1   & 0   & 0   & 0   & 0   \\
      1/2 & 1/2 & 0   & 0   & 0   \\
      0   & 1   & 0   & 0   & 0   \\
      0   & 1/2 & 1/2 & 0   & 0   \\
      0   & 0   & 1   & 0   & 0   \\
      0   & 0   & 1/2 & 1/2 & 0   \\
      0   & 0   & 0   & 1   & 0   \\
      0   & 0   & 0   & 1/2 & 1/2 \\
      0   & 0   & 0   & 0   & 1   \\
    \end{matrix}
  \right)
\end{equation}
The \emph{restriction} matrix $\mathbf{R}$ is simply related to $\mathbf{P}$ by
\begin{equation}
  \label{eq:1drestriction}
  \mathbf{R} = \frac{1}{2}\mathbf{P}^{T} = \frac{1}{2}\left(
    \begin{matrix}
      1 & 1/2 & 0 & 0   & 0 & 0   & 0 & 0   & 0 \\
      0 & 1/2 & 1 & 1/2 & 0 & 0   & 0 & 0   & 0 \\
      0 & 0   & 0 & 1/2 & 1 & 1/2 & 0 & 0   & 0 \\
      0 & 0   & 0 & 0   & 0 & 1/2 & 1 & 1/2 & 0 \\
      0 & 0   & 0 & 0   & 0 & 0   & 0 & 1/2 & 1 \\
    \end{matrix}
  \right).
\end{equation}
For Dirichlet BC imposed on the \emph{left} boundary one has to set
${P}_{21} = R_{12} = 0$, while for Dirichlet BC imposed on the \emph{right}
boundary, ${P}_{N,N/2+1} = R_{N/2+1,N} = 0$. Notice that these inter-grid
operators are identical to the standard \emph{linear interpolation} and
\emph{full weighting} operators.
For a two-dimensional problem, using the property that the grid is a
\emph{tensor product} of two one-dimensional grids, the
restriction of the right hand side $f^{h}_{ij}$ and the prolongation of
the solution $u^{2h}_{ij}$ can be computed as
\begin{equation}
  \label{eq:2dintergrid}
  \begin{split}
    \mathbf{f}^{2h} &= \mathbf{R}^x\cdot\mathbf{f}^{h}\cdot(\mathbf{R}^y)^T \\
    \mathbf{u}^{h}  &= \mathbf{P}^x\cdot\mathbf{u}^{2h}\cdot(\mathbf{P}^y)^T \\
  \end{split}
\end{equation}
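As an illustration only, these tensor-product transfers can be written with
dense 1D transfer matrices and \texttt{matmul}; the solver does not need to
store $\mathbf{R}$ and $\mathbf{P}$ as dense matrices, and all names in this
sketch are assumptions.
\begin{lstlisting}[language=Fortran]
! Illustrative dense tensor-product transfers of Eq. (eq:2dintergrid).
! rx is (Nx/2+1) x (Nx+1) and ry is (Ny/2+1) x (Ny+1), built as in
! Eq. (eq:1drestriction); px = 2*transpose(rx), py = 2*transpose(ry).
subroutine intergrid_dense(rx, ry, px, py, fh, f2h, u2h, uh)
  double precision, intent(in)  :: rx(:,:), ry(:,:), px(:,:), py(:,:)
  double precision, intent(in)  :: fh(:,:), u2h(:,:)
  double precision, intent(out) :: f2h(:,:), uh(:,:)
  ! Restriction of the fine-grid right hand side
  f2h = matmul(rx, matmul(fh, transpose(ry)))
  ! Prolongation of the coarse-grid solution
  uh  = matmul(px, matmul(u2h, transpose(py)))
end subroutine intergrid_dense
\end{lstlisting}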
\subsection{Relaxations}
Gauss-Seidel and damped Jacobi iterations are used as relaxation
methods in the multigrid $V$ cycle. In general, the Gauss-Seidel
method is more efficient but much more difficult to \emph{parallelize}
than the Jacobi method.
It should be noted that if $a(x,y)$ in Eq.~\ref{eq:pde} is
negative, both relaxations diverge! This can be seen by
considering the following one-dimensional FD equation with uniform $a$:
\begin{equation}
  u_{j-1} -(2+ah^2)u_j + u_{j+1} = h^2f_j.
\end{equation}
Using the damped Jacobi relaxation, the error
$\epsilon^{(m)}_j \equiv u_{anal}(x_j)-u_j^{(m)}$ at iteration $m+1$ is given by
\begin{equation}
  \epsilon^{(m+1)}_j = \frac{\omega}{2+h^2a}
  (\epsilon^{(m)}_{j-1} + \epsilon^{(m)}_{j+1})
  +(1-\omega)\epsilon^{(m)}_j .
\end{equation}
Performing a \emph{local mode analysis} (or Fourier analysis) (see
\cite[p.~48]{Briggs}), assuming that
$\epsilon^{(m)}_j = A(m)e^{ij\theta}$, where $\theta$ is related to the
mode number $k$ by $\theta = 2\pi k/N$, the \emph{amplification factor}
$G(\theta)$ is obtained as
\begin{equation}
  \begin{split}
    G(\theta) &= \frac{A(m+1)}{A(m)} = \frac{2\omega}{2+h^2a}\cos\theta + (1-\omega) \\
    &= G_0(\theta) - \frac{\omega h^2a}{2+h^2a}\cos\theta
    \simeq G_0(\theta) - \frac{\omega h^2a}{2}\cos\theta, \\
    G_0(\theta) &= 1-2\omega\sin^2\frac{\theta}{2}, \\
  \end{split}
\end{equation}
where $G_0(\theta)$ is the amplification factor for $a=0$. Note that
$|G_0(\theta)|<1$ for \emph{all} $\theta$ and $0<\omega<1$ but
$\displaystyle{\max_{|\theta|<\pi}|G(\theta)|>1}$ if $a<0$.
In the Gauss-Seidel relaxation method, the errors evolve as:
\begin{equation}
  \epsilon^{(m+1)}_j = \frac{\epsilon^{(m+1)}_{j-1} + \epsilon^{(m)}_{j+1}}{2+h^2a}.
\end{equation}
Applying again the same Fourier analysis yields the
following complex amplification factor:
\begin{equation}
  \begin{split}
    G(\theta) &\simeq G_0(\theta)\left(1-\frac{h^2a}{2-e^{-i\theta}}\right) \\
    G_0(\theta) &= \frac{e^{i\theta}}{2-e^{-i\theta}}, \quad |G_0(\theta)| < 1
  \end{split}
\end{equation}
which shows that the Gauss-Seidel relaxations \emph{diverge} if $a<0$.
Notice that when $a>0$, the effect of $a$ on the amplification is
negligible and is thus ignored in the following two-dimensional
analysis. Applying the damped Jacobi scheme on the
stencil~(\ref{eq:stencil}), the error at iteration $m+1$ is given by:
\begin{equation}
  \begin{split}
    \epsilon^{(m+1)}_{ij} = &\frac{\omega}{2(1+\alpha^2)}\left[
      \epsilon^{(m)}_{i-1,j} + \epsilon^{(m)}_{i+1,j}
      + \alpha^2(\epsilon^{(m)}_{i,j-1} + \epsilon^{(m)}_{i,j+1})
      + \frac{\tau\alpha}{4}(\epsilon^{(m)}_{i+1,j+1} + \epsilon^{(m)}_{i-1,j-1}
      - \epsilon^{(m)}_{i-1,j+1} - \epsilon^{(m)}_{i+1,j-1})\right] \\
    &+ (1-\omega)\epsilon^{(m)}_{ij}. \\
  \end{split}
\end{equation}
Using the two-dimensional Fourier mode expression
\begin{equation}
  \epsilon^{(m)}_{ij} = A(m)e^{i(i\theta_1+j\theta_2)},
  \quad -\pi < \theta_1,\theta_2 \le \pi,
\end{equation}
the amplification factor $G = A(m+1)/A(m)$ is given by
\begin{equation}
  \label{eq:amp_jac}
  G(\theta_1,\theta_2;\omega,\alpha,\tau) = 1 - \frac{2\omega}{1+\alpha^2}
  \left(\sin^2\frac{\theta_1}{2} + \alpha^2\sin^2\frac{\theta_2}{2}
    + \frac{\tau\alpha}{4}\sin\theta_1\,\sin\theta_2\right).
\end{equation}
The errors in the Gauss-Seidel method, assuming a \emph{lexicographic} ordering
for the unknowns (increasing first $i$ then $j$), are updated according to
\begin{equation}
  \epsilon^{(m+1)}_{ij} = \frac{1}{2(1+\alpha^2)}\left[
    \epsilon^{(m+1)}_{i-1,j} + \epsilon^{(m)}_{i+1,j}
    + \alpha^2(\epsilon^{(m+1)}_{i,j-1} + \epsilon^{(m)}_{i,j+1})
    + \frac{\tau\alpha}{4}(\epsilon^{(m)}_{i+1,j+1} + \epsilon^{(m+1)}_{i-1,j-1}
    - \epsilon^{(m)}_{i-1,j+1} - \epsilon^{(m+1)}_{i+1,j-1})\right].
\end{equation}
The Fourier mode analysis then leads to the following complex
amplification factor
\begin{equation}
  \label{eq:amp_gs}
  G(\theta_1,\theta_2;\alpha,\tau) =
  \frac{e^{i\theta_1} + \left(\alpha^2+i\dfrac{\tau\alpha}{2}\sin\theta_1\right)e^{i\theta_2}}
       {2(1+\alpha^2) - \left[e^{-i\theta_1}
         + \left(\alpha^2 - i\dfrac{\tau\alpha}{2}\sin\theta_1\right)e^{-i\theta_2}\right]}.
\end{equation}
Curves of $G$ for \emph{fixed} $\theta_2$ are plotted in
Fig.~\ref{fig:fourier_jac}, showing \emph{convergence} ($\max|G|<1$) for
$\tau=-1,0,1,2$, using the damped Jacobi method. The
same conclusions are obtained for Gauss-Seidel relaxations, as shown
in Fig.~\ref{fig:fourier_gs} where the absolute values
of the complex amplification factor $G$ are plotted. However, for larger
$|\tau|>2$, both methods diverge, as can be seen in
Fig.~\ref{fig:relax_diverge}. Notice, however, that the PDE
(\ref{eq:pde}) is \emph{elliptic} only when $|\tau|<2$ is satisfied!
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.85\hsize]{\RepFigures/fourier_jac}
  \caption{Amplification factor for damped Jacobi relaxations with
    $\omega=0.8$, $\alpha=h_x/h_y=1$ and $\tau=-1,0,1,2$, displayed as curves of
    constant $\theta_2$. $\theta_2=0$ on the \emph{green} curve and $\pi$ on
    the \emph{red} curve.}
  \label{fig:fourier_jac}
\end{figure}
\begin{figure}[hbt]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/fourier_gs}
  \caption{Absolute value of the amplification factor for Gauss-Seidel
    relaxations with $\alpha=h_x/h_y=1$ and $\tau=-1,0,1,2$, displayed as curves of
    constant $\theta_2$. $\theta_2=0$ on the \emph{green} curve and $\pi$ on
    the \emph{red} curve.}
  \label{fig:fourier_gs}
\end{figure}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/relax_diverge}
  \caption{Amplification factor for Jacobi (left) and Gauss-Seidel (right)
    relaxations for $|\tau|=3,5$, $\alpha=h_x/h_y=1$, displayed as curves of
    constant $\theta_2$. $\theta_2=0$ on the \emph{green} curve and $\pi$ on
    the \emph{red} curve.}
  \label{fig:relax_diverge}
\end{figure}
In summary, the local mode analysis predicts that
\begin{itemize}
\item Negative values of the coefficient $a$ and a large
  mixed derivative ($|\tau| > 2$) can make both damped Jacobi and
  Gauss-Seidel relaxations diverge.
\item Positive values of $a$ can decrease the amplification factor
  (thus improving the convergence rate), but its contribution $h^2a$
  decreases for increasing grid resolution.
\end{itemize}
These predictions will be checked against numerical experiments in the
next section.
\FloatBarrier
\section{Numerical Experiments}
\label{sec:NumExp1}
In the following numerical experiments, we look at the convergence
rate of the residual norm and the error norm, which are defined at
iteration $m$ by
\begin{equation}
  \begin{split}
    r^{(m)} &= \|\mathbf{f} - \mathbf{A}\mathbf{u}^{(m)}\|_\infty, \\
    e^{(m)} &= \|\mathbf{u}^{(m)} - \mathbf{u}_{anal}\|_\infty. \\
  \end{split}
\end{equation}
The iterations are stopped when the number of iterations reaches a
user supplied \emph{maximum} number of iterations, or when the residual norm
is smaller than either a given \emph{relative} tolerance {\tt rtol}
\cite[p.~51]{TEMPL} or an \emph{absolute} tolerance {\tt atol}:
\begin{equation}
  \begin{split}
    r^{(m)} &< \mbox{\tt rtol}\cdot(\|\mathbf{A}\|_\infty\cdot\|\mathbf{u}^{(m)}\|_\infty
    + \|\mathbf{f}\|_\infty), \\
    r^{(m)} &< \mbox{\tt atol}. \\
  \end{split}
\end{equation}
An additional stopping criterion consists of stopping the
iterations when the change of the discretization error between
successive iterations is small enough:
\begin{equation}
  \frac{e^{(m)}-e^{(m-1)}}{e^{(m-1)}} < \mbox{\tt etol}.
\end{equation}
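The three tolerance tests can be grouped into a single convergence check, as
in the sketch below; the variable names are illustrative and do not correspond
to the actual solver.
\begin{lstlisting}[language=Fortran]
! Sketch of the combined stopping test; all names are illustrative.
logical function converged(resid, anorm, unorm, fnorm, err, err_prev, &
     &                     rtol, atol, etol)
  double precision, intent(in) :: resid            ! ||f - A u||_inf
  double precision, intent(in) :: anorm, unorm, fnorm
  double precision, intent(in) :: err, err_prev    ! e(m) and e(m-1)
  double precision, intent(in) :: rtol, atol, etol
  converged = resid < rtol*(anorm*unorm + fnorm) &  ! relative tolerance
       & .or. resid < atol &                        ! absolute tolerance
       & .or. (err - err_prev)/err_prev < etol      ! change of discretization error
end function converged
\end{lstlisting}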
\subsection{$V$-cycle performances}
Table~\ref{tab:iternum} shows the numbers of $V$-cycles required to
reach the \emph{relative tolerance} $\mbox{\tt rtol}=10^{-8}$. In these runs,
where $\alpha=0.5$, $\tau=1$ and $a(x,y)$ is given by Eq.~\ref{eq:density},
we observe that the biggest improvement is obtained at $\nu_1=\nu_2=2$. For larger
$\nu_1,\nu_2$, the number of required iterations is relatively insensitive
to the grid sizes. As can be seen in Fig.~\ref{fig:mg_iterations},
which plots the evolution of the error $e^{(m)}$, it
is clear that the level of the discretization error has been largely
reached. Finally, the times used by these runs are shown in
Fig.~\ref{fig:dddd_pc220} and Fig.~\ref{fig:nndd_pc220}, where the
times spent in the direct solver using MUMPS \cite{MUMPS} are included
for comparison. For the $512\times 2048$ grid (the largest
case using the direct solver), the multigrid $V(3,3)$ is about
$30$ times faster!
\begin{table}[htb]
  \centering
  \begin{tabular}{|l||c|c|c|c||c|c|c|c|}
    \hline
    & \multicolumn{4}{c||}{\texttt{\textbf{DDDD}} problem}
    & \multicolumn{4}{c|}{\texttt{\textbf{NNDD}} problem} \\
    \cline{2-9}
    Grid size & $V(1,1)$ & $V(2,2)$ & $V(3,3)$ & $V(4,4)$
              & $V(1,1)$ & $V(2,2)$ & $V(3,3)$ & $V(4,4)$ \\
    \hline
    $16\times 64$     & 3  & 2 & 2 & 1 & 4  & 2 & 2 & 1 \\
    $32\times 128$    & 5  & 3 & 2 & 2 & 5  & 3 & 2 & 2 \\
    $64\times 256$    & 7  & 4 & 3 & 3 & 7  & 4 & 3 & 3 \\
    $128\times 512$   & 10 & 6 & 4 & 4 & 10 & 6 & 5 & 4 \\
    $256\times 1024$  & 11 & 6 & 5 & 4 & 11 & 6 & 5 & 4 \\
    $512\times 2048$  & 11 & 6 & 5 & 4 & 11 & 6 & 5 & 4 \\
    $1024\times 4096$ & 10 & 6 & 4 & 4 & 10 & 6 & 4 & 4 \\
    $1536\times 6144$ & 9  & 6 & 4 & 4 & 9  & 5 & 4 & 3 \\
    \hline
  \end{tabular}
  \caption{Multigrid $V$-cycle results for the {\tt DDDD} and {\tt NNDD}
    model problems with $k_x=k_y=4$, $L_x=100$, $L_y=800$, $\tau=1$
    and $a(x,y)$ given by Eq.~\ref{eq:density}. Shown are the numbers of
    multigrid $V$-cycles required to reduce the \emph{relative}
    residual norm to less than $10^{-8}$ for different
    grid sizes and numbers of pre and post relaxation
    sweeps. Gauss-Seidel relaxation is used. The coarsest grid size of the
    $1536\times 6144$ case is $3\times 12$ while all the others have a coarsest
    grid of size $2\times 8$.}
  \label{tab:iternum}
\end{table}
\begin{figure}[htb!]
  \centering
  \includegraphics[width=\textwidth]{\RepFigures/mg_iterations}
  \caption{Performance of the $V(2,2)$-cycle using the Gauss-Seidel
    relaxation scheme for the {\tt DDDD} (upper curve) and {\tt NNDD}
    (lower curve) problem. The relative tolerance {\tt rtol} is
    set to $10^{-8}$ and the coarsest grid size for all the problem sizes
    is fixed to $2\times 8$.}
  \label{fig:mg_iterations}
\end{figure}
The fits of the obtained data show that the multigrid
$V$ cycle cost scales almost \emph{linearly} with the number of
unknowns $N=(N_x+1)(N_y+1)$ (as does the backsolve stage of MUMPS)
while the \emph{total} direct solve time scales as $N^{1.4}$.
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/dddd_pc220}
  \caption{Times used by the multigrid $V$ cycles for the runs reported
    in Table~\ref{tab:iternum} for the \texttt{\textbf{DDDD}} problem. The last 6
    $V(3,3)$ data points are used for the multigrid fit. The
    MUMPS direct solver's cost is included for comparison. All the
    timing results are obtained on an Intel Nehalem i7 processor, using the
    Intel compiler version 13.0.1 and MUMPS-4.10.0.}
  \label{fig:dddd_pc220}
\end{figure}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/nndd_pc220}
  \caption{As in Fig.~\ref{fig:dddd_pc220} for the \texttt{\textbf{NNDD}} problem.}
  \label{fig:nndd_pc220}
\end{figure}
\FloatBarrier
\subsection{Effects of the mesh aspect ratio $\alpha$}
From Table~\ref{tab:anisotropy}, one can observe that the required number of
$V(2,2)$ cycles increases quickly when $\alpha<0.5$ and $\alpha>2$. Advanced
\emph{relaxation} methods and \emph{coarsening} strategies
\cite[chap. 7]{Wesseling} can remedy this performance degradation
but are generally more difficult to parallelize.
\begin{table}[htb]
  \centering
  \begin{tabular}{|l||c|c|}
    \hline
    $\alpha$ & \texttt{\textbf{DDDD}} & \texttt{\textbf{NNDD}} \\
    \hline
    0.125 & 19 & 22 \\
    0.25  & 12 & 12 \\
    0.5   & 6  & 6  \\
    1.0   & 5  & 5  \\
    2.0   & 7  & 7  \\
    4.0   & 20 & 19 \\
    \hline
  \end{tabular}
  \caption{Effects of the \emph{mesh aspect ratio} $\alpha=h_x/h_y$ on the
    number of $V(2,2)$ cycles required to reach $\mbox{\tt rtol}=10^{-8}$ for
    the {\tt DDDD} and {\tt NNDD} model problems. The listed $\alpha$'s are
    obtained by fixing $N_x=256$, $N_y=1024$, $L_x=100$ and varying $L_y$.}
  \label{tab:anisotropy}
\end{table}
\subsection{Effects of the mixed partial derivative}
When $|\tau|>2$, Table~\ref{tab:mixedterm} shows that the multigrid
$V$-cycle diverges, as predicted from the local mode analysis based on
the amplification factor given in Eq.~\ref{eq:amp_gs}. Although the
non-negative coefficient $a$ has a \emph{stabilizing} effect, the
latter already disappears for a $256\times 1024$ grid.
\begin{table}[htb]
  \centering
  \begin{tabular}{|l|l|c|c|c|c|c|c|c|}
    \hline
    & Grid size & $\tau=-3$ & $\tau=-2$ & $\tau=-1$ & $\tau=0$ & $\tau=1$ & $\tau=2$ & $\tau=3$ \\
    \hline
    \multirow{4}{*}{\texttt{\textbf{DDDD}}}
    & $128\times 512$ $(a=0)$ & -  & 39 & 7 & 5 & 7 & 38 & -  \\
    & $128\times 512$         & 16 & 6  & 5 & 4 & 4 & 6  & 17 \\
    & $256\times 1024$        & -  & 8  & 5 & 4 & 5 & 7  & -  \\
    & $512\times 2048$        & -  & 9  & 5 & 4 & 5 & 9  & -  \\
    \hline\hline
    \multirow{4}{*}{\texttt{\textbf{NNDD}}}
    & $128\times 512$ $(a=0)$ & -  & 42 & 7 & 5 & 7 & 41 & -  \\
    & $128\times 512$         & 13 & 6  & 5 & 4 & 5 & 5  & 13 \\
    & $256\times 1024$        & -  & 7  & 5 & 4 & 5 & 7  & -  \\
    & $512\times 2048$        & -  & 7  & 5 & 4 & 5 & 7  & -  \\
    \hline
  \end{tabular}
  \caption{Effects of the mixed derivative term $\tau$ on the
    performances of the $V(3,3)$ cycle. The dashes
    indicate that the $V$-cycle diverges. In these runs, $a(x,y)$ is given by
    Eq.~\ref{eq:density} except for the cases where it is set to
    0. Notice the \emph{stabilizing} effect of $a\ne 0$ for the
    $128\times 512$ grid at $\tau=\pm 3$.}
  \label{tab:mixedterm}
\end{table}
\subsection{Using the damped Jacobi relaxation}
The optimum Jacobi damping factor $\omega$ can be determined by minimizing
the \emph{smoothing factor}, defined as the maximum amplification
factor (\ref{eq:amp_jac}) restricted to the \emph{oscillatory modes}:
\begin{equation}
  \label{eq:mu_jac}
  \mu(\omega,\alpha,\tau) = \max_{(\theta_1,\theta_2)\in\Omega}
  |G(\theta_1,\theta_2;\omega,\alpha,\tau)|,
  \qquad \Omega = [|\theta_1|>\pi/2]\,\bigcup\,[|\theta_2|>\pi/2].
\end{equation}
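The smoothing factor (\ref{eq:mu_jac}) can be evaluated numerically by a
brute-force scan of the oscillatory region, for example as in the sketch
below; the function name and the sampling are illustrative assumptions.
\begin{lstlisting}[language=Fortran]
! Illustrative brute-force evaluation of the smoothing factor (eq:mu_jac)
! for the damped Jacobi amplification factor (eq:amp_jac).
function smoothing_factor(omega, alpha, tau, nth) result(mu)
  double precision, intent(in) :: omega, alpha, tau
  integer, intent(in) :: nth           ! number of sample points per direction
  double precision :: mu, t1, t2, g
  double precision, parameter :: pi = acos(-1.d0)
  integer :: i1, i2
  mu = 0.d0
  do i2 = -nth, nth
     t2 = pi*i2/nth
     do i1 = -nth, nth
        t1 = pi*i1/nth
        if (abs(t1) <= pi/2 .and. abs(t2) <= pi/2) cycle  ! keep oscillatory modes only
        g = 1.d0 - 2.d0*omega/(1.d0+alpha**2) * &
             & (sin(t1/2)**2 + alpha**2*sin(t2/2)**2 + tau*alpha/4.d0*sin(t1)*sin(t2))
        mu = max(mu, abs(g))
     end do
  end do
end function smoothing_factor
\end{lstlisting}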
Results from the numerical computation of (\ref{eq:mu_jac}) are shown
in Fig.~\ref{fig:jac_opt}. An analytic expression for $\tau=0$, assuming
$\alpha\le 1$, is derived in \cite[p.~119]{salpha}:
\begin{gather}
  \mu(\omega,\alpha,\tau=0) = \max\left(\left|1-2\omega\right|,
    \;\left|1-\frac{\alpha^2}{1+\alpha^2}\omega\right|\right), \nonumber\\
  \mu_{\mbox{opt}} = \frac{2+\alpha^2}{2+3\alpha^2} \quad \mbox{at} \quad
  \omega_{\mbox{opt}} = \frac{2+2\alpha^2}{2+3\alpha^2}.
\end{gather}
Notice that the smoothing factor increases as $\alpha$ departs from 1
and for increasing $\tau$.
For Gauss-Seidel relaxation, the same numerical procedure applied to
(\ref{eq:amp_gs}) yields a smoothing factor $\mu$ equal to
respectively $0.5$, $0.68$ and $0.70$ for the three cases shown in
Fig.~\ref{fig:jac_opt}, which indicates a better smoothing property
than for the damped Jacobi relaxation.
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/jac_opt}
  \caption{The smoothing factor for damped Jacobi relaxation
    for different values of $\alpha$ and $\tau$.}
  \label{fig:jac_opt}
\end{figure}
Numerical experiments with the reference case ($\alpha=0.5$, $\tau=1$,
$a(x,y)$ given by Eq.~\ref{eq:density}) and the $128\times 512$ grid,
using damped Jacobi relaxation, are shown in Table~\ref{tab:jac_opt}.
They confirm that $\omega=0.9$ is the optimum damping factor and that
the damped Jacobi relaxation is less efficient than the Gauss-Seidel
relaxation, in agreement with the Fourier analysis.
\begin{table}[htb]
  \centering
  \begin{tabular}{l c c c c c c}
    \hline
    & $\omega=0.5$ & $\omega=0.6$ & $\omega=0.7$ & $\omega=0.8$ & $\omega=0.9$ & $\omega=1.0$ \\
    \cline{2-7}
    \texttt{\textbf{DDDD}} & 12 & 10 & 9 & 8 & 7 & 15 \\
    \texttt{\textbf{NNDD}} & 12 & 11 & 9 & 8 & 7 & 18 \\
    \hline
  \end{tabular}
  \caption{The number of $V(3,3)$ cycles required to obtain
    $\mbox{\texttt{rtol}}=10^{-8}$ versus the Jacobi \emph{damping
    factor} $\omega$. The grid size is $128\times 512$ with
    $\alpha=0.5$, $\tau=1$ and $a(x,y)$ given by Eq.~\ref{eq:density}.}
  \label{tab:jac_opt}
\end{table}
\subsection{Matrix storage}
Initially the \emph{Compressed Sparse Row} storage format (CSR or CRS) (see
\cite[p.~58--59]{TEMPL}) was used to store the discretized finite
difference matrix. With this choice, the CPU time used by the matrix
construction (and boundary condition setting) is found to be always larger
than the multigrid solver time, as shown in
Fig.~\ref{fig:matcon_time}. Fortunately, by switching to the \emph{Compressed
Diagonal Storage} (CDS) format, where the 9-diagonal structure of the
matrix is fully exploited, the matrix construction time is considerably
reduced, as shown in the same figure. On the other hand, no
difference in the multigrid solver performance is noticeable
between the two matrix storage formats.
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=\hsize]{\RepFigures/matcon_time}
  \caption{CPU time used by the matrix construction for CSR and CDS
    matrix storage compared to the multigrid $V(3,3)$ cycle time for
    the \textbf{DDDD} and \textbf{NNDD} model problems. The timing is
    obtained using the same conditions as in Fig.~\ref{fig:dddd_pc220}.}
  \label{fig:matcon_time}
\end{figure}
\FloatBarrier
\section{Modified PDE}
Here, the following modified PDE is considered:
\begin{equation}
  \label{eq:new_pde}
  \left[\frac{\partial^2}{\partial x^2} + \tau\frac{\partial^2}{\partial x\partial y}
    + (1+\tau^2/4)\frac{\partial^2}{\partial y^2} - a(x,y)\right] u(x,y) = f(x,y),
  \qquad 0\le x\le L_x,\; 0\le y\le L_y.
\end{equation}
This PDE, which is obtained from the $\hat s-\alpha$ model
\cite[Eq.10]{salpha}, is \emph{elliptic} for any value of $\tau$. The
resulting stencil is changed from the previous stencil~(\ref{eq:stencil}) to
\begin{equation}
  \label{eq:new_stencil}
  S_{ij} = \frac{1}{h_x^2}
  \begin{bmatrix}
    -\tau\alpha/4 & \alpha^2(1+\tau^2/4) & \tau\alpha/4 \\
    1 & -2\left[1+\alpha^2(1+\tau^2/4)\right]-h_x^2a_{ij} & 1 \\
    \tau\alpha/4 & \alpha^2(1+\tau^2/4) & -\tau\alpha/4 \\
  \end{bmatrix}.
\end{equation}
Note that the \emph{anisotropy} of the resulting Finite Difference discretization
is now $\alpha^2(1+\tau^2/4)$ and could be controlled by adjusting both
the mesh aspect ratio $\alpha$ and the \emph{shear} term $\tau$.
Numerical calculations show that the multigrid $V$-cycles do always
converge, as shown in Table~\ref{tab:new_mixedterm}.
\begin{table}[htb]
  \centering
  \begin{tabular}{|l|l|c|c|c|c|c|c|}
    \hline
    & Grid size & $\tau=0$ & $\tau=1$ & $\tau=2$ & $\tau=4$ & $\tau=8$ & $\tau=16$ \\
    \hline
    \multirow{3}{*}{\texttt{\textbf{DDDD}}}
    & $128\times 512$  & 4 & 4 & 5 & 6 & 9  & 20 (6) \\
    & $256\times 1024$ & 4 & 4 & 5 & 6 & 11 & 25 (8) \\
    & $512\times 2048$ & 4 & 4 & 5 & 7 & 12 & 29 (8) \\
    \hline\hline
    \multirow{3}{*}{\texttt{\textbf{NNDD}}}
    & $128\times 512$  & 4 & 4 & 4 & 5 & 8  & 17 (5) \\
    & $256\times 1024$ & 4 & 5 & 5 & 5 & 8  & 19 (6) \\
    & $512\times 2048$ & 4 & 4 & 4 & 5 & 7  & 18 (6) \\
    \hline
  \end{tabular}
  \caption{Effects of the mixed derivative term $\tau$ on the
    performances of the $V(3,3)$ cycle. In these runs, $a(x,y)$ is
    given by Eq.~\ref{eq:density} and the mesh aspect ratio $\alpha=0.5$
    is used. In the last column, the numbers shown in parentheses are the
    numbers of $V(3,3)$ cycles obtained when $\alpha$ is reduced to 0.125 by
    increasing the length $L_y$ while keeping all the other values fixed.}
  \label{tab:new_mixedterm}
\end{table}
\section{Parallel Multigrid}
In order to maximize the parallel efficiency and the flexibility of
utilization, a two-dimensional domain partition scheme is chosen to
parallelize the multigrid solver. As shown below, the generalization of this
procedure to higher dimensions is straightforward.
\subsection{Distributed grid coarsening}
The coarsening algorithm can be summarized as follows (a minimal
partitioning sketch is given after the list):
\begin{itemize}
\item Partition the grid points in each dimension at the \emph{finest}
  grid level, as evenly as possible.
\item The range for each sub-grid, using \emph{global indexing}, is
  thus specified by $[s,e]$, with $s=0$ for the first sub-grid and
  $e=N$ for the last sub-grid, $N$ being the number of grid intervals.
\item The next coarse sub-grid is then obtained by discarding all the
  \emph{odd} indexed grid points, as in the serial case.
\item This process continues (as long as the total number of
  intervals is even) until a prescribed \emph{minimum}
  number of grid points on any sub-grid is reached.
\end{itemize}
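The even partitioning of the $N+1$ global grid points (indices $0{:}N$) among
$P$ processes in one dimension can be written, for instance, as below; the
routine and its argument names are illustrative and are not the ones used in
the solver.
\begin{lstlisting}[language=Fortran]
! Illustrative even partition of the global index range 0:n among p processes;
! process ip (0-based) gets the range s:e.
subroutine partition_1d(n, p, ip, s, e)
  integer, intent(in)  :: n, p, ip
  integer, intent(out) :: s, e
  integer :: npts, base, rest
  npts = n + 1                 ! number of grid points
  base = npts/p                ! minimum number of points per process
  rest = mod(npts, p)          ! first 'rest' processes get one extra point
  s = ip*base + min(ip, rest)
  e = s + base - 1
  if (ip < rest) e = e + 1
end subroutine partition_1d
\end{lstlisting}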
\subsection{Matrix-free formulation}
Using a standard \emph{matrix} to represent the discretized 2D (or higher
dimensional) operators implies a \emph{one-dimensional numbering} of
the grid nodes. For example, on a 2D $N_x\times N_y$ grid, the 1D numbering
of the node $(x_{i_1},y_{i_2})$ could be defined as
\[
  k = i_1 + i_2\times N_x, \quad i_1=0:N_x,\; i_2=0:N_y.
\]
However, using a 2D domain partition defined by
\begin{equation}
  \label{eq:2dnumber}
  i_1=s_1:e_1, \; i_2=s_2:e_2,
\end{equation}
with $s=(s_1,s_2)$ and $e=(e_1,e_2)$ denoting respectively the
\emph{starting} and \emph{ending} indices of a rectangular sub-domain,
results in a \emph{non-contiguous} set of indices ${k}$ and in a
complicated structure of the partitioned matrix for the linear operator.
On the other hand, using the \emph{stencil notation} introduced in
\cite[chap. 5.2]{Wesseling}, based on the \emph{multidimensional}
node labeling as defined by (\ref{eq:2dnumber}) for a 2D problem, one can
define a simple data structure for the partitioned operator, $A(i,\delta)$,
where the $d$-tuple $i=(i_1,\ldots,i_d)$ represents a node of the
$d$-dimensional grid and the $d$-tuple $\delta=(\delta_1,\ldots,\delta_d)$,
the \emph{distance} between the connected nodes. The result of $\mathbf{A}u$
can thus be defined as
\begin{equation}
  \label{eq:vmx}
  (\mathbf{A}u)_i = \sum_{\delta\in\mathbb{Z}^d} A(i,\delta)u_{i+\delta},
  \quad i=s:e.
\end{equation}
In (\ref{eq:vmx}), the sum is performed over all
indices $\delta$ such that $A(i,\delta)$ is non-zero. For the 2D
nine-point stencil defined in (\ref{eq:stencil}), the 2-tuple
$\delta$ can be specified as the 9 columns of the following
\emph{structure} matrix
\begin{equation}
  \label{5points}
  S_\delta = \left(
    \begin{array}{rrrrrrrrr}
      0 & -1 & 0  & 1  & -1 & 1 & -1 & 0 & 1 \\
      0 & -1 & -1 & -1 & 0  & 0 & 1  & 1 & 1 \\
    \end{array}
  \right).
\end{equation}
In the general case of a $d$-dimensional grid and an $\mathcal{N}$-point stencil,
$S_\delta$ is a $d\times\mathcal{N}$ matrix. By noting that the subscript
$i+\delta$ of $u$ on the right hand side of (\ref{eq:vmx}) should be in the
range $[0,N]$ \emph{only} for sub-domains which are \emph{adjacent} to the
boundary, one can deduce that for a \emph{fixed} $\delta$, the lower and
upper bounds of the indices $i$ should be
\begin{equation}
  \begin{split}
    i_{\mbox{min}} &= \max(0, -\delta, s), \\
    i_{\mbox{max}} &= \min(N, N-\delta, e) \\
  \end{split}
\end{equation}
where $N=(N_1,N_2,\ldots,N_d)$ specifies the number of intervals,
since, for sub-domains \emph{not adjacent} to the boundary, $u$ should
include values at the \emph{ghost} cells $s-g$ and $e+g$ where $g$ is
given by
\begin{equation}
  g = \max|S_\delta|
\end{equation}
with the operator $\max$ taken along the \emph{rows} of the matrix.
The formula defined in (\ref{eq:vmx}) can then be implemented as in
the \emph{pseudo} Fortran code
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{lstlisting}[mathescape]
do k=1,SIZE($S_{\delta}$,2)  ! loop over the stencil points
   $\delta$ = $S_{\delta}$(:,k)
   lb = MAX(0, -$\delta$, $s$)
   ub = MIN($N$, $N-\delta$, e)
   do i=lb,ub
      Au(i) = Au(i) + A(i,$\delta$)*u(i+$\delta$)
   enddo
enddo
\end{lstlisting}
\nopagebreak\hrule
\addvspace{\medskipamount}
On the other hand, if the values of $u$ at the ghost cells of the
sub-domains \emph{adjacent} to the boundary are set to 0
\begin{equation*}
  u_{-g} = u_{N+g} = 0,
\end{equation*}
the lower and upper bounds of the
inner loop can be simply set to $lb=s$ and $ub=e$. Note that the inner
loop should be interpreted as $d$ nested loops over the $d$-tuple
$i=(i_1,\ldots,i_d)$ for a $d$-dimensional problem.
\subsection{Inter-grid transfers}
\subsubsection{Restriction}
Using the definition in the first equation of (\ref{eq:2dintergrid})
together with (\ref{eq:1drestriction}), the 2D restriction operator
can be represented by the following 9-point stencil:
\begin{equation}
  \label{eq:2drestriction}
  \mathbf{R}_i = \frac{1}{16}
  \begin{pmatrix}
    1 & 2 & 1 \\
    2 & 4 & 2 \\
    1 & 2 & 1 \\
  \end{pmatrix},
\end{equation}
and the restriction of $f$ can be computed as
\begin{equation}
  \bar{f}_i = (\mathbf{R}f)_i = \sum_{\delta\in\mathbb{Z}^2} R(i,\delta)f_{2i+\delta},
  \quad i=\bar{s}:\bar{e},
\end{equation}
where $\bar{s},\bar{e}$ denote the partitioned domain boundary indices on the
\emph{coarse} grid, using the same algorithm described previously.
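A serial, unpartitioned sketch of this restriction, which ignores the ghost
cells and the boundary modifications described below, could read as follows;
the array names and bounds are illustrative assumptions.
\begin{lstlisting}[language=Fortran]
! Illustrative serial application of the 9-point restriction (eq:2drestriction):
! fbar(i1,i2) = sum over (d1,d2) of w(d1,d2)*f(2*i1+d1, 2*i2+d2).
subroutine restrict_2d(f, fbar)
  double precision, intent(in)  :: f(0:,0:)     ! fine grid rhs,   0:Nx   x 0:Ny
  double precision, intent(out) :: fbar(0:,0:)  ! coarse grid rhs, 0:Nx/2 x 0:Ny/2
  double precision, parameter :: w(-1:1,-1:1) = &
       & reshape([1.d0,2.d0,1.d0, 2.d0,4.d0,2.d0, 1.d0,2.d0,1.d0], [3,3])/16.d0
  integer :: i1, i2, d1, d2, n1, n2
  n1 = ubound(fbar,1); n2 = ubound(fbar,2)
  fbar = 0.d0
  do i2 = 0, n2
     do i1 = 0, n1
        do d2 = -1, 1
           do d1 = -1, 1
              if (2*i1+d1 < 0 .or. 2*i1+d1 > ubound(f,1)) cycle  ! skip points outside
              if (2*i2+d2 < 0 .or. 2*i2+d2 > ubound(f,2)) cycle  ! the fine grid
              fbar(i1,i2) = fbar(i1,i2) + w(d1,d2)*f(2*i1+d1, 2*i2+d2)
           end do
        end do
     end do
  end do
end subroutine restrict_2d
\end{lstlisting}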
\subsubsection{BC for the restriction operator}
\label{sec:restrict_bc}
Dirichlet boundary conditions can be imposed by modifying the
\emph{restriction stencil} on each of the four boundaries as follows:
\begin{equation}
  \mathbf{R}_{0,.} = \frac{1}{16}
  \begin{pmatrix}
    1 & 2 & 0 \\
    2 & 4 & 0 \\
    1 & 2 & 0 \\
  \end{pmatrix},
  \quad
  \mathbf{R}_{N_x,.} = \frac{1}{16}
  \begin{pmatrix}
    0 & 2 & 1 \\
    0 & 4 & 2 \\
    0 & 2 & 1 \\
  \end{pmatrix},
  \quad
  \mathbf{R}_{.,0} = \frac{1}{16}
  \begin{pmatrix}
    0 & 0 & 0 \\
    2 & 4 & 2 \\
    1 & 2 & 1 \\
  \end{pmatrix},
  \quad
  \mathbf{R}_{.,N_y} = \frac{1}{16}
  \begin{pmatrix}
    1 & 2 & 1 \\
    2 & 4 & 2 \\
    0 & 0 & 0 \\
  \end{pmatrix}.
\end{equation}
With the natural Neumann BC, no change of the restriction operator is needed.
\subsubsection{Prolongation}
Stencil notation for \emph{prolongation} operators is less obvious to
formulate, see \cite[chap. 5.2]{Wesseling}. A more straightforward
implementation is obtained in the 2D case by simply applying
\emph{bilinear interpolation} on the \emph{coarse grid}:
\begin{equation}
  \label{eq:2dprolongation}
  \begin{split}
    (\mathbf{P}\bar{u})_{2i} &= \bar{u}_{i}, \\
    (\mathbf{P}\bar{u})_{2i+e_1} &= (\bar{u}_{i} + \bar{u}_{i+e_1})/2, \quad
    (\mathbf{P}\bar{u})_{2i+e_2} = (\bar{u}_{i} + \bar{u}_{i+e_2})/2, \\
    (\mathbf{P}\bar{u})_{2i+e_1+e_2} &= (\bar{u}_{i} + \bar{u}_{i+e_1}
    + \bar{u}_{i+e_2} + \bar{u}_{i+e_1+e_2})/4. \\
  \end{split}
\end{equation}
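A serial Fortran sketch of this bilinear prolongation on a single
(unpartitioned) domain is given below; the array names and bounds are again
illustrative assumptions, and the caller is expected to add the result to the
fine-grid solution as in the $V$-cycle correction step.
\begin{lstlisting}[language=Fortran]
! Illustrative serial bilinear prolongation (eq:2dprolongation):
! the fine-grid field u is interpolated from the coarse-grid field ubar.
subroutine prolong_2d(ubar, u)
  double precision, intent(in)  :: ubar(0:,0:)  ! coarse grid, 0:Nx/2 x 0:Ny/2
  double precision, intent(out) :: u(0:,0:)     ! fine grid,   0:Nx   x 0:Ny
  integer :: i1, i2, n1, n2
  n1 = ubound(ubar,1); n2 = ubound(ubar,2)
  do i2 = 0, n2
     do i1 = 0, n1
        u(2*i1, 2*i2) = ubar(i1,i2)                             ! coincident node
        if (i1 < n1) u(2*i1+1, 2*i2) = (ubar(i1,i2) + ubar(i1+1,i2))/2.d0
        if (i2 < n2) u(2*i1, 2*i2+1) = (ubar(i1,i2) + ubar(i1,i2+1))/2.d0
        if (i1 < n1 .and. i2 < n2) &
             & u(2*i1+1, 2*i2+1) = (ubar(i1,i2) + ubar(i1+1,i2) &
             &                    + ubar(i1,i2+1) + ubar(i1+1,i2+1))/4.d0
     end do
  end do
end subroutine prolong_2d
\end{lstlisting}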
\subsection{Relaxations}
While Gauss-Seidel proves to be more efficient, the damped Jacobi
method is used, at least for a first version of the parallel multigrid solver,
because it is straightforward to \emph{parallelize}. The same
undamped Jacobi (with $\omega=1$), with a \emph{few} iterations, is also
used to solve the linear system at the coarsest
mesh, as prescribed by the multigrid $V$-cycle procedure defined in
section~\ref{sec-mgProc}.
\subsection{Local vectors and stencils}
All local vectors (used to represent solutions or
right-hand-sides) contain \emph{ghost cells} and are implemented
using 2D arrays, for example
\[
  \mbox{\texttt{sol(s(1)-1:e(1)+1, s(2)-1:e(2)+1)}}
\]
for the solution vector.
The partitioned stencils are defined
only for the \emph{local} grid points, without the ghost cells.
Thus, before each operation on the local vectors, an exchange (or
update) of the values on the ghost cells is performed.
As a result, all the memory required by the solver is completely
partitioned, except for the space used by the ghost cells.
\subsection{Numerical Experiments}
In this section, all the numerical experiments are conducted on
\texttt{helios.iferc-csc.org}, using the Intel compiler version 13.1.3
and bullxpmi-1.2.4.3. The \emph{stopping criterion} for the $V$-cycles
is based on the absolute and relative residual norms as well as the
discretization error norm as defined in section~\ref{sec:NumExp1}. In
cases where the analytic solution is not known, the latter can be
replaced by some norm of the solution.
\subsubsection{Strong scaling}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=\hsize]{\RepFigures/strong_256x1024_DDD}
  \caption{DDDD problem for a $256\times 1024$ size, using multigrid
    $V(3,3)$ cycles. Different times for a given number of
    processes are obtained with different combinations of processes in
    each dimension. The number of grid levels is fixed to 6. Five Jacobi
    iterations are used at the coarsest grid.}
  \label{fig:strong_scal_small}
\end{figure}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=\hsize]{\RepFigures/strong_512x2048_DDD}
  \caption{DDDD problem for a $512\times 2048$ size, using multigrid
    $V(3,3)$ cycles. The \textcolor{red}{red marker} on the left shows
    the time for the serial multigrid solver. Different times for a
    given number of processes are obtained with different combinations
    of processes in each dimension. The number of grid levels is fixed
    to 6. Five Jacobi iterations are used at the coarsest grid.}
  \label{fig:strong_scal}
\end{figure}
Here 2 \emph{fixed} problem sizes are considered:
\begin{itemize}
\item a small size with the (fine) grid of $256\times 1024$, shown in
  Fig.~\ref{fig:strong_scal_small}, and
\item a larger size of $512\times 2048$, in Fig.~\ref{fig:strong_scal}.
\end{itemize}
In both cases $\mbox{\tt rtol}=10^{-8}$ and $\mbox{\tt etol}=10^{-3}$. It
was checked that the results do not change when more
than 5 Jacobi iterations are used at the coarsest mesh. Notice that
for the small problem, the parallel efficiency starts to degrade at 32
MPI processes while for the larger case, this happens after 64 MPI
processes. This can be explained by
the ghost cell exchange communication overhead: denoting by $N_1$ and
$N_2$ the number of grid points in each direction and by $P_1$ and $P_2$
the number of MPI processes in each direction, the ratio $S/V$ between the
number of ghost points and interior grid points for each local
subdomain can be estimated as
\begin{equation}
  \label{eq:comm_overhead}
  S/V \simeq\frac{2(N_1/P_1+N_2/P_2)}{N_1N_2/P_1P_2} = 2\left(P_1/N_1+P_2/N_2\right).
\end{equation}
This ratio increases as the number of MPI processes increases while
keeping the problem size fixed. On very coarse grids, this communication
cost can become prohibitive. For this reason, in all the runs shown
here, the number of grid points in each direction for the coarsest
grid is limited to 2.
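As a rough illustration (assuming, for this estimate only, a $4\times 8$
process grid, which is not necessarily the decomposition used in the runs),
the small $256\times 1024$ case on 32 MPI processes gives, from
Eq.~\ref{eq:comm_overhead}, $S/V \simeq 2\,(4/256+8/1024) \approx 0.05$ on the
finest grid; since $N_1$ and $N_2$ halve at every coarsening while $P_1$ and
$P_2$ stay fixed, this ratio doubles on each coarser level, which is consistent
with the efficiency degradation observed at 32 processes for this problem size.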
\subsubsection{Weak Scaling}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/weak_DDDD}
  \caption{Weak scaling for a DDDD problem, using multigrid $V(3,3)$
    cycles. The number of grid levels is fixed to 7. The solver for the
    coarsest grid uses 5 Jacobi iterations except for the 2 largest cases,
    which require respectively 20 and 100 iterations to converge. The 2
    sets of curves on the right figure show respectively the timings with
    and without the calculations of the residual norm and discretization
    error, which both require a \emph{global reduction}.}
  \label{fig:weak_scal_DDDD}
\end{figure}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/weak_NNDD}
  \caption{Weak scaling for a NNDD problem, using multigrid $V(3,3)$
    cycles. The number of grid levels is fixed to 7. The solver for the
    coarsest grid uses 5 Jacobi iterations except for the 2 largest cases,
    which require respectively 20 and 100 iterations to converge. The 2
    sets of curves on the right figure show respectively the timings with
    and without the calculations of the residual norm and discretization
    error, which both require a \emph{global reduction}.}
  \label{fig:weak_scal_NNDD}
\end{figure}
According to Eq.~\ref{eq:comm_overhead}, varying the problem size
together with the number of MPI processes while keeping $N_1/P_1$ and
$N_2/P_2$ constant should yield a \emph{constant scaling},
provided that the convergence rate does not depend on the problem sizes.
The results for the \texttt{DDDD} and \texttt{NNDD} problems are shown
in Fig.~\ref{fig:weak_scal_DDDD} and
Fig.~\ref{fig:weak_scal_NNDD}. The left part of the figures shows that
the convergence rate depends only weakly on the problem sizes, which
indeed leads to an (almost) constant time for numbers of MPI
processes $P$ between 16 and 1024. The reason for the good timings
for smaller $P$ is simply that there are only 2 ghost cell exchanges
for $P=2\times 2$ (instead of 4 for $P\ge 16$) and that there is no
exchange for $P=1$.
\section{Non-homogeneous Boundary Conditions}
\subsection{Non-homogeneous Dirichlet Conditions}
Non-homogeneous Dirichlet boundary conditions can be imposed on all the
Dirichlet faces simply by \emph{clearing}, as for the
\emph{homogeneous case}, the matrix rows and columns and setting the
diagonal terms to 1. Moreover, the corresponding
\emph{right-hand-side} should be set to:
\begin{equation}
  \begin{split}
    f_{0,j} &= D^W(y_j), \quad f_{N_x,j} = D^E(y_j), \qquad j=0,\ldots,N_y, \\
    f_{i,0} &= D^S(x_i), \quad f_{i,N_y} = D^N(x_i), \qquad i=0,\ldots,N_x, \\
  \end{split}
\end{equation}
where $D^W, D^E, D^S, D^N$ are the values of $u$ at the 4 Dirichlet
faces. As for the homogeneous Dirichlet BC, the \emph{restriction}
operator should be changed as described in section~\ref{sec:restrict_bc}
while the \emph{prolongation} defined
in (\ref{eq:2dprolongation}) remains unchanged.
\subsection{Non-homogeneous Neumann Conditions}
The non-homogeneous Neumann conditions at the 4 faces can be
defined as
\begin{equation}
  \begin{split}
    \left.\frac{\partial u}{\partial x}\right|_{x=0} &= N^W(y), \quad
    \left.\frac{\partial u}{\partial x}\right|_{x=L_x} = N^E(y), \\
    \left.\frac{\partial u}{\partial y}\right|_{y=0} &= N^S(x), \quad
    \left.\frac{\partial u}{\partial y}\right|_{y=L_y} = N^N(x). \\
  \end{split}
\end{equation}
Discretization of the BC defined above, using the \emph{central
difference}, yields on the 4 faces
\begin{equation}
  \begin{split}
    u_{-1,j} &= u_{1,j} -2h_xN^W(y_j), \quad
    u_{N_x+1,j} = u_{N_x-1,j} + 2h_xN^E(y_j), \qquad j=0,\ldots,N_y, \\
    u_{i,-1} &= u_{i,1} -2h_yN^S(x_i), \quad
    u_{i,N_y+1} = u_{i,N_y-1} + 2h_yN^N(x_i), \qquad i=0,\ldots,N_x. \\
  \end{split}
\end{equation}
With these relations, the stencil (\ref{eq:new_stencil}) on the 4
boundaries is modified as follows
\begin{equation}
  \begin{split}
    S^W &= \frac{1}{h_x^2}
    \begin{bmatrix}
      0 & \alpha^2(1+\tau^2/4) & 0 \\
      0 & -2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{0,j} & 2 \\
      0 & \alpha^2(1+\tau^2/4) & 0 \\
    \end{bmatrix},
    \quad
    S^E = \frac{1}{h_x^2}
    \begin{bmatrix}
      0 & \alpha^2(1+\tau^2/4) & 0 \\
      2 & -2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{N_x,j} & 0 \\
      0 & \alpha^2(1+\tau^2/4) & 0 \\
    \end{bmatrix}, \\
    S^S &= \frac{1}{h_x^2}
    \begin{bmatrix}
      0 & 2\alpha^2(1+\tau^2/4) & 0 \\
      1 & -2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{i,0} & 1 \\
      0 & 0 & 0 \\
    \end{bmatrix},
    \quad
    S^N = \frac{1}{h_x^2}
    \begin{bmatrix}
      0 & 0 & 0 \\
      1 & -2(1+\alpha^2(1+\tau^2/4))-h_x^2a_{i,N_y} & 1 \\
      0 & 2\alpha^2(1+\tau^2/4) & 0 \\
    \end{bmatrix}, \\
  \end{split}
\end{equation}
while the right-hand-side should be changed according to
\begin{equation}
  \begin{split}
    f_{0,j} &\longleftarrow f_{0,j} + \frac{2}{h_x}\left[
      \frac{\tau\alpha}{4}\,N^W(y_{j-1}) + N^W(y_j)
      - \frac{\tau\alpha}{4}\,N^W(y_{j+1})\right], \\
    f_{N_x,j} &\longleftarrow f_{N_x,j} + \frac{2}{h_x}\left[
      \frac{\tau\alpha}{4}\,N^E(y_{j-1}) - N^E(y_j)
      - \frac{\tau\alpha}{4}\,N^E(y_{j+1})\right], \\
    f_{i,0} &\longleftarrow f_{i,0} + \frac{2}{h_y}\left[
      \frac{\tau}{4\alpha}\,N^S(x_{i-1}) + (1+\tau^2/4)N^S(x_i)
      - \frac{\tau}{4\alpha}\,N^S(x_{i+1})\right], \\
    f_{i,N_y} &\longleftarrow f_{i,N_y} + \frac{2}{h_y}\left[
      \frac{\tau}{4\alpha}\,N^N(x_{i-1}) - (1+\tau^2/4)N^N(x_i)
      - \frac{\tau}{4\alpha}\,N^N(x_{i+1})\right]. \\
  \end{split}
\end{equation}
\subsection{The NNDD test problem}
In order to test the discretization of the non-homogeneous boundary
conditions as formulated above, a test problem with the prescribed
\emph{exact} solution
\begin{equation}
  u(x,y) = 1 + \sin\frac{2\pi k_x x}{L_x}\sin\frac{2\pi k_y y}{L_y},
  \qquad \mbox{where $k_x$, $k_y$ are positive integers}
\end{equation}
and the following non-homogeneous boundary conditions
\begin{equation}
  \begin{split}
    \left.\frac{\partial u}{\partial x}\right|_{x=0} =
    \left.\frac{\partial u}{\partial x}\right|_{x=L_x} &=
    k_x\sin\frac{2\pi k_y y}{L_y}, \\
    u(x,0) = u(x,L_y) &= 1, \\
  \end{split}
\end{equation}
is solved with varying grid spacing. The discretization errors versus
the number of grid intervals $N_x$ displayed in Fig.~\ref{fig:conv_nh_bc}
show a \emph{quadratic} convergence, as expected from the second order
finite differences used in both the PDE and the Neumann boundary condition
discretization.
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/conv_nh_bc}
  \caption{Convergence of the error $\|u_{calc}-u_{anal}\|_\infty$ wrt the
    number of intervals in the $x$ direction $N_x$ for the non-homogeneous NNDD
    problem. Here, $L_x=100$, $L_y=800$, $k_x=k_y=4$, $\tau=1$ and $N_y=4N_x$.}
  \label{fig:conv_nh_bc}
\end{figure}
As shown in Fig.~\ref{fig:nndd_nh}, the multigrid $V$-cycles for the
\emph{non-homogeneous} problem converge with a slightly smaller efficiency
than for the \emph{homogeneous} problem shown in Fig.~\ref{fig:weak_scal_NNDD}.
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.8\hsize]{\RepFigures/nndd_nh}
  \caption{Performances of the $V(3,3)$-cycle for the non-homogeneous
    NNDD problem. The same parameters as in Fig.~\ref{fig:conv_nh_bc}
    are used here.}
  \label{fig:nndd_nh}
\end{figure}
\FloatBarrier
\subsection{Local relaxation methods}
In addition to the damped Jacobi, three relaxation methods are
added to this parallel multigrid solver:
\begin{enumerate}
\item The 4 color Gauss-Seidel method (RBGS).
\item The Gauss-Seidel method (GS).
\item The successive over-relaxation method (SOR).
\end{enumerate}
In order to apply the parallel 4 color Gauss-Seidel correctly, a
complicated ghost cell exchange has to be performed for each sweep of
each color. Here we simply apply the method \emph{locally} on each
subdomain, with only one ghost exchange performed at the beginning of
each relaxation.
The same procedure is also used for the other 2 methods, which are
inherently \emph{serial}. All these 3 relaxations are
thus only correct if there is only one subdomain. As a
consequence, while the damped Jacobi does not depend on the
partition of the subdomains, results from these 3 methods do depend on
how the domain is partitioned.
Table~\ref{tab:advrelax} shows, however, that all of the 3
\emph{approximate} relaxation methods produce a much \emph{faster}
convergence rate than the damped Jacobi relaxations for the NNDD test
problem considered here. The performance of the implemented solver
using the 4 relaxation methods on HELIOS is compared in
Fig.~\ref{fig:weakhelios}. The bad performance of the 4 color
Gauss-Seidel relaxations (RBGS) can be explained by the 4 nested loops
required to sweep each of the 4 colors.
\begin{table}[hbt]
  \centering
  \begin{tabular}{|l||r|r|r|r|r|r|}
    \hline
    Grid Sizes & $256\times 1024$ & $512\times 2048$ & $1024\times 4096$
               & $2048\times 8192$ & $4096\times 16384$ & $8192\times 32768$ \\
    \hline
    Process topology & \multicolumn{1}{c|}{$1\times 1$} & \multicolumn{1}{c|}{$2\times 2$}
                     & \multicolumn{1}{c|}{$4\times 4$} & \multicolumn{1}{c|}{$8\times 8$}
                     & \multicolumn{1}{c|}{$16\times 16$} & \multicolumn{1}{c|}{$32\times 32$} \\
    \hline
    Jacobi $\omega=0.9$ & 0.22 & 0.24 & 0.24 & 0.24 & 0.24 & 0.25 \\
    RBGS                & 0.05 & 0.07 & 0.10 & 0.10 & 0.12 & 0.12 \\
    GS                  & 0.07 & 0.08 & 0.10 & 0.11 & 0.11 & 0.12 \\
    SOR $\omega=1.2$    & 0.04 & 0.05 & 0.07 & 0.07 & 0.07 & 0.08 \\
    \hline
  \end{tabular}
  \caption{Reduction factor for the residuals (obtained as the \emph{geometric
    mean} of all its values except the first 2 values) for the non-homogeneous
    NNDD test problem. The same parameters as in Fig.~\ref{fig:conv_nh_bc} are
    used here.}
  \label{tab:advrelax}
\end{table}
\begin{figure}[htb]
  \centering
  \includegraphics[angle=0,width=0.9\hsize]{\RepFigures/weak_helios}
  \caption{Performance of the 4 relaxations on the non-homogeneous
    NNDD problem. The same parameters as in Fig.~\ref{fig:conv_nh_bc}
    are used here. The grid sizes used in this \emph{weak scaling}
    run are shown in Table~\ref{tab:advrelax}.}
  \label{fig:weakhelios}
\end{figure}
\FloatBarrier
\section{Performance of the Stencil Kernel on different platforms}
To get a feeling for the performance obtained on the different
platforms and for how well the compilers (with their
auto-vectorization capability) support them, the following Fortran
\emph{9-point stencil} kernel has been used. OpenMP directives are
used for parallelization on both the Xeon and the Xeon Phi, while
offloading to the GPU card is done via the high level OpenACC
directives. \emph{First touch} is applied in the initialization of
\texttt{x} and \texttt{mat}.
\begin{lstlisting}[language=Fortran,numbers=left,commentstyle=\color{blue},keywordstyle=\color{red},frame=single]
!$omp parallel do private(ix,iy)
!$acc parallel loop present(mat,x,y) private(ix,iy)
  DO iy=0,ny
     DO ix=0,nx
        y(ix,iy) = mat(ix,iy,1)*x(ix-1,iy-1) &
             &   + mat(ix,iy,2)*x(ix, iy-1)  &
             &   + mat(ix,iy,3)*x(ix+1,iy-1) &
             &   + mat(ix,iy,4)*x(ix-1,iy)   &
             &   + mat(ix,iy,0)*x(ix,iy)     &
             &   + mat(ix,iy,5)*x(ix+1,iy)   &
             &   + mat(ix,iy,6)*x(ix-1,iy+1) &
             &   + mat(ix,iy,7)*x(ix, iy+1)  &
             &   + mat(ix,iy,8)*x(ix+1,iy+1)
     END DO
  END DO
!$acc end parallel loop
!$omp end parallel do
\end{lstlisting}
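Note that the \texttt{present(mat,x,y)} clause assumes that the arrays
have already been copied to the GPU. A minimal sketch of an enclosing
OpenACC data region providing this (not part of the kernel above, and
possibly different from the actual benchmark driver) is:
\begin{lstlisting}[language=Fortran,commentstyle=\color{blue},keywordstyle=\color{red},frame=single]
!$acc data copyin(mat,x) copyout(y)
  ! ... repeated applications of the stencil kernel above ...
!$acc end data
\end{lstlisting}
In this way, the host--device transfers are paid only once instead of
at every application of the stencil.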
The performances on a Helios dual processor node and on its attached Xeon
Phi co-processor are shown in Fig.~\ref{fig:cpu_mic}, while the
performances on a Cray XC30 CPU and on its attached NVIDIA graphics card
are shown in Fig.~\ref{fig:cpu_gpu}. In these figures, the Intel
optimization flag \texttt{-O3} and the default Cray optimization were
applied. In Fig.~\ref{fig:cpu_mic_O1_O3}, the speedup due to
vectorization is shown by comparing the performances obtained with
\texttt{-O3} and \texttt{-O1}.
Several observations can be drawn from these results.
\begin{itemize}
\item The parallel scaling using OpenMP is linear for both the Intel and
  Cray compilers when the problem fits into the 20MB cache of
  the Sandybridge processor. For grid sizes smaller than $32\times 8$, the
  overhead of thread creation dominates. When the memory footprint is
  larger than the cache, 4 threads per socket already saturate the memory
  bandwidth.
\item On the MIC, the parallel speedup scales linearly up to 60
  cores with 1 thread per core. Using 2 or 3 threads per core does not
  help, while with 4 threads the performance even degrades.
\item The MIC, used in the Intel \emph{mic native} mode, does not
  perform better than 8 cores of the Sandybridge processor.
\item Since the benefit from \emph{vectorization} is quite large for
  the MIC (see Fig.~\ref{fig:cpu_mic_O1_O3}), the poor parallel
  scalability may be explained by the low flop intensity per thread
  coupled with the high overhead of the (many) thread creations and
  thread synchronizations.
\item The NVIDIA card, using the high level OpenACC programming
  style, is more than 3 times faster than 8 Sandybridge cores for grid
  sizes larger than $1024\times 256$. For smaller sizes, there are not
  enough flops to keep the GPU threads busy.
\end{itemize}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/cpu_mic}
\caption{Performance on the Helios dual processor (left) using the
  \texttt{-O3} compiler option and on the MIC (right),
  using the native mode \texttt{-mmic}.}
\label{fig:cpu_mic}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/cpu_gpu}
\caption{Performance on a Cray XC30 single 8 core processor node
  (left) and the NVIDIA card (right) using OpenACC. Default Cray
  Fortran compiler optimization has been used on both runs.}
\label{fig:cpu_gpu}
\end{figure}
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/cpu_mic_O1_O3}
\caption{Performance comparison between using \texttt{-O3} and
  \texttt{-O1}, on the Helios dual processor (left) and on the MIC (right).}
\label{fig:cpu_mic_O1_O3}
\end{figure}
\FloatBarrier
\section{Hybrid MPI+OpenMP \texttt{PARMG} (r599)}
In this version, a \emph{straightforward} parallelization is
done in the subroutines \texttt{jacobi, residue, prolong, restrict}
and \texttt{norm\_vec}, using the OpenMP work sharing directives. The
ghost cell exchange is executed by the \emph{master} thread.
All the 2D arrays (solutions, RHS, etc.) are allocated and initialized
\emph{once} by the \emph{master} thread. Dynamic array allocations
during the multigrid $V$-cycles are thus avoided.
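A minimal sketch of such a work-shared damped Jacobi sweep, with the
ghost cell exchange done by the master thread, is shown below (the
names \texttt{exchange}, \texttt{omega}, \texttt{f}, \texttt{x} and
\texttt{xnew} are hypothetical; the actual \texttt{PARMG} routines may
be organized differently):
\begin{lstlisting}[language=Fortran,commentstyle=\color{blue},keywordstyle=\color{red},frame=single]
!$omp parallel private(ix,iy)
!$omp master
  CALL exchange(x)     ! ghost cell exchange by the master thread only
!$omp end master
!$omp barrier          ! all threads wait for the updated ghost cells
!$omp do
  DO iy=0,ny
     DO ix=0,nx
        ! damped Jacobi update using the 9-point stencil coefficients mat(ix,iy,0:8)
        xnew(ix,iy) = (1.0d0-omega)*x(ix,iy) + omega/mat(ix,iy,0) *        &
             &   ( f(ix,iy)                                                &
             &   - mat(ix,iy,1)*x(ix-1,iy-1) - mat(ix,iy,2)*x(ix,iy-1)     &
             &   - mat(ix,iy,3)*x(ix+1,iy-1) - mat(ix,iy,4)*x(ix-1,iy)     &
             &   - mat(ix,iy,5)*x(ix+1,iy)   - mat(ix,iy,6)*x(ix-1,iy+1)   &
             &   - mat(ix,iy,7)*x(ix,iy+1)   - mat(ix,iy,8)*x(ix+1,iy+1) )
     END DO
  END DO
!$omp end do
!$omp end parallel
\end{lstlisting}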
To help further optimization, timings are introduced for each of the 4
multigrid components \texttt{jacobi, residue, prolong, restrict}, for
the ghost cell \texttt{exchange}, as well as for the \emph{recursive}
subroutine \texttt{mg}. Since the timings of the 4 MG components
already include the calls to \texttt{exchange}, the time obtained for
\texttt{mg} should be equal to the sum of the 4 MG components and
the \emph{extras} time, which accounts for the operations in
\texttt{mg} that are not in the 4 components:
\begin{equation}
\label{eq:timings}
t_\text{mg} = t_\text{jacobi} + t_\text{residue} + t_\text{prolong}
            + t_\text{restrict} + t_\text{extras}.
\end{equation}
We will see in the following sections that, in addition to these 5
contributions to $t_\text{mg}$, there is an \emph{overhead}, probably
due to the \emph{recursive} calls of \texttt{mg}.
\subsection{Parallel efficiency on a single node}
The comparison in Fig.~\ref{fig:single_node} shows that the pure
OpenMP version is at most $30\%$ slower than the pure MPI version when
all 16 cores are used, but less than $10\%$ slower when only one socket
is used. The degradation of the OpenMP version can be explained by
\emph{numa} effects when the 2 sockets are used. It is also observed
that the performance levels off at 4 cores, due to the
saturation of the socket memory bandwidth.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.8\hsize]{\RepFigures/single_node}
\caption{Parallel performance of the 7 level $V(3,3)$-cycle on a
  dual socket Helios node ($2\times 8$ cores) for pure OpenMP and pure
  MPI. The non-homogeneous NNDD problem with the same parameters as in
  Fig.~\ref{fig:conv_nh_bc} is considered here. The OpenMP threads and MPI
  tasks are placed first on the first socket before filling the
  second socket, using the \texttt{srun} option
  ``\texttt{--cpu\_bind=cores -m block:block}'' and the environment
  variable \texttt{OMP\_PROC\_BIND=true}.}
\label{fig:single_node}
\end{figure}
\subsection{Hybrid efficiency on multi-nodes}
In the following multi-node experiments, all the 16 cores of each
Helios node are utilized. The number of OpenMP threads \emph{per} MPI
\emph{process} NT, the number of MPI processes \emph{per node} NP, the
number of nodes NNODES and the \emph{total} number of MPI processes
$\text{NP}_{tot}$ thus satisfy the following relations:
\begin{equation}
\begin{gathered}
1 \le \text{NT} \le 16, \qquad 1 \le \text{NP} \le 16 \\
\text{NT}\times\text{NP} = 16 \\
\text{NP}_{tot} = 16\times\text{NNODES}/\text{NT}
\end{gathered}
\end{equation}
For example, with $\text{NNODES}=4$ and $\text{NT}=2$, one has
$\text{NP}=8$ and $\text{NP}_{tot}=32$.
The times of the different MG components and the relative
contributions for the \emph{strong scaling} experiments, using a
$1024\times 4096$ grid size, are shown in
Fig.~\ref{fig:hybrid_strong} and Fig.~\ref{fig:hybrid_strong_contrib}
respectively. The following observations can be made:
\begin{enumerate}
\item The \texttt{exchange} time increases strongly with increasing
  NNODES, due to the smaller partitioned subdomains and thus their larger
  surface/volume ratio.
\item Compared to pure MPI (NT=1), the \texttt{exchange} time is on the
  other hand reduced for $\text{NT}>1$, since the local partitioned grid
  becomes larger.
\item However, the less efficient OpenMP parallelization (numa effects,
  Amdahl's law) tends to limit this advantage.
\item As a result, there is an optimal NT for a given NNODES:
  2 for 4 and 16 nodes, 8 for 64 nodes.
\item The \texttt{jacobi} and \texttt{residue} contributions dominate
  largely, with $0.63\le t_\text{jacobi}/t_\text{mg}\le 0.83$ and
  $0.09\le t_\text{residue}/t_\text{mg}\le 0.18$.
\item The \emph{overhead} (see Eq.~\ref{eq:timings}) times increase
  with NNODES but decrease slightly for increasing NT.
\end{enumerate}
The times of the different MG components and the relative
contributions for the \emph{weak scaling} experiments are shown in
Fig.~\ref{fig:hybrid_weak} and Fig.~\ref{fig:hybrid_weak_contrib}
respectively. The following observations can be made:
\begin{enumerate}
\item A steady increase of the MG times with the number of nodes can
  be attributed to the increase of the ghost cell \texttt{exchange} time,
  even though the amount of communication between nodes does not
  change.
\item The MG performance improves slightly for NT=2 but drops
  drastically for NT=16. This seems to indicate that \emph{numa}
  effects are important here, since the array initialization
  is not done locally on each thread.
\item The \emph{overhead} (see Eq.~\ref{eq:timings}) times are much
  smaller than in the \emph{strong scaling} runs.
\end{enumerate}
\begin{sidewaysfigure}
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_strong}
\caption{Detailed timings for the strong scaling experiments using the same
  problem parameters as in Fig.~\ref{fig:single_node}, except that 5
  levels are chosen in order to make the 64 node runs possible.}
\label{fig:hybrid_strong}
\end{sidewaysfigure}
\begin{sidewaysfigure}
%\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_strong_contrib}
\caption{Relative contributions of each of the MG components for the
  strong scaling experiments using the same
  problem parameters as in Fig.~\ref{fig:single_node}.}
\label{fig:hybrid_strong_contrib}
%\end{figure}
\end{sidewaysfigure}
\begin{sidewaysfigure}
%\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_weak}
\caption{Detailed timings for the weak scaling experiments using the same
  problem parameters as in Fig.~\ref{fig:single_node}.}
\label{fig:hybrid_weak}
%\end{figure}
\end{sidewaysfigure}
\begin{sidewaysfigure}
%\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=\hsize]{\RepFigures/hybrid_weak_contrib}
\caption{Relative contributions of each of the MG components for the
  weak scaling experiments using the same
  problem parameters as in Fig.~\ref{fig:single_node}.}
\label{fig:hybrid_weak_contrib}
%\end{figure}
\end{sidewaysfigure}
Finally, Table~\ref{tab:memory} shows that using $\text{NT}>1$
decreases the memory needed by the multigrid procedure for both the
strong and weak scaling runs.
\begin{table}[htb]
\begin{center}
\begin{tabular}{|l|c||r|r|r|r|r|}
\hline
 & \texttt{\textbf{NNODES}} & NT=1 & NT=2 & NT=4 & NT=8 & NT=16 \\
\hline
\multirow{4}{*}{\texttt{\textbf{Strong Scaling}}}
 &  1 & 57.01 & 53.70 & 52.04 & 51.06 & 48.57 \\
 &  4 & 21.87 & 21.49 & 15.75 & 13.87 & 13.11 \\
 & 16 & 13.82 &  8.52 &  5.75 &  5.69 &  4.00 \\
 & 64 & 13.93 &  6.80 &  3.73 &  2.32 &  1.48 \\
\hline\hline
\multirow{4}{*}{\texttt{\textbf{Weak Scaling}}}
 &  1 & 57.03 & 53.71 & 52.04 & 51.08 & 48.61 \\
 &  4 & 59.24 & 59.18 & 53.39 & 51.52 & 48.69 \\
 & 16 & 60.48 & 55.33 & 52.69 & 52.74 & 49.06 \\
 & 64 & 63.30 & 56.13 & 53.30 & 51.58 & 48.79 \\
\hline
\end{tabular}
\end{center}
\caption{Memory footprint \emph{per core} (MB/core) for the strong
  scaling and weak scaling experiments.}
\label{tab:memory}
\end{table}
\subsection{Summary and conclusions}
The \emph{strong scaling} and \emph{weak scaling} with respect to NT and
NNODES are summarized in Fig.~\ref{fig:scaling}. The speedup for the
strong scaling experiments shows a good efficiency up to 16 nodes
for all NT but degrades at 64 nodes (1024 cores), due to the partitioned
grid becoming too small. A good \emph{weak scaling} is also
obtained, with an increase in $t_\text{mg}$ of
less than $10\%$ when NNODES varies from 4 to 64. However, for NT=16, the
efficiency drops significantly, due to the non-local memory access
when the OpenMP threads are placed on both sockets (\emph{numa} effect).
In order to improve the hybrid MPI+OpenMP multigrid, especially for a
large number of threads per MPI process NT, the following optimizations
should be done:
\begin{itemize}
\item \emph{First touch} array initialization in order to avoid
  \emph{numa} effects (a sketch is given after this list).
\item OpenMP parallelization of some remaining \emph{serial} loops.
\item Better vectorization of inner loops.
\end{itemize}
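A minimal sketch of such a \emph{first touch} initialization under
OpenMP is shown below (the array name \texttt{x} is generic; the actual
\texttt{PARMG} allocation code may differ). Since the memory pages of
\texttt{x} are mapped to the socket of the thread that first writes
them, using the same loop schedule in the initialization as in the
compute loops keeps the accesses local:
\begin{lstlisting}[language=Fortran,commentstyle=\color{blue},keywordstyle=\color{red},frame=single]
!$omp parallel do private(ix,iy)
  DO iy=0,ny
     DO ix=0,nx
        x(ix,iy) = 0.0d0   ! each thread first touches the rows it will later update
     END DO
  END DO
!$omp end parallel do
\end{lstlisting}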
The outcome of these optimization steps is important for running
efficiently on upcoming \emph{multicore} processors and
\emph{manycore} (MIC) devices.
\begin{figure}[htb]
\centering
\includegraphics[angle=0,width=0.9\hsize]{\RepFigures/scaling}
\caption{Strong scaling with a $1024\times 4096$ grid size (left) and
  weak scaling (right) with grid sizes $1024\times 4096$,
  $2048\times 8192$, $4096\times 16384$ and $8192\times 32768$
  respectively for 1, 4, 16 and 64 nodes.}
\label{fig:scaling}
\end{figure}
\FloatBarrier
\pagebreak
\begin{thebibliography}{99}
\bibitem{Briggs} W.L.~Briggs, V.E.~Henson and S.F.~McCormick, A
  Multigrid Tutorial, Second Edition, SIAM (2000).
\bibitem{MUMPS} \url{http://graal.ens-lyon.fr/MUMPS/}.
\bibitem{MG1D} {\tt Multigrid Formulation for Finite Elements},\\
  \url{https://crppsvn.epfl.ch/repos/bsplines/trunk/multigrid/docs/multigrid.pdf}.
\bibitem{TEMPL} R. Barrett, M. Berry, T. F. Chan, J. Demmel, J. Donato,
  J. Dongarra, V. Eijkhout, R. Pozo, C. Romine and H. Van der Vorst,
  Templates for the Solution of Linear Systems: Building Blocks for
  Iterative Methods, 2nd Edition, SIAM (1994).
\bibitem{Wesseling} P.~Wesseling, An Introduction to Multigrid
  Methods, Edwards (2004).
\bibitem{salpha} X. Lapillonne, S. Brunner, T. Dannert, S. Jolliet,
  A. Marinoni et al., Phys. Plasmas 16, 032308 (2009).
\end{thebibliography}
\end{document}