%
% @file using_bsplines.tex
%
% @brief
%
% @copyright
% Copyright (©) 2021 EPFL (Ecole Polytechnique Fédérale de Lausanne)
% SPC (Swiss Plasma Center)
%
% SPClibs is free software: you can redistribute it and/or modify it under
% the terms of the GNU Lesser General Public License as published by the Free
% Software Foundation, either version 3 of the License, or (at your option)
% any later version.
%
% SPClibs is distributed in the hope that it will be useful, but WITHOUT ANY
% WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
% FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
%
% You should have received a copy of the GNU Lesser General Public License
% along with this program. If not, see <https://www.gnu.org/licenses/>.
%
% @author
% (in alphabetical order)
% @author Trach-Minh Tran <trach-minh.tran@epfl.ch>
%
\documentclass[a4paper]{article}
\usepackage{linuxdoc-sgml}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{amsmath}
%\usepackage{verbatim}
%\usepackage[notref]{showkeys}
\title{\tt Using BSPLINES in Particle Codes}
\author{Trach-Minh Tran}
\date{v0.1, March 2012}
\abstract{These notes present some practical considerations on using
BSPLINES in particle codes, in particular for the charge or current
assignment as well as the field interpolation. Performance
measurements are done on an Intel Xeon X5570 and the more recent
Xeon E5-2680.}
\begin{document}
\maketitle
%\tableofcontents
\section{Introduction}
For simplicity, we assume in these notes that we are dealing with a
2D electrostatic particle code and the 2D Poisson equation is to be solved
using the Finite Element Method. Starting from the \emph{weak form}
and using \emph{splines} for both \emph{basis} and \emph{test}
functions, the electrostatic potential together with its
gradient and the right-hand side can be computed from
\begin{equation}
\begin{split}
\phi(x,y) &=
\sum_{ij}\,c_{ij}\,\Lambda_i(x)\Lambda_j(y) \\
\frac{\partial\phi}{\partial x} &=
\sum_{ij}\,c_{ij}\,\Lambda'_i(x)\Lambda_j(y) \\
\frac{\partial\phi}{\partial y} &=
\sum_{ij}\,c_{ij}\,\Lambda_i(x)\Lambda'_j(y) \\
S_{ij} &= \sum_{\mu=1}^{N_p}\, q_\mu\Lambda_i(x_\mu)\Lambda_j(y_\mu),
\end{split}
\end{equation}
where $c_{ij}$ are the solutions of the discretized Poisson equation
and $\{x_\mu,y_\mu\}$ are the coordinates of the $N_p$ simulation
particles. At each time step, the calculation of both the field $\phi$
and its gradient (\emph{field interpolation}) for the particle
pusher and the construction of the RHS $S_{ij}$ (\emph{charge
assignment}) thus involve the computation of a large number of
splines $\Lambda$ and their derivatives $\Lambda'$.
Notice that the construction of the solver matrix also requires
the calculation of the splines. This operation is, however, performed only
once at the initial time step in the (most common) case where the
matrix is time independent, and is thus not considered further in
these notes.
\section{Computation of splines}
Let us consider the grid defined by $x_i$, $i=1,\ldots,N+1$. Inside the
interval $[x_i, x_{i+1}]$, the $p+1$ non-zero splines of degree $p$
can be computed efficiently using their polynomial representation given
by
\begin{equation}
\begin{split}
\Lambda_{i+\alpha}(x) &= \sum_{k=0}^{p}\, V^{i}_{k\alpha}
(x-x_i)^k, \qquad \alpha=0,\ldots,p, \\
V^i_{k\alpha} &=
\left.\frac{1}{k!}\frac{d^k}{dx^k}\Lambda_{i+\alpha}(x)\right|_{x=x_i}.
\end{split}
\end{equation}
The $(p+1)^2N$ coefficients $V^i_{k\alpha}$ are precalculated and stored
during the spline initialization (in routine {\tt SET\_SPLINE}) by
using the \emph{recurrence relation} \cite{BSPLINES} to compute the spline and all its
$p$ derivatives. Note that for periodic splines on an equidistant
mesh, only $(p+1)^2$ coefficients $V_{k\alpha}$ are required since
the splines have \emph{translational invariance}.
For a polynomial $P(x)=a_0+a_1x+\ldots +a_px^p$, its value can be
calculated together with its first derivative, using Horner's rule, as:
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
f  = a(p)            ! running value of P(x)
fp = f               ! running value of P'(x)
DO i=p-1,1,-1
   f  = a(i) + x*f
   fp = f  + x*fp
END DO
f = a(0) + x*f       ! final step: value only
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}
showing that exactly $4p-2$ floating-point operations (flops) per point are
required. If only the value of the polynomial is needed, $2p$ flops
per point suffice.
\section{Field interpolation}
\subsection{1D case}
Let us first consider the 1D case. The spline expansion of $\phi$ for
$x_i\le x < x_{i+1}$ is expressed as
\begin{equation}
\phi(x) = \sum_{\alpha=0}^pc_{i+\alpha}\Lambda_{i+\alpha}(x).
\end{equation}
To calculate the field using this spline expansion, the $p+1$ splines
have to be computed first, followed by the sum above, which
yields a total cost of $2(p+1)^2\sim 2p^2$ flops per point. This cost
can be reduced by observing that
$\phi(x)$ is a \emph{piecewise polynomial} (PP) of degree $p$ in
each interval. Its PP coefficients can be obtained from
\begin{equation}
\begin{split}
\phi(x) &= \sum_{\alpha=0}^pc_{i+\alpha}\sum_{k=0}^{p}\, V^{i}_{k\alpha}
(x-x_i)^k \\
&= \sum_{k=0}^{p}\, \Pi^{i}_{k}(x-x_i)^k, \qquad
\Pi^{i}_{k}= \sum_{\alpha=0}^p c_{i+\alpha} V^{i}_{k\alpha}.
\end{split}
\end{equation}
Once the $N(p+1)$ PP coefficients $\Pi^{i}_{k}$ have been calculated
from the spline expansion coefficients $c_{i+\alpha}$, only $2p$ flops per
point are required to obtain the field value, using Horner's rule
as described previously.
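As a minimal Fortran sketch (array names and layout are illustrative
and do not reflect the actual BSPLINES internals), the conversion of
the spline coefficients into PP coefficients and the subsequent
Horner evaluation could look like:
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
! Build the PP coefficients pi(0:p,1:N) from the spline
! coefficients c(1:N+p) and the precomputed v(0:p,0:p,1:N)
DO i=1,n
   DO k=0,p
      pi(k,i) = 0.0d0
      DO alpha=0,p
         pi(k,i) = pi(k,i) + c(i+alpha)*v(k,alpha,i)
      END DO
   END DO
END DO
!
! Evaluate phi at x in [xgrid(i),xgrid(i+1)]: 2p flops per point
dx = x - xgrid(i)
f = pi(p,i)
DO k=p-1,0,-1
   f = pi(k,i) + dx*f
END DO
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}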
\subsection{2D case}
Extension of the spline expansion and the PP representation to
$\phi(x,y)$ is straightforward and yields, for $x_i\le x < x_{i+1}$,
$y_j\le y < y_{j+1}$:
\begin{equation}
\begin{split}
\phi(x,y) &= \sum_{\alpha=0}^{p_1}\sum_{\beta=0}^{p_2}c_{i+\alpha,j+\beta}
\Lambda_{i+\alpha}(x)\Lambda_{j+\beta}(y) \\
\phi(x,y) &= \sum_{k=0}^{p_1}\sum_{l=0}^{p_2}\,\Pi^{ij}_{kl}(x-x_i)^k(y-y_j)^l, \qquad
\Pi^{ij}_{kl}= \sum_{\alpha=0}^{p_1}\sum_{\beta=0}^{p_2}c_{i+\alpha,j+\beta} V^{i}_{k\alpha}V^{j}_{l\beta},
\end{split}
\end{equation}
where $ V^{i}_{k\alpha}$ and $V^{j}_{l\beta}$ are the PP
coefficients of the splines $\Lambda_{i+\alpha}(x)$ and
$\Lambda_{j+\beta}(y)$ respectively. Assuming the same spline order
$p$ in
both $x$ and $y$, the flop counts per point for the two representations are
respectively $2(3p+2)(p+1)\sim 6p^2$ and $2p(p+2)\sim 2p^2$, while the
storage required for the spline coefficients $c$ and the PP
coefficients $\Pi$ is $(N+p)^2\sim N^2$ and $N^2(p+1)^2$ respectively.
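For illustration, the 2D PP evaluation by nested Horner's rule could
be sketched as follows (array names are illustrative): the $p+1$
partial Horner evaluations in $y$ cost $2p$ flops each, plus $2p$
flops for the final evaluation in $x$, which recovers the $2p(p+2)$
count quoted above.
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
! Evaluate phi(x,y) from the PP coefficients pi(0:p,0:p,i,j)
! of the cell [x_i,x_i+1] x [y_j,y_j+1]; g(0:p) is a work array
dx = x - xgrid(i)
dy = y - ygrid(j)
DO k=0,p                       ! Horner in y for each power of x
   g(k) = pi(k,p,i,j)
   DO l=p-1,0,-1
      g(k) = pi(k,l,i,j) + dy*g(k)
   END DO
END DO
f = g(p)                       ! final Horner in x
DO k=p-1,0,-1
   f = g(k) + dx*f
END DO
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}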
\subsection{Implementation in BSPLINES}
The PP representation is selected by default in BSPLINES,
\emph{unless} the logical keyword {\tt NLPPFORM} is set to
{\tt .FALSE.} when calling the spline initialization routine {\tt
SET\_SPLINE}. The flop counts per point for both methods are
summarized in the table below:
\begin{center}
\begin{tabular}{|l|c|c|}
\hline
& 1D & 2D \\\hline
Spline expansion & $2(p+1)^{2}$ & $2(3p+2)(p+1)$ \\
PP representation & $2p$ & $2p(p+2)$ \\\hline
\end{tabular}
\end{center}
The routine {\tt GRIDVAL} computes the value of the
field or one of its derivatives. The first call to this routine
computes the PP coefficients $\Pi$ if {\tt NLPPFORM=.TRUE.} is
selected, or just stores the spline coefficients $c$ in the spline
internal data otherwise. In subsequent calls to {\tt GRIDVAL}, $c$
should not be passed to the routine.
Notice that the PP representation requires storing the $N^2(p+1)^2$
PP coefficients in the 2D case, which is still acceptable. In the 3D
case, this storage requirement becomes $N^3(p+1)^3$, which can be
prohibitive! In this case the less efficient \emph{spline expansion}
formulation should be selected.
In the \emph{particle loop}, the routine {\tt GETGRAD}, which computes
the function and all its first partial derivatives at once, should be
called instead of {\tt GRIDVAL}.
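A schematic call sequence could then look as follows; the argument
lists are assumed here for illustration only, and the actual
interfaces are documented in the {\tt BSPLINES} reference guide
\cite{BSPLINES}.
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
! Argument lists are illustrative, not the actual interfaces
CALL set_spline(..., nlppform=.TRUE.)  ! init, precompute V
CALL gridval(..., c)    ! 1st call: builds the PP coefficients
CALL getgrad(...)       ! particle loop: field and gradient,
                        ! without passing c again
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}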
\section{Particle localization ({\tt locintv})}
In both charge assignment and field interpolation, the first step is
to find in which interval of the spatial grid each particle is
located. This operation is trivial for an equidistant
mesh. For a non-equidistant mesh, an \emph{equidistant fine} mesh and its mapping to the
actual mesh are first constructed in the spline initialization routine
{\tt SET\_SPLINE} and then used to localize the particles in the routine
{\tt LOCINTV}.
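A minimal sketch of both cases (array names are illustrative and do
not reflect the actual {\tt LOCINTV} implementation):
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
! Equidistant mesh of spacing dx: direct index computation
i = MIN(INT((x - xgrid(1))/dx) + 1, n)
!
! Non-equidistant mesh: look up a fine equidistant mesh of
! spacing dxfine whose mapping imap to the actual mesh was
! built once in the spline initialization
k = MIN(INT((x - xgrid(1))/dxfine) + 1, nfine)
i = imap(k)
DO WHILE (x .GE. xgrid(i+1))   ! correct the candidate interval
   i = i + 1
END DO
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}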
\section{Performance}
From the considerations above, using BSPLINES to perform the charge
assignment and field interpolation in 2D and 3D particle codes might
result in large overheads, because of the large number of calls to the
routines {\tt BASFUN} to compute the splines or {\tt GETGRAD} to perform the field
interpolation at a \emph{single} particle position. In the following,
the performance of the 2D linearized gyrokinetic code GYGLES, which has
been adapted to use BSPLINES, is analyzed. Vectorization by grouping
the particles for both charge assignment and field interpolation is then
proposed as a way to speed up these two operations when using
BSPLINES.
\subsection{Scalar performance}
Optimization of the scalar versions of {\tt BASFUN} and {\tt GETGRAD}
(when these routines are called with a \emph{single} particle) is
achieved essentially by:
\begin{itemize}
\item Minimizing the flop counts and reducing redundant operations.
\item Unrolling small loops, for example the loop over the $p+1$
splines that are non-zero at a given position, for small $p$.
\item Defining all routines called by {\tt BASFUN} and {\tt GETGRAD} as
\emph{internal procedures}.
\item Rearranging the memory layout of the multi-dimensional array
containing the PP coefficients of the splines.
\end{itemize}
The timings of the charge and current assignment (assign), the particle
pusher (push) and the main time loop for a 5-time-step run of GYGLES,
on an Intel Xeon X5570 (hpcff.fz-juelich.de), using 4 MPI
processes and Intel Fortran 12.1.2, are summarized in the following
table:
\begin{center}
\begin{tabular}{lrrrrr}
\hline
& $T_0$(s) & $T_1$(s) & $T_2$(s) & $T_1/T_0$ & $T_2/T_1$ \\
\hline
assign & 1.454E+01 & 2.126E+01 & 2.259E+01 & 1.46 & 1.06 \\
push & 2.536E+01 & 3.080E+01 & 3.144E+01 & 1.21 & 1.02 \\
mainloop & 4.197E+01 & 5.955E+01 & 6.149E+01 & 1.42 & 1.03 \\
\hline
\end{tabular}
\end{center}
where $T_0$ is the time in seconds obtained with the original code,
while $T_1$ and $T_2$ are the times obtained with BSPLINES,
using respectively an \emph{equidistant} and a \emph{non-equidistant}
radial mesh. In all three runs, quadratic splines were used.
The small difference between the \emph{equidistant} and
\emph{non-equidistant} meshes comes mainly from the particle localization.
The same run on an Intel Xeon E5-2680 (helios.iferc-csc.org), using
the same Intel compiler (with AVX instructions), yields:
\begin{center}
\begin{tabular}{lrrrrr}
\hline
& $T_0$(s) & $T_1$(s) & $T_2$(s) & $T_1/T_0$ & $T_2/T_1$ \\
\hline
assign & 1.093E+01 & 1.987E+01 & 2.086E+01 & 1.82 & 1.05 \\
push & 2.385E+01 & 2.868E+01 & 2.994E+01 & 1.20 & 1.04 \\
mainloop & 3.656E+01 & 5.411E+01 & 5.598E+01 & 1.48 & 1.03 \\
\hline
\end{tabular}
\end{center}
\subsection{Speed up by vectorization}
As found in the last section, using external routines from BSPLINES
instead of \emph{hard coding} the spline computations
results in a slowdown of 40--50\% for the main time
loop. As will be shown later, this problem can be solved by \emph{grouping} the
particles and using the vectorized {\tt BASFUN} and {\tt GETGRAD}
routines. Such particle grouping can be done, for example, by replacing the usual
particle loop with the following Fortran code fragment:
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
nset = npart/ngroup               ! number of particle groups
IF(MODULO(npart, ngroup).NE.0) nset = nset+1
i2 = 0
DO is=1,nset
   i1 = i2+1
   i2 = MIN(i2+ngroup,npart)      ! last group may be smaller
   CALL basfun(x(i1:i2), ...)
END DO
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}
where {\tt npart} particles are partitioned into {\tt nset} groups,
each containing at most {\tt ngroup} particles. Vectorization of the
routines {\tt BASFUN} and {\tt GETGRAD} is achieved by moving, whenever
possible, the loop over the {\tt ngroup} particles into the
innermost loop.
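As an illustration of this loop reordering (a sketch with
illustrative names, not the actual BSPLINES source), the Horner
evaluation of the PP representation with the particle loop moved
innermost reads:
\par
\addvspace{\medskipamount}
\nopagebreak\hrule
\begin{verbatim}
! xp(1:ng) holds the offsets x-x_i and iv(1:ng) the interval
! indices of the current particle group; starting from f=0
! costs one extra multiply-add but keeps the loop uniform
f(1:ng) = 0.0d0
DO k=p,0,-1
   DO ip=1,ng                ! vectorizable innermost loop
      f(ip) = pi(k,iv(ip)) + xp(ip)*f(ip)
   END DO
END DO
\end{verbatim}
\nopagebreak\hrule
\addvspace{\medskipamount}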
The vectorization performance shown in Figs.~\ref{fig:basfun_hpcff}
and \ref{fig:getgrad_hpcff}, respectively for {\tt BASFUN} and {\tt
GETGRAD}, is obtained using version 12.1.2 of the Intel compiler on
an Intel Xeon X5570 (hpcff.fz-juelich.de). With a speedup of at least
2 for quadratic splines, the slowdown found previously in the
scalar version can likely be compensated. The new AVX instructions
available on the more recent Intel Xeon E5-2680 (helios.iferc-csc.org) seem
to improve the vectorization performance somewhat, as shown in
Figs.~\ref{fig:basfun_helios} and \ref{fig:getgrad_helios}.
\begin{figure}
\centering
\includegraphics[angle=0,width=\hsize]{basfun_perf_hpcff}
\caption{In this test, $10^5$ particles are distributed randomly on
an equidistant mesh of 64 intervals. At each point, all the $p+1$
splines are computed. The particle localization routine {\tt
locintv} is included in the timing. In order to have good
statistics in the measurements, $1000$ iterations of the particle loop are considered.}
\label{fig:basfun_hpcff}
\end{figure}
\begin{figure}
\centering
\includegraphics[angle=0,width=\hsize]{getgrad_perf_hpcff}
\caption{In this test, $10^5$ particles are distributed randomly on
an equidistant 2D $(x,y)$ mesh of $64\times 64$ intervals, where
the coordinate $y$ is periodic. At each point, the function
together with its gradient is computed, using the PP
representation. The particle localization routine {\tt
locintv} is included in the timing. In order to have good
statistics in the measurements, $100$ iterations of the particle loop are considered.}
\label{fig:getgrad_hpcff}
\end{figure}
\begin{figure}
\centering
\includegraphics[angle=0,width=\hsize]{basfun_perf_helios}
\caption{In this test, $10^5$ particles are distributed randomly on
an equidistant mesh of 64 intervals. At each point, all the $p+1$
splines are computed. The particle localization routine {\tt
locintv} is included in the timing. In order to have good
statistics in the measurements, $1000$ iterations of the particle loop are considered.}
\label{fig:basfun_helios}
\end{figure}
\begin{figure}
\centering
\includegraphics[angle=0,width=\hsize]{getgrad_perf_helios}
\caption{In this test, $10^5$ particles are distributed randomly on
an equidistant 2D $(x,y)$ mesh of $64\times 64$ intervals, where
the coordinate $y$ is periodic. At each point, the function
together with its gradient is computed, using the PP
representation. The particle localization routine {\tt
locintv} is included in the timing. In order to have good
statistics in the measurements, $100$ iterations of the particle loop are considered.}
\label{fig:getgrad_helios}
\end{figure}
\begin{thebibliography}{99}
\bibitem{BSPLINES} {\tt BSPLINES} Reference Guide.
\end{thebibliography}
\end{document}
