
%!TEX root = ../main.tex
%\section{Proof of Corollary~\ref{cor:first}}
%Denote $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$ for every $k$. Using the convergence proof of the outer algorithm, we have the following bound
%\begin{proof}
%Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that
%\begin{equation}
% \epsilon_{f} = \frac{Q}{\b_{K}},
% \label{eq:acc_to_b}
%\end{equation}
%or, equivalently, $\b_{K} = Q/\epsilon_{f}$.
%%where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}.
%We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} ad by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as
%\begin{align}\label{eq: tk_bound}
%T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho^2 }{\epsilon_k} \right) \nonumber\\
%& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho^2 \right)
%\qquad \text{(Step 1 of Algorithm \ref{Algo:2})}
%\nonumber\\
%& \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho^2 \right)
%\qquad \left( \{\b_k\}_k \text{ is increasing} \right)
% \nonumber\\
% & \le \mathcal{O}\left( \frac{K Q^{{3}} \rho^2}{\epsilon_{f}^{{3}}} \right).
% \qquad \text{(see \eqref{eq:acc_to_b})}
%\end{align}
%In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed,
%\begin{equation}
%\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right),
%\end{equation}
%which, after substituting into~\eqref{eq: tk_bound} gives the final bound in Corollary~\ref{cor:first}.
%\end{proof}
%\begin{equation}
%T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right),
%\end{equation}
%which completes the proof of Corollary~\ref{cor:first}.
%\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$}
%
%We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$.
%\begin{equation}
%\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}.
%\end{equation}
%
%By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$, the algorithm bound becomes,
%
%\begin{equation}
%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}.
%\end{equation}
%
%Total iteration number is
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}.
%\end{equation}
%
%We now consider two different relations between $b$ and $e$ to see what is going on.
%
%\textbf{Case 1:} $b\geq e$:
%
%Bound for the algorithm:
%
%\begin{equation}
%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon,
%\end{equation}
%
%which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$.
%Writing down the total number of iterations and plugging in $K$,
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}.
%\end{equation}
%
%To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible.
%Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity.
%
%\textbf{Case 2:} $b\leq e$:
%
%Same calculations yield the following bound on the total number of iterations:
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}.
%\end{equation}
%
%Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible.
%
%To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible.
\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
\begin{proof}
Note that
\begin{align}
\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
\end{align}
which implies that
\begin{align}
& \nabla_x \mathcal{L}_\beta(x,y) \nonumber\\
& = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \b \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
& = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x),
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach
\begin{align}
\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) \nonumber\\
& \qquad +\b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
\end{align}
It follows that
\begin{align}
& \|\nabla_x^2 \mathcal{L}_\beta(x,y)\|\nonumber\\
& \le \| \nabla^2 f(x) \| + \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) \nonumber\\
& \qquad +\beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\
& \le \lambda_f+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F.
\end{align}
For every $x$ such that $\|x\|\le \rho$ and $\|A(x)\|\le \rho'$, we conclude that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\|
& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho' \right)+ \b \max_{\|x\|\le \rho}\|DA(x)\|_F^2,
\end{align}
which completes the proof of Lemma \ref{lem:smoothness}.
\end{proof}
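For concreteness, the following short Python snippet (our own toy instance, not part of the paper's code) numerically checks the Hessian expression and the resulting bound for a quadratic $f$ and quadratic constraint maps $A_i$, with the constants $\lambda_f$ and $\lambda_A$ computed for this instance.
\begin{verbatim}
# Toy check of the Hessian formula and bound above (our own instance):
# f(x) = 0.5 x'Hx and A_i(x) = 0.5 x'Q_i x - c_i, so every term is explicit.
import numpy as np

rng = np.random.default_rng(0)
d, m, beta = 5, 3, 10.0
H = rng.standard_normal((d, d)); H = (H + H.T) / 2        # Hessian of f
Qs = [rng.standard_normal((d, d)) for _ in range(m)]
Qs = [(Q + Q.T) / 2 for Q in Qs]                          # Hessians of A_i
c = rng.standard_normal(m)
x = rng.standard_normal(d); y = rng.standard_normal(m)

A  = np.array([0.5 * x @ Q @ x - ci for Q, ci in zip(Qs, c)])
DA = np.stack([Q @ x for Q in Qs])                        # Jacobian, m x d

# Hessian of the augmented Lagrangian, following the display above.
hess = H + sum((y[i] + beta * A[i]) * Qs[i] for i in range(m)) \
         + beta * DA.T @ DA

lam_f = np.linalg.norm(H, 2)                              # ||grad^2 f||
lam_A = max(np.linalg.norm(Q, 2) for Q in Qs)             # max_i ||grad^2 A_i||
bound = lam_f + np.sqrt(m) * lam_A * (np.linalg.norm(y)
        + beta * np.linalg.norm(A)) + beta * np.linalg.norm(DA, 'fro') ** 2

assert np.linalg.norm(hess, 2) <= bound + 1e-9
\end{verbatim}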
%\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
%
%Throughout, let
%\begin{align}
%G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
%\end{align}
%for short.
%Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
%\begin{align}
%\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle
%+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\
%& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle
%+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+)
%\label{eq:descent pr 1}
%\end{align}
%Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
%\begin{align}
%\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+).
%\label{eq:opt of prox}
%\end{align}
%By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
%\begin{align}
%\L_\b(x^+,y)+g(x^+) &
%\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
%& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
%& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2,
%\end{align}
%where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.
%
%
%\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
%
%
%Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that
%\begin{equation}
%x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}).
%\label{eq:optimality of uplus}
%\end{equation}
%Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies
%\begin{align}
%& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
% & \le \mathcal{L}_\beta(x,y) + \g \left\langle
%x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y)
%\right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2
%+ g(x_\g^+)
%\nonumber\\
%& = \mathcal{L}_\beta(x,y) + \left\langle
%x - x^+_{\gamma} ,\xi
%\right\rangle
%- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\
%& \le \mathcal{L}_\beta(x,y)
%- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g)
%\qquad (\text{convexity of } g) \nonumber\\
%& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g),
%\qquad \text{(see Definition \ref{def:grad map})}
%\end{align}
%which completes the proof of Lemma \ref{lem:eval Lipsc cte}.
%
%
%
\section{Clustering \label{sec:verification2}}
We only verify the condition in~\eqref{eq:regularity} here.
Note that
\begin{align}
A(x) = VV^\top \mathbf{1}- \mathbf{1},
\end{align}
\begin{align}
DA(x) & =
\left[
\begin{array}{cccc}
w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\
%x_2^\top p& \cdots & x_{2}^\top\\
\vdots\\
w_{n,1}x_{n}^\top & \cdots & w_{n,n} x_{n}^\top
\end{array}
\right] \nonumber\\
& = \left[
\begin{array}{ccc}
V & \cdots & V
\end{array}
\right]
+
\left[
\begin{array}{ccc}
x_1^\top & \\
& \ddots & \\
& & x_n^\top
\end{array}
\right],
\label{eq:Jacobian clustering}
\end{align}
%where $I_n\in\RR^{n\times n}$ is the identity matrix,
where $w_{i,i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix is block-diagonal. For $x_k$, define $V_k$ accordingly and let $x_{k,i}$ denote the $i$th row of $V_k$.
Consequently,
\begin{align}
DA(x_k)^\top A(x_k) & =
\left[
\begin{array}{c}
(V_k^\top V_k - I_n) V_k^\top \mathbf{1}\\
\vdots\\
(V_k^\top V_k - I_n) V_k^\top \mathbf{1}
\end{array}
\right] \nonumber\\
& \qquad +
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right],
\end{align}
where $I_n\in \RR^{n\times n}$ is the identity matrix.
Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$ (which can be enforced in the iterates by replacing $C$ with $(1-\epsilon)C$ for a small positive $\epsilon$ in the subproblems). Under this assumption, it follows that
\begin{align}
(\partial g(x_k))_i =
\begin{cases}
0 & (x_k)_i > 0\\
\{a: a\le 0\} & (x_k)_i = 0,
\end{cases}
\qquad i\le d.
\label{eq:exp-subgrad-cluster}
\end{align}
Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_n$. This can also be enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the presentation here. Under these assumptions, the (squared) right-hand side of \eqref{eq:regularity} becomes
\begin{align}
& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\
& = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2
\qquad (a_+ = \max(a,0))
\nonumber\\
& =
\left\|
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right]
\right\|^2
\qquad (x_k\in C \Rightarrow x_k\ge 0)
\nonumber\\
& = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& \ge \min_i \| x_{k,i}\|^2
\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& = \min_i \| x_{k,i}\|^2
\cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2.
\label{eq:final-cnd-cluster}
\end{align}
Therefore, given a prescribed $\nu$, ensuring $\min_i \|x_{k,i}\| \ge \nu$ guarantees \eqref{eq:regularity}. When the algorithm is initialized close enough to the constraint set, there is indeed no need to separately enforce this condition. In practice, $n$ often exceeds the number of true clusters, and a more intricate analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$.
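As a quick numerical illustration of the last inequality in \eqref{eq:final-cnd-cluster}, the following Python snippet (our own toy instance, not part of the paper's code) evaluates both sides for a random nonnegative $V_k$, following the displayed formulas.
\begin{verbatim}
# Evaluate the two sides of the final inequality above for a random
# nonnegative V (rows x_{k,i}); the inequality holds for any such V.
import numpy as np

rng = np.random.default_rng(1)
n, r = 8, 3
V = np.abs(rng.standard_normal((n, r)))        # nonnegative rows x_{k,i}
residual = V @ V.T @ np.ones(n) - np.ones(n)   # A(x_k) = V V^T 1 - 1

stacked = np.concatenate([V[i] * residual[i] for i in range(n)])
lhs = np.linalg.norm(stacked) ** 2             # sum_i ||x_{k,i}||^2 (.)_i^2
rhs = min(np.linalg.norm(V[i]) ** 2 for i in range(n)) \
      * np.linalg.norm(residual) ** 2          # min_i ||x_{k,i}||^2 ||.||^2

assert lhs >= rhs - 1e-12
\end{verbatim}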
%\section{Basis Pursuit \label{sec:verification1}}
%We only verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A,g$ specified in \eqref{eq:bp-equiv}. Note that
%\begin{align}
%DA(x) = 2 \overline{B} \text{diag}(x),
%\label{eq:jacob-bp}
%\end{align}
%where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as
%\begin{align}
%& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\
%& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \qquad (g\equiv 0)\nonumber\\
%& = \|DA(x_k)^\top A(x_k) \| \nonumber\\
%& =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|.
%\qquad \text{(see \eqref{eq:jacob-bp})}
%\label{eq:cnd-bp-pre}
%\end{align}
%To bound the last line above, let $x_*$ be a solution of~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the vector of amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank.
%We then rewrite the norm in the last line of \eqref{eq:cnd-bp-pre} as
%\begin{align}
%& \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\
%& = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2
%\qquad (\overline{B} x_*^{\circ 2} = b)
%\nonumber\\
%& = \| \text{diag}(x_k)\overline{B}^\top B (x_k - x_*) \|^2 \nonumber\\
%& = \| \text{diag}(u_{k,1})B^\top B (z_k - z_*) \|^2 \nonumber\\
%& \qquad + \| \text{diag}(u_{k,2})B^\top B (z_k - z_*) \|^2 \nonumber\\
%& = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (z_k - z_*) \|^2 \nonumber\\
%& = \| \text{diag}(|z_k|) B^\top B (z_k - z_*) \|^2 \nonumber\\
%& \ge
%\eta_n ( B \text{diag}(|z_k|) )^2
%\| B(z_k - z_*) \|^2 \nonumber\\
%& =
%\eta_n ( B \text{diag}(|z_k|) )^2
%\| B z_k -b \|^2
%\qquad ( Bz_* = \ol{B} x^{\circ2}_* = b) \nonumber\\
%& \ge \min_{|T|=n}\eta_n(B_T)\cdot |z_{k,(n)}|^2 \|Bz_k - b\|^2,
%\end{align}
%where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. In the last line above, $B_T$ is the restriction of $B$ to the columns indexed by $T$ of size $n$. Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z$ in magnitude.
%Given a prescribed $\nu$, \eqref{eq:regularity} therefore holds if
%\begin{align}
%|z_{k,(n)} | \ge \frac{\nu}{2 \sqrt{\min_{|T|=n} \eta_n(B_T)}} ,
%\label{eq:final-bp-cnd}
%\end{align}
%for every iteration $k$. If Algorithm \ref{Algo:2} is initialized close enough to the solution $z^*$ and the entries of $z^*$ are sufficiently large in magnitude, there will be no need to directly enforce \eqref{eq:final-bp-cnd}.
%\paragraph{Discussion}
%The true potential of the reformulation of BP in \eqref{eq:bp-equiv} is in dealing with more structured norms than $\ell_1$, where computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
%\begin{align*}
%\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
%\end{align*}
%where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.
\section{Additional Experiments}{\label{sec:adexp}}
\subsection{Basis Pursuit}{\label{sec:bp}}
Basis Pursuit (BP) finds the sparsest solution of an underdetermined system of linear equations by solving
\begin{align}
%\begin{cases}
\min_{z} \|z\|_1 \subjto
Bz = b,
%\end{cases}
\label{eq:bp_main}
\end{align}
where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$.
%BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}.
Various primal-dual convex optimization algorithms are available in the literature to solve BP, including~\cite{tran2018adaptive,chambolle2011first}.
%There also exists a line of work \cite{beck2009fast} that handles sparse regression via regularization with $\ell_1$ norm, known as Lasso regression, which we do not compare with here due to their need for tuning the regularization term.
We compare our algorithm against state-of-the-art primal-dual convex methods for solving \eqref{eq:bp_main}, namely, Chambolle-Pock~\cite{chambolle2011first}, ASGARD~\cite{tran2018smooth} and ASGARD-DL~\cite{tran2018adaptive}.
%\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.}
Here, we take a different approach and cast~(\ref{eq:bp_main}) as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with
\begin{align}
f(x) =& \|x\|^2, \quad g(x) = 0, \nonumber\\
A(x) =& \overline{B}x^{\circ 2}- b.
\label{eq:bp-equiv}
\end{align}
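As a small numerical illustration of this reformulation (our own sketch, not part of the paper's code), the following Python snippet verifies that the change of variables preserves both the objective, $\|x\|^2 = \|z\|_1$, and the constraint, $\overline{B}x^{\circ 2} = Bz$.
\begin{verbatim}
# Check the change of variables: x = [sqrt(z^+); sqrt(z^-)] gives
# ||x||^2 = ||z||_1 and Bbar x^{o2} = B z for any z.
import numpy as np

rng = np.random.default_rng(0)
n, d = 10, 30
B = rng.standard_normal((n, d))
z = rng.standard_normal(d)

u1 = np.sqrt(np.maximum(z, 0.0))     # u1^{o2} = z^+
u2 = np.sqrt(np.maximum(-z, 0.0))    # u2^{o2} = z^-
x = np.concatenate([u1, u2])
B_bar = np.hstack([B, -B])

assert np.isclose(np.linalg.norm(x) ** 2, np.linalg.norm(z, 1))
assert np.allclose(B_bar @ (x ** 2), B @ z)
\end{verbatim}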
We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ is chosen at random and its nonzero amplitudes are drawn from the standard Gaussian distribution.
The measurement vector is then created as $b = Bz_* + \epsilon$, where $\epsilon$ is a noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.
The results are compiled in Figure~\ref{fig:bp1}. The performance of Algorithm~\ref{Algo:2} with a second-order solver for BP is comparable to the rest.
%Figure~\ref{fig:bp1} compiles our results for the proposed relaxation.
It is indeed interesting that this type of nonconvex relaxation recovers the solution of the convex problem and that first-order methods succeed on it.
\begin{figure}[]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
%\begin{center}
\centering
{\includegraphics[width=.4\columnwidth]{figs/bp_all_obj.pdf}}
{\includegraphics[width=.4\columnwidth]{figs/bp_all_feas.pdf}}
\caption{Basis Pursuit}
\label{fig:bp1}
\end{figure}
%
\paragraph{Discussion:}
The true potential of our reformulation is in dealing with more structured norms than $\ell_1$, for which computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
\begin{align*}
\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
\end{align*}
where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the nonconvex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.
%\vspace{3mm}
\paragraph{Condition verification:}
In the sequel, we verify that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A,g$.
Note that
%We hereby verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A,g$ specified in \eqref{eq:bp-equiv}.
\begin{align}
DA(x) = 2 \overline{B} \text{diag}(x),
\label{eq:jacob-bp}
\end{align}
where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as
\begin{align}
& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\
& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \qquad (g\equiv 0)\nonumber\\
& = \|DA(x_k)^\top A(x_k) \| \nonumber\\
& =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|.
\qquad \text{(see \eqref{eq:jacob-bp})}
\label{eq:cnd-bp-pre}
\end{align}
To bound the last line above, let $x_*$ be a solution of~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the vector of amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank.
We then rewrite the norm in the last line of \eqref{eq:cnd-bp-pre} as
\begin{align}
& \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\
& = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2
\qquad (\overline{B} x_*^{\circ 2} = b)
\nonumber\\
& = \| \text{diag}(x_k)\overline{B}^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1})B^\top B (z_k - z_*) \|^2 \nonumber\\
& \qquad + \| \text{diag}(u_{k,2})B^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(|z_k|) B^\top B (z_k - z_*) \|^2 \nonumber\\
& \ge
\eta_n ( B \text{diag}(|z_k|) )^2
\| B(z_k - z_*) \|^2 \nonumber\\
& =
\eta_n ( B \text{diag}(|z_k|) )^2
\| B z_k -b \|^2
\qquad ( Bz_* = \ol{B} x^{\circ2}_* = b) \nonumber\\
& \ge \min_{|T|=n}\eta_n(B_T)\cdot |z_{k,(n)}|^2 \|Bz_k - b\|^2,
\end{align}
where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. In the last line above, $B_T$ is the restriction of $B$ to the columns indexed by $T$ of size $n$. Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z_k$ in magnitude.
Given a prescribed $\nu$, \eqref{eq:regularity} therefore holds if
\begin{align}
|z_{k,(n)} | \ge \frac{\nu}{2 \sqrt{\min_{|T|=n} \eta_n(B_T)}} ,
\label{eq:final-bp-cnd}
\end{align}
for every iteration $k$. If Algorithm \ref{Algo:2} is initialized close enough to the solution $z_*$ and the entries of $z_*$ are sufficiently large in magnitude, there will be no need to directly enforce \eqref{eq:final-bp-cnd}.
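As a sanity check of \eqref{eq:jacob-bp}, the following Python snippet (our own sketch, not part of the paper's code) compares the claimed Jacobian with a finite-difference approximation on a random instance.
\begin{verbatim}
# Finite-difference check of DA(x) = 2 Bbar diag(x) for A(x) = Bbar x^{o2} - b.
import numpy as np

rng = np.random.default_rng(0)
n, d = 6, 8
B = rng.standard_normal((n, d))
B_bar = np.hstack([B, -B])
b = rng.standard_normal(n)
x = rng.standard_normal(2 * d)

A = lambda x: B_bar @ (x ** 2) - b
DA = 2 * B_bar @ np.diag(x)                    # claimed Jacobian

eps = 1e-6
DA_fd = np.column_stack([(A(x + eps * e) - A(x - eps * e)) / (2 * eps)
                         for e in np.eye(2 * d)])
assert np.allclose(DA, DA_fd, atol=1e-5)
\end{verbatim}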
\subsection{Generalized Eigenvalue Problem}{\label{sec:geig}}
\begin{figure}[!h]
%\begin{table}[h]
\begin{tabular}{l|l|l}
~~~~~~~~~(i) $C:$ Gaussian iid & ~~~~~(ii) $C:$ Polynomial decay & ~~~~~(iii) $C:$ Exponential decay \\
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case1_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case2_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case3_convergence.pdf}\\
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case1_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case2_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case3_eigenvalues.pdf} \\ \hline
~~~~~~~~~~~~~~~~~~~~(iv) & ~~~~~~~~~~~~~~~~~~~~(v) & ~~~~~~~~~~~~~~~~~~~(vi) \\
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case4_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case6_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case7_convergence.pdf}\\
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case4_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case6_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case7_eigenvalues.pdf}
\end{tabular}
%\end{table}
\caption{{\it{(Top)}} Objective convergence for computing the top generalized eigenvalue and eigenvector of the pair $(B, C)$. {\it{(Bottom)}} Eigenvalue structure of the matrices. For (i), (ii) and (iii), $C$ is positive semidefinite; for (iv), (v) and (vi), $C$ contains negative eigenvalues. {[(i): Generated by taking the symmetric part of an iid Gaussian matrix. (ii): Generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots, 1000^{-p}$) ($p=1$). (iii): Generated by randomly rotating diag($10^{-p}, 10^{-2p}, \cdots, 10^{-1000p}$) ($p=0.0025$).]}
}
\label{fig:geig1}
\end{figure}
The generalized eigenvalue problem has extensive applications in machine learning, statistics and data analysis~\cite{ge2016efficient}.
A well-known nonconvex formulation of the problem~\cite{boumal2016non} is given by
\begin{align}
\begin{cases}
\underset{x\in\mathbb{R}^n}{\min} x^\top C x \\
x^\top B x = 1,
\end{cases}
\label{eq:eig}
\end{align}
where $B, C \in \RR^{n \times n}$ are symmetric matrices and $B$ is positive definite, namely, $B \succ 0$.
%Due to the invertibilty of $B$, we have $\langle x, Bx \rangle \geq \frac{\| x\|_F^2}{\| B^{-1} \|}$, which implies the constraint $\| x \|_F ^2 \leq \| B^{-1} \|$.
The generalized eigenvector computation is equivalent to performing principal component analysis (PCA) of $C$ in the norm induced by $B$. It is also equivalent to computing the top eigenvector of the symmetric matrix $S = B^{-1/2}CB^{-1/2}$ and multiplying the resulting vector by $B^{-1/2}$. However, for large values of $n$, computing $B^{-1/2}$ is extremely expensive.
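The following short Python snippet (our own sketch, not part of the paper's code) numerically verifies this equivalence on a small random pair $(B, C)$.
\begin{verbatim}
# Eigenpairs of (C, B) from the symmetric matrix S = B^{-1/2} C B^{-1/2}:
# if S v = lam v, then x = B^{-1/2} v satisfies C x = lam B x and x'Bx = 1.
import numpy as np

rng = np.random.default_rng(0)
n = 5
M = rng.standard_normal((n, n))
B = M @ M.T + n * np.eye(n)                    # symmetric positive definite
C = rng.standard_normal((n, n)); C = (C + C.T) / 2

w, U = np.linalg.eigh(B)
B_inv_sqrt = U @ np.diag(w ** -0.5) @ U.T      # B^{-1/2}

S = B_inv_sqrt @ C @ B_inv_sqrt
lam, V = np.linalg.eigh(S)
v = V[:, -1]                                   # top eigenvector of S
x = B_inv_sqrt @ v                             # candidate generalized eigenvector

assert np.allclose(C @ x, lam[-1] * B @ x)
assert np.isclose(x @ B @ x, 1.0)
\end{verbatim}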
The natural convex SDP relaxation for~\eqref{eq:eig} involves lifting $Y = xx^\top$ and removing the nonconvex rank$(Y) = 1$ constraint, namely,
\begin{align}
\begin{cases}
\underset{Y \in \RR^{n \times n}}{\min} \text{tr}(CY)\\
\text{tr}(BY) = 1, \quad Y \succeq 0.
\end{cases}
\label{eq:eig-sdp}
\end{align}
Here, however, we opt to directly solve~\eqref{eq:eig} because it fits into our template with
\begin{align}
f(x) =& x^\top C x, \quad g(x) = 0,\nonumber\\
A(x) =& x^\top B x - 1.
\label{eq:eig-equiv}
\end{align}
We compare our approach against three different methods: the manifold-based Riemannian gradient descent and Riemannian trust-region methods in \cite{boumal2016global}, and the linear system solver in~\cite{ge2016efficient}, abbreviated as GenELin. We used the Manopt software package in \cite{manopt} for the manifold-based methods. For GenELin, we used MATLAB's backslash operator as the linear solver. The results are compiled in Figure \ref{fig:geig1}.
\paragraph{Condition verification:}
Here, we verify the regularity condition in \eqref{eq:regularity} for problem \eqref{eq:eig}. Note that
\begin{align}
DA(x) = (2Bx)^\top.
\label{eq:jacobian-gen-eval}
\end{align}
Therefore,
\begin{align}
\dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 & = \dist\left( -DA(x_k)^\top A(x_k) , \{0\} \right)^2
\qquad (g\equiv 0)
\nonumber\\
& = \| DA(x_k)^\top A(x_k) \|^2
\nonumber\\
& = \|2Bx_k (x_k^\top Bx_k - 1)\|^2
\qquad \text{(see \eqref{eq:jacobian-gen-eval})}
\nonumber\\
& = 4 (x_k^\top Bx_k - 1)^2\|Bx_k\|^2
\nonumber\\
& = 4\|Bx_k\|^2 \|A(x_k)\|^2
\qquad \text{(see \eqref{eq:eig-equiv})}
\nonumber \\
& \ge \eta_{\min}(B)^2\|x_k\|^2 \|A(x_k)\|^2,
\end{align}
where $\eta_{\min}(B)$ is the smallest eigenvalue of the positive definite matrix $B$.
Therefore, for a prescribed $\nu$, the regularity condition in \eqref{eq:regularity} holds as long as $\|x_k\| \ge \nu /\eta_{\min}(B)$ for every $k$. If the algorithm is initialized close enough to the constraint set, there is again no need to directly enforce this latter condition.
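As a quick numerical sanity check of the chain of (in)equalities above, the following Python snippet (our own toy instance, not part of the paper's code) verifies the final lower bound for a random positive definite $B$.
\begin{verbatim}
# Verify ||DA(x)^T A(x)||^2 = 4 ||Bx||^2 A(x)^2 >= eta_min(B)^2 ||x||^2 A(x)^2.
import numpy as np

rng = np.random.default_rng(0)
n = 6
M = rng.standard_normal((n, n))
B = M @ M.T + n * np.eye(n)                    # positive definite
x = rng.standard_normal(n)

A = x @ B @ x - 1.0                            # constraint value A(x)
grad_term = 2 * B @ x * A                      # DA(x)^T A(x)
eta_min = np.linalg.eigvalsh(B)[0]             # smallest eigenvalue of B

lhs = np.linalg.norm(grad_term) ** 2
assert np.isclose(lhs, 4 * np.linalg.norm(B @ x) ** 2 * A ** 2)
assert lhs >= eta_min ** 2 * np.linalg.norm(x) ** 2 * A ** 2 - 1e-9
\end{verbatim}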
\subsection{$\ell_\infty$ Denoising with a Generative Prior}{\label{sec:gan}}
The authors of \cite{Samangouei2018,Ilyas2017} have proposed to
project onto the range of a Generative Adversarial network (GAN)
\cite{Goodfellow2014}, as a way to defend against adversarial examples. For a
given noisy observation $x^* + \eta$, they consider a projection in the
$\ell_2$ norm. We instead propose to use our augmented Lagrangian method to
denoise in the $\ell_\infty$ norm, a much harder task:
\begin{align}
\begin{array}{lll}
\underset{x, z}{\text{min}} & & \|x^* + \eta - x\|_\infty \\
\text{s.t. } && x=G(z).
\end{array}
\end{align}
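As a minimal sketch of this strategy, the following Python snippet runs an augmented Lagrangian loop on a toy linear generator $G(z) = Wz$ standing in for the pretrained GAN; the generator, the subgradient inner solver, the step sizes and the penalty schedule are our own illustrative choices and not the implementation used in the experiments.
\begin{verbatim}
# Augmented Lagrangian sketch for min_{x,z} ||obs - x||_inf  s.t.  x = W z,
# with subgradient steps on the primal variables and dual ascent on y.
import numpy as np

rng = np.random.default_rng(0)
d, k = 20, 5
W = rng.standard_normal((d, k))                 # toy generator G(z) = W z
obs = W @ rng.standard_normal(k) + 0.1 * rng.standard_normal(d)  # x* + eta

x, z, y, beta = obs.copy(), np.zeros(k), np.zeros(d), 1.0
for outer in range(20):
    for inner in range(200):                    # inexact primal minimization
        r = obs - x
        i = int(np.argmax(np.abs(r)))
        g_obj = np.zeros(d)
        g_obj[i] = -np.sign(r[i])               # subgradient of ||obs - x||_inf
        resid = x - W @ z
        g_x = g_obj + y + beta * resid
        g_z = -W.T @ (y + beta * resid)
        step = 1.0 / (beta * np.sqrt(inner + 1.0))
        x -= step * g_x
        z -= step * g_z
    y += beta * (x - W @ z)                     # dual update
    beta *= 1.5                                 # increase the penalty
print("l_inf error:", np.max(np.abs(obs - W @ z)))
\end{verbatim}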
\begin{figure}[!h]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
\begin{center}
{\includegraphics[scale=0.9]{figs/example_denoising_fab.pdf}}
%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
\caption{Augmented Lagrangian vs Adam and Gradient descent for $\ell_\infty$ denoising}
\label{fig:comparison_fab}
\end{center}
\end{figure}
We use a pretrained generator for the MNIST dataset, given by a standard
deconvolutional neural network architecture \cite{Radford2015}. We compare the
successful optimizer Adam \cite{Kingma2014} and gradient descent against our
method. Our algorithm involves two forward and one backward pass through the
network, as opposed to Adam, which requires only one forward/backward pass. For
this reason, we let our algorithm run for 2000 iterations, and Adam and GD for
3000 iterations. Both Adam and gradient descent generate a sequence of feasible
iterates $x_t=G(z_t)$. We therefore plot the objective evaluated at the point
$G(z_t)$ against the iteration count in Figure \ref{fig:comparison_fab}. Our
method successfully minimizes the objective value, while Adam and GD do not.
\subsection{Quadratic assignment problem}{\label{sec:qap}}
Let $K$, $L$ be $n \times n$ symmetric matrices. The QAP in its simplest form can be written as
\begin{align}
\max \text{tr}(KPLP^\top), \,\,\,\,\,\,\, \text{subject to}\,\, P \,\,\text{being a permutation matrix}.
\label{eq:qap1}
\end{align}
A direct approach for solving \eqref{eq:qap1} involves a combinatorial search.
To get the SDP relaxation of \eqref{eq:qap1}, we will first lift the QAP to a problem involving a larger
matrix. Observe that the objective function takes the form
\begin{align*}
\text{tr}((K\otimes L) (\text{vec}(P)\text{vec}(P)^\top)),
\end{align*}
where $\otimes$ denotes the Kronecker product. Therefore, we can recast \eqref{eq:qap1} as
\begin{align}
\text{tr}((K\otimes L) Y) \,\,\,\,\text{subject to} \,\,\, Y = \text{vec}(P)\text{vec}(P)^\top,
\label{eq:qapkron}
\end{align}
where $P$ is a permutation matrix. We can relax the equality constraint in \eqref{eq:qapkron} to a semidefinite constraint and write it in an equivalent form as
\begin{align*}
X =
\begin{bmatrix}
1 & \text{vec}(P)^\top\\
\text{vec}(P) & Y
\end{bmatrix}
\succeq 0 \,\,\, \text{for a symmetric } X \in \mathbb{S}^{(n^2+1) \times (n^2+1)}.
\end{align*}
We now introduce additional constraints of the form
\begin{equation}
B_k(X) = {\bf{b_k}}, \,\,\,\, {\bf b_k} \in \RR^{m_k}\,\, %\text{and}\, k \in [1, 6]
\label{eq:qapcons}
\end{equation}
to ensure that $X$ has the proper structure. Here, $B_k$ is a linear operator on $X$ and the total number of constraints is $m = \sum_{k} m_k$. Hence, the SDP relaxation of the quadratic assignment problem takes the form,
%\begin{enumerate}
%\item Each entry of the permutation matrix $P$ is either $0$ or $1$. For this, we need to have that first column of the matrix $X$ is equal to its diagonal elements, i.e., $X_{1, i} = X_{i,i} \,\,\, \forall i \in [2, n^2+1]$.
%\item A permutation matrix is doubly stochastic, meaning that all the entries are nonnegative, row sum and column sum of the matrix is equal to $1$ for each row and column, i.e., $\sum_i p_{ij} = 1$ and $\sum_j p_{ij} = 1 \,\, \forall i,j \in [1, n] $.
%\item We know that a permutation matrix is orthogonal, i.e., $PP^\top= P^\top P = I $. This constraint is enforced through partial traces, i.e., $\text{trace}_1(Y) = \text{trace}_1(Y) = I $.
%\end{enumerate}
\begin{align}
\max\,\,\, & \langle C, X \rangle \nonumber \\%\,\text{trace}(K\otimes L)
\text{subject to} \,\,\,& P\mathbf{1} = \mathbf{1}, \,\, \mathbf{1}^\top P = \mathbf{1}^\top,\,\, P\geq 0, \nonumber \\
& \text{trace}_1(Y) = I, \,\,\, \text{trace}_2(Y) = I, \nonumber \\
& \text{vec}(P) = \text{diag}(Y), \nonumber \\
& \text{trace}(Y) = n, \,\, \begin{bmatrix}
1 & \text{vec}(P)^\top\\
\text{vec}(P) & Y
\end{bmatrix}
\succeq 0,
\label{eq:qap_sdp}
\end{align}
where $\text{trace}_1(\cdot)$ and $\text{trace}_2(\cdot)$ are partial traces satisfying
$$
\text{trace}_1(K \otimes L) = \text{trace}(K)L \,\,\,\,\,\, \text{and} \,\,\,\,\, \text{trace}_2(K\otimes L)= K \text{trace}(L)
$$
$$
\text{trace}_1^*(T) = I \otimes T \,\,\,\,\,\, \text{and} \,\,\,\,\, \text{trace}_2^*(T)= T \otimes I
$$
The first set of equalities is due to the fact that permutation matrices are doubly stochastic. The second set of equalities ensures that permutation matrices are orthogonal, i.e., $PP^\top = P^\top P=I$. The third set of equalities enforces that every individual entry of the permutation matrix is either $0$ or $1$, i.e., $X_{1, i} = X_{i,i} \,\,\, \forall i \in [1, n^2+1]$. The trace constraint in the last line bounds the problem domain.
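For concreteness, the following Python snippet (our own illustration, not part of the paper's code) numerically checks the partial trace identities displayed above for randomly generated $K$ and $L$.
\begin{verbatim}
# Check trace_1(K (x) L) = trace(K) L and trace_2(K (x) L) = trace(L) K.
import numpy as np

rng = np.random.default_rng(0)
n = 4
K = rng.standard_normal((n, n))
L = rng.standard_normal((n, n))
X = np.kron(K, L)

blocks = X.reshape(n, n, n, n)               # blocks[i, :, j, :] = K[i, j] * L
trace1 = sum(blocks[i, :, i, :] for i in range(n))   # sum of diagonal blocks
trace2 = np.trace(blocks, axis1=1, axis2=3)          # trace of each block

assert np.allclose(trace1, np.trace(K) * L)
assert np.allclose(trace2, np.trace(L) * K)
\end{verbatim}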
By concatenating the $B_k$'s in \eqref{eq:qapcons}, we can rewrite \eqref{eq:qap_sdp} in standard SDP form as
\begin{align}
\max\,\,\, & \langle C, X \rangle \nonumber \\%\,\text{trace}(K\otimes L)
\text{subject to} \,\,\,& B(X) = \mathbf{b},\,\,\, \mathbf{b} \in \RR^m \nonumber \\
& \text{trace}(X) = n+1 \nonumber \\
& X_{ij} \geq 0, \,\,\,\, (i,j) \in \mathcal{G}\nonumber \\
& X\succeq0,
\end{align}
where $\mathcal{G}$ represents the index set for which we introduce the nonnegativity constraints. When $\mathcal{G}$ covers the whole set of indices, we get the best approximation to the original problem; however, this becomes computationally undesirable as the problem dimension increases. Hence, we remove the redundant nonnegativity constraints and enforce them only for the indices where the Kronecker product between $K$ and $L$ is nonzero.
We penalize the nonnegativity constraints and add them to the augmented Lagrangian objective, since the projection onto the nonnegative orthant in the low-rank space, which we used for clustering, does not work here.
We take \cite{ferreira2018semidefinite} as the baseline; this is an SDP-based approach for solving QAP problems on sparse graphs.
We compare against the best feasible upper bounds reported in \cite{ferreira2018semidefinite} for the given instances. Here, the optimality gap is defined as
$$
\% \text{Gap} = \frac{|\text{bound} - \text{optimal}|}{\text{optimal}} \times 100
$$
We used a (relatively) sparse graph data set from the QAP library.
We ran our low-rank algorithm for different rank values. In each instance, $r_m$ corresponds to the smallest integer satisfying the Pataki bound~\cite{pataki1998rank, barvinok1995problems}.
The results are shown in Table \ref{tb:qap}.
The primal feasibility is below $10^{-5}$ for every instance except the last one, esc128, and we obtained bounds at least as good as those reported in \cite{ferreira2018semidefinite} for these problems.
For esc128, the primal feasibility is $\approx 10^{-1}$; hence, we could not obtain a good optimality gap.% due to limited time.
%% not yet complete
%\begin{table}[]
%\begin{tabular}{|l|l|l|l|l|l|l|}
%\hline
% &rank &Primal feasibility &Feasible upper-bound & Opt. Gap & ADMM & SparseQAP \\ \hline
%\multirow{4}{*}{esc132} &$\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$& & & & & \\ \cline{2-7}
% & $25$ & & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \cline{2-7} \hline
%\multirow{4}{*}{esc64a} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% &$50$ & & & & & \\ \cline{2-7} \hline
%\multirow{4}{*}{esc16a} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16b} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16c} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$& & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16d} & $\sqrt{m}$& & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% &$50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16e} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$ & & & & & \\ \cline{2-7}
% & $50$& & & & & \\ \hline
%\end{tabular}
%\end{table}
\begin{table}[]
\begin{tabular}{|l|l|l|l|l|l|l|l|}
\hline
& \multicolumn{1}{l|}{} & \multicolumn{6}{c|}{Optimality Gap ($\%$)} \\ \hline
\multicolumn{1}{|c|}{\multirow{2}{*}{Data}} & \multicolumn{1}{c|}{\multirow{2}{*}{Optimal Value}} & \multicolumn{1}{c|}{\multirow{2}{*}{Sparse QAP~\cite{ferreira2018semidefinite}}} & \multicolumn{5}{c|}{iAL} \\ \cline{4-8}
\multicolumn{1}{|c|}{} & \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{} & $r=10$ & $r=25$ & $r=50$ & $r=r_m$ & $r_m$ \\ \hline
esc16a & 68 & 8.8 & 11.8 & $\mathbf{0}$ & $\mathbf{0}$ & 5.9 & 157 \\ \hline
esc16b & 292 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 224 \\ \hline
esc16c & 160 & 5 & 5.0 & 5.0 & $\mathbf{2.5}$ & 3.8 & 177 \\ \hline
esc16d & 16 & 12.5 & 37.5 & $\mathbf{0}$ & $\mathbf{0}$ & 25.0 & 126 \\ \hline
esc16e & 28 & 7.1 & 7.1 & $\mathbf{0}$ & 14.3 & 7.1 & 126 \\ \hline
esc16g & 26 & $\mathbf{0}$ & 23.1 & 7.7 & $\mathbf{0}$ & $\mathbf{0}$ & 126 \\ \hline
esc16h & 996 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 224 \\ \hline
esc16i & 14 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 14.3 & $\mathbf{0}$ & 113 \\ \hline
esc16j & 8 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 106 \\ \hline
esc32a & 130 & 93.8 & 129.2 & 109.2 & 104.6 &$\mathbf{83.1}$ & 433 \\ \hline
esc32b & 168 & 88.1 & 111.9 & 92.9 & 97.6 & $\mathbf{69.0}$ & 508 \\ \hline
esc32c & 642 & 7.8 & 15.6 & 14.0 & 15.0 & $\mathbf{4.0}$ & 552 \\ \hline
esc32d & 200 & 21 & 28.0 & 28.0 & 29.0 &$\mathbf{17.0}$ & 470 \\ \hline
esc32e & 2 & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 220 \\ \hline
esc32g & 6 & $\mathbf{0}$ & 33.3 &$\mathbf{0}$ & $\mathbf{0}$ & $\mathbf{0}$ & 234 \\ \hline
esc32h & 438 & 18.3 & 25.1 & 19.6 & 25.1 & $\mathbf{13.2}$ & 570 \\ \hline
esc64a & 116 & 53.4 & 62.1 & 51.7 & 58.6 & $\mathbf{34.5}$ & 899 \\ \hline
esc128 & 64 & $\mathbf{175}$ & 256.3 & 193.8 & 243.8 & 215.6 & 2045 \\ \hline
\end{tabular}
\caption{Comparison between upper bounds on the problems from the QAP library with (relatively) sparse $L$.}
\label{tb:qap}
\end{table}
