%!TEX root = ../main.tex
\section{Proof of Corollary~\ref{cor:first}}
%Denote $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$ for every $k$. Using the convergence proof of the outer algorithm, we have the following bound
Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2}, and let $\epsilon_{f}$ denote its desired accuracy; see \eqref{eq:inclu3}. Recalling Theorem~\ref{thm:main}, we can then write that
\begin{equation}
\epsilon_{f} = \frac{Q}{\b_{K}},
\label{eq:acc_to_b}
\end{equation}
or, equivalently, $\b_{K} = Q/\epsilon_{f}$.
%where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}.
We now count the total number $T$ of (inner) iterations of Algorithm~\ref{Algo:2} required to reach the accuracy $\epsilon_{f}$. Recall from \eqref{eq:smoothness of Lagrangian} that, for sufficiently large $k$, the smoothness parameter of the augmented Lagrangian satisfies $\lambda_{\b_k}\le \lambda'' \b_k$. Then, from \eqref{eq:iter_1storder} and by summing over the outer iterations, we bound the total number of inner iterations of Algorithm~\ref{Algo:2} as
\begin{align}\label{eq: tk_bound}
T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 \rho'^2 }{\epsilon_k} \right) \nonumber\\
& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 \rho'^2 \right)
\qquad \text{(Step 1 of Algorithm \ref{Algo:2})}
\nonumber\\
& \leq \mathcal{O} \left(K\beta_{K-1}^3 \rho'^2 \right)
\qquad \left( \{\b_k\}_k \text{ is increasing} \right)
\nonumber\\
& \le \mathcal{O}\left( \frac{K Q^{{3}} \rho'^2}{\epsilon_{f}^{{3}}} \right).
\qquad \text{(see \eqref{eq:acc_to_b})}
\end{align}
In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed,
\begin{equation}
\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right),
\end{equation}
which, after substituting into~\eqref{eq: tk_bound}, gives the final bound in Corollary~\ref{cor:first}.
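For concreteness, carrying out this substitution in~\eqref{eq: tk_bound} explicitly gives a bound of the form
\begin{equation}
T \le \mathcal{O}\left( \frac{Q^{3} \rho'^2}{\epsilon_f^{3}} \log_b\left( \frac{Q}{\epsilon_f} \right) \right),
\end{equation}
with all constants absorbed into the $\mathcal{O}(\cdot)$ notation.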
%\begin{equation}
%T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right),
%\end{equation}
%which completes the proof of Corollary~\ref{cor:first}.
%\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$}
%
%We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$.
%\begin{equation}
%\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}.
%\end{equation}
%
%By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$, the algorithm bound becomes,
%
%\begin{equation}
%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}.
%\end{equation}
%
%Total iteration number is
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}.
%\end{equation}
%
%We now consider two different relations between $b$ and $e$ to see what is going on.
%
%\textbf{Case 1:} $b\geq e$:
%
%Bound for the algorithm:
%
%\begin{equation}
%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon,
%\end{equation}
%
%which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$.
%Writing down the total number of iterations and plugging in $K$,
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}.
%\end{equation}
%
%To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible.
%Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity.
%
%\textbf{Case 2:} $b\leq e$:
%
%Same calculations yield the following bound on the total number of iterations:
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}.
%\end{equation}
%
%Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible.
%
%To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible.
\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
Note that
\begin{align}
\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
\end{align}
which implies that
\begin{align}
& \nabla_x \mathcal{L}_\beta(x,y) \nonumber\\
& = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \b \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
& = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x),
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach
\begin{align}
\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) \nonumber\\
& \qquad +\b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
\end{align}
It follows that
\begin{align}
& \|\nabla_x^2 \mathcal{L}_\beta(x,y)\|\nonumber\\
& \le \| \nabla^2 f(x) \| + \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) \nonumber\\
& \qquad +\beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\
& \le \lambda_f+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F.
\end{align}
For every $x$ such that $\|x\|\le \rho$ and $\|A(x)\|\le \rho$, we conclude that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\|
& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho' \right)+ \b \max_{\|x\|\le \rho}\|DA(x)\|_F^2,
\end{align}
which completes the proof of Lemma \ref{lem:smoothness}.
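As an illustrative aside, the gradient expression above can be checked numerically. The snippet below is a minimal sketch with a toy smooth $f$ and constraint map $A$ (not an instance from the paper), comparing the closed-form gradient $\nabla f(x) + DA(x)^\top (y + \b A(x))$ of the augmented Lagrangian against central finite differences.
\begin{verbatim}
# Minimal numerical sanity check of
#   grad_x L_beta(x, y) = grad f(x) + DA(x)^T (y + beta * A(x))
# with a toy smooth f and constraint map A (illustration only).
import numpy as np

def f(x):
    return 0.25 * np.sum(x ** 4)          # toy smooth objective

def grad_f(x):
    return x ** 3

def A(x):
    # toy constraint map A: R^d -> R^m with m = 2
    return np.array([x @ x - 1.0, np.sum(x) - 0.5])

def DA(x):
    # Jacobian of A at x, shape (m, d)
    return np.vstack([2.0 * x, np.ones_like(x)])

def aug_lagrangian(x, y, beta):
    Ax = A(x)
    return f(x) + y @ Ax + 0.5 * beta * Ax @ Ax

def grad_aug_lagrangian(x, y, beta):
    return grad_f(x) + DA(x).T @ (y + beta * A(x))

rng = np.random.default_rng(0)
d, beta = 5, 10.0
x, y = rng.standard_normal(d), rng.standard_normal(2)

eps = 1e-6                                # central finite differences
fd = np.array([(aug_lagrangian(x + eps * e, y, beta)
                - aug_lagrangian(x - eps * e, y, beta)) / (2 * eps)
               for e in np.eye(d)])
print(np.max(np.abs(fd - grad_aug_lagrangian(x, y, beta))))  # near zero
\end{verbatim}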
%\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
%
%Throughout, let
%\begin{align}
%G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
%\end{align}
%for short.
%Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
%\begin{align}
%\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle
%+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\
%& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle
%+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+)
%\label{eq:descent pr 1}
%\end{align}
%Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
%\begin{align}
%\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+).
%\label{eq:opt of prox}
%\end{align}
%By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
%\begin{align}
%\L_\b(x^+,y)+g(x^+) &
%\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
%& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
%& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2,
%\end{align}
%where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.
%
%
%\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
%
%
%Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that
%\begin{equation}
%x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}).
%\label{eq:optimality of uplus}
%\end{equation}
%Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies
%\begin{align}
%& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
% & \le \mathcal{L}_\beta(x,y) + \g \left\langle
%x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y)
%\right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2
%+ g(x_\g^+)
%\nonumber\\
%& = \mathcal{L}_\beta(x,y) + \left\langle
%x - x^+_{\gamma} ,\xi
%\right\rangle
%- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\
%& \le \mathcal{L}_\beta(x,y)
%- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g)
%\qquad (\text{convexity of } g) \nonumber\\
%& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g),
%\qquad \text{(see Definition \ref{def:grad map})}
%\end{align}
%which completes the proof of Lemma \ref{lem:eval Lipsc cte}.
%
%
%
\section{Clustering \label{sec:verification2}}
We only verify the condition in~\eqref{eq:regularity}.
Note that
\begin{align}
A(x) = VV^\top \mathbf{1}- \mathbf{1},
\end{align}
and
\begin{align}
DA(x) & =
\left[
\begin{array}{cccc}
w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\
%x_2^\top p& \cdots & x_{2}^\top\\
\vdots\\
w_{n,1}x_{n}^\top & \cdots & w_{n,n} x_{n}^\top
\end{array}
\right] \nonumber\\
& = \left[
\begin{array}{ccc}
V & \cdots & V
\end{array}
\right]
+
\left[
\begin{array}{ccc}
x_1^\top & \\
& \ddots & \\
& & x_n^\top
\end{array}
\right],
\label{eq:Jacobian clustering}
\end{align}
%where $I_n\in\RR^{n\times n}$ is the identity matrix,
where $w_{i,i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear in the first matrix, and the second matrix is block-diagonal. For $x_k$, define $V_k$ as in the example and let $x_{k,i}$ be the $i$th row of $V_k$.
Consequently,
\begin{align}
DA(x_k)^\top A(x_k) & =
\left[
\begin{array}{c}
(V_k^\top V_k - I_n) V_k^\top \mathbf{1}\\
\vdots\\
(V_k^\top V_k - I_n) V_k^\top \mathbf{1}
\end{array}
\right] \nonumber\\
& \qquad +
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right],
\end{align}
where $I_n\in \RR^{n\times n}$ is the identity matrix.
Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$, which can be easily enforced in the iterates. Under this assumption, it follows that
\begin{align}
(\partial g(x_k))_i =
\begin{cases}
0 & (x_k)_i > 0\\
\{a: a\le 0\} & (x_k)_i = 0,
\end{cases}
\qquad i\le d.
\label{eq:exp-subgrad-cluster}
\end{align}
Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_n$. This can also be easily enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the presentation here. Under these assumptions, the (squared) right-hand side of \eqref{eq:regularity} becomes
\begin{align}
& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\
& = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2
\qquad (a_+ = \max(a,0))
\nonumber\\
& =
\left\|
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right]
\right\|^2
\qquad (x_k\in C \Rightarrow x_k\ge 0)
\nonumber\\
& = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& \ge \min_i \| x_{k,i}\|^2
\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& = \min_i \| x_{k,i}\|^2
\cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2.
\end{align}
Since $\|V_kV_k^\top \mathbf{1}-\mathbf{1}\| = \|A(x_k)\|$, the last line above equals $\min_i \|x_{k,i}\|^2 \, \|A(x_k)\|^2$. Given a prescribed $\nu$, ensuring $\min_i\|x_{k,i}\| \ge \nu$ therefore guarantees \eqref{eq:regularity}. This requirement again corresponds to well-separated clusters. When the clusters are sufficiently separated and the algorithm is initialized close enough to the constraint set, there is indeed no need to enforce this condition separately. In practice, $n$ often exceeds the number of true clusters, and a more fine-tuned analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$.
\section{Basis Pursuit \label{sec:verification1}}
We only verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A,g$ specified in \eqref{eq:bp-equiv}. Note that
\begin{align}
DA(x) = 2 \overline{B} \text{diag}(x),
\label{eq:jacob-bp}
\end{align}
where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as
\begin{align}
& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\
& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \qquad (g\equiv 0)\nonumber\\
& = \|DA(x_k)^\top A(x_k) \| \nonumber\\
& =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|.
\qquad \text{(see \eqref{eq:jacob-bp})}
\label{eq:cnd-bp-pre}
\end{align}
To bound the last line above, let $x_*$ be a solution of~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b$ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k$ and $x_*$, respectively. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the amplitudes of $z_k$. To simplify matters, let us also assume that $B$ is full rank.
We then rewrite the last line of \eqref{eq:cnd-bp-pre} as
\begin{align}
& \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\
& = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2
\qquad (\overline{B} x_*^{\circ 2} = b)
\nonumber\\
& = \| \text{diag}(x_k)\overline{B}^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1})B^\top B (z_k - z_*) \|^2 \nonumber\\
& \qquad + \| \text{diag}(u_{k,2})B^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(|z_k|) B^\top B (z_k - z_*) \|^2 \nonumber\\
& \ge
\eta_n ( B \text{diag}(|z_k|) )^2
\| B(z_k - z_*) \|^2 \nonumber\\
& =
\eta_n ( B \text{diag}(|z_k|) )^2
\| B z_k -b \|^2
\qquad ( Bz_* = \ol{B} x^{\circ2}_* = b) \nonumber\\
& \ge \min_{|T|=n}\eta_n(B_T)\cdot |z_{k,(n)}|^2 \|Bz_k - b\|^2,
\end{align}
where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. In the last line above, $B_T$ is the restriction of $B$ to the columns indexed by a set $T$ of size $n$. Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z_k$ in magnitude.
Given a prescribed $\nu$, \eqref{eq:regularity} therefore holds if
\begin{align}
|z_{k,(n)} | \ge \sqrt{\frac{\nu}{ \min_{|T|=n} \eta_n(B_T) \cdot \| Bz_k - b\|^2}},
\end{align}
for every iteration $k$. If Algorithm~\ref{Algo:2} is initialized close enough to the solution $z_*$, there is no need to enforce this condition directly.
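As an illustrative aside, the Jacobian identity \eqref{eq:jacob-bp} can be checked numerically. The sketch below assumes the splitting $\overline{B} = [B, \, -B]$ (so that $\overline{B}x^{\circ 2} = Bz$; stated here only as an assumption for the illustration) and compares $2\,\text{diag}(x)\overline{B}^\top (\overline{B}x^{\circ 2} - b)$ against a finite-difference gradient of $\tfrac{1}{2}\|A(x)\|^2$.
\begin{verbatim}
# Numerical check of DA(x)^T A(x) = 2 diag(x) Bbar^T (Bbar x^2 - b),
# assuming the splitting Bbar = [B, -B] (illustration only).
import numpy as np

rng = np.random.default_rng(1)
n, d = 10, 40
B = rng.standard_normal((n, d))
Bbar = np.hstack([B, -B])
b = rng.standard_normal(n)
x = rng.standard_normal(2 * d)

A = lambda u: Bbar @ u ** 2 - b               # constraint map A(x)
closed = 2 * x * (Bbar.T @ A(x))              # 2 diag(x) Bbar^T A(x)

eps = 1e-6                                    # gradient of 0.5 * ||A(x)||^2
fd = np.array([(0.5 * np.sum(A(x + eps * e) ** 2)
                - 0.5 * np.sum(A(x - eps * e) ** 2)) / (2 * eps)
               for e in np.eye(2 * d)])
print(np.max(np.abs(closed - fd)))            # small (finite-difference accuracy)
\end{verbatim}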
%\paragraph{Discussion}
%The true potential of the reformulation of BP in \eqref{eq:bp-equiv} is in dealing with more structured norms than $\ell_1$, where computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
%\begin{align*}
%\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
%\end{align*}
%where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.
\subsection{$\ell_\infty$ Denoising with a Generative Prior}
The authors of \citep{Ilyas2017} have proposed to project onto the range of a generative adversarial network (GAN) \citep{Goodfellow2014} as a way to defend against adversarial examples. For a given noisy observation $x^* + \eta$, they consider a projection in the $\ell_2$ norm. We instead propose to use our augmented Lagrangian method to denoise in the $\ell_\infty$ norm, a much harder task:
\begin{align}
\begin{array}{lll}
\underset{x, z}{\text{min}} & & \|x^* + \eta - x\|_\infty \\
\text{s.t. } && x=G(z).
\end{array}
\end{align}
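The following is a schematic sketch (not the implementation used for the experiment below) of the corresponding augmented Lagrangian: the generator is replaced by a placeholder $\tanh$-linear map standing in for the pretrained MNIST GAN, and the inner solver is a plain (sub)gradient loop with a step size on the order of $1/\lambda_\b$, while the dual update and the increasing penalty parameter mirror Algorithm~\ref{Algo:2}.
\begin{verbatim}
# Schematic augmented Lagrangian for
#   min_{x,z} ||x_noisy - x||_inf   s.t.   x = G(z)
# with a placeholder generator G and a crude (sub)gradient inner loop.
import numpy as np

rng = np.random.default_rng(0)
d, k = 784, 64                                  # image / latent dimensions
W = rng.standard_normal((d, k)) / np.sqrt(k)
G  = lambda z: np.tanh(W @ z)                   # placeholder generator
DG = lambda z: (1 - np.tanh(W @ z) ** 2)[:, None] * W  # its Jacobian, d x k

x_noisy = rng.standard_normal(d)                # stands in for x* + eta

def subgrad_linf(v):                            # a subgradient of ||.||_inf at v
    g = np.zeros_like(v)
    i = np.argmax(np.abs(v))
    g[i] = np.sign(v[i])
    return g

x, z, y, beta = x_noisy.copy(), np.zeros(k), np.zeros(d), 1.0
for outer in range(10):
    lam = 5.0 * beta * max(1.0, np.linalg.norm(W, 2) ** 2)  # crude smoothness bound
    step = 0.9 / lam                            # step size ~ 1 / lambda_beta
    for inner in range(200):                    # rough inner minimization of L_beta
        r = x - G(z)                            # constraint residual
        gx = -subgrad_linf(x_noisy - x) + y + beta * r
        gz = -DG(z).T @ (y + beta * r)
        x, z = x - step * gx, z - step * gz
    y = y + beta * (x - G(z))                   # dual update
    beta *= 2.0                                 # increase the penalty parameter
print(np.max(np.abs(x_noisy - x)), np.linalg.norm(x - G(z)))  # objective, feasibility
\end{verbatim}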
\begin{figure}[h!]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
\begin{center}
{\includegraphics[width = 6cm, height=5cm]{figs/example_denoising_fab.pdf}}
%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
\caption{Augmented Lagrangian vs.\ Adam for $\ell_\infty$ denoising (left); $\ell_2$ vs.\ $\ell_\infty$ denoising as a defense against adversarial examples (right).}
\label{fig:comparison_fab}
\end{center}
\end{figure}
We use a pretrained generator for the MNIST dataset, given by a standard
deconvolutional neural network architecture. We compare the successful Adam
optimizer against our method. Our algorithm involves two forward/backward passes
through the network, as opposed to Adam, which requires only one. For this reason,
we let our algorithm run for 4000 iterations and Adam for 8000 iterations.
For a particular example, we plot the objective value against the iteration count in
Figure~\ref{fig:comparison_fab}. Our method successfully minimizes the objective value,
while Adam does not succeed.
\subsection{Generalized Eigenvalue Problem}
\begin{figure}[h]
%\begin{table}[h]
\begin{tabular}{l|l|l}
~~~~~~~~~(i) $C:$ Gaussian iid & ~~~~~(ii) $C:$ Polynomial decay & ~~~~~(iii) $C:$ Exponential decay \\
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case1_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case2_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case3_convergence.pdf}\\
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case1_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case2_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case3_eigenvalues.pdf} \\ \hline
~~~~~~~~~~~~~~~~~~~~(iv) & ~~~~~~~~~~~~~~~~~~~~(v) & ~~~~~~~~~~~~~~~~~~~(vi) \\
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case4_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case6_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case7_convergence.pdf}\\
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case4_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case6_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case7_eigenvalues.pdf}
\end{tabular}
%\end{table}
\caption{{\it{(Top)}} Objective convergence for calculating the top generalized eigenvalue and eigenvector of $B$ and $C$. {\it{(Bottom)}} Eigenvalue structure of the matrices. For (i), (ii), and (iii), $C$ is positive semidefinite; for (iv), (v), and (vi), $C$ contains negative eigenvalues. {[(i): generated by taking the symmetric part of an iid Gaussian matrix. (ii): generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots, 1000^{-p}$) ($p=1$). (iii): generated by randomly rotating diag($10^{-p}, 10^{-2p}, \cdots, 10^{-1000p}$) ($p=0.0025$).]}
}
\label{fig:geig1}
\end{figure}
The generalized eigenvalue problem has extensive applications in machine learning, statistics, and data analysis~\cite{ge2016efficient}.
The well-known nonconvex formulation of the problem~\cite{boumal2016non} is
\begin{align}
\begin{cases}
\underset{x\in\mathbb{R}^n}{\min} x^\top C x \\
x^\top B x = 1,
\end{cases}
\label{eq:eig}
\end{align}
where $B, C \in \RR^{n \times n}$ are symmetric matrices and $B$ is positive definite, i.e. $B \succ 0$.
%Due to the invertibilty of $B$, we have $\langle x, Bx \rangle \geq \frac{\| x\|_F^2}{\| B^{-1} \|}$, which implies the constraint $\| x \|_F ^2 \leq \| B^{-1} \|$.
The generalized eigenvector computation is equivalent to performing principal component analysis (PCA) of $C$ with respect to the norm induced by $B$. It is also equivalent to computing the top eigenvector of the symmetric matrix $S = B^{-1/2}CB^{-1/2}$ and multiplying the resulting vector by $B^{-1/2}$. However, for sufficiently large $n$, computing $B^{-1/2}$ is extremely expensive.
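As a quick numerical illustration of this equivalence (with a randomly generated $B \succ 0$ and symmetric $C$; not part of the experiments reported below), one can compare the two routes:
\begin{verbatim}
# Top generalized eigenvector of (C, B) vs. the B^{-1/2}-route via
# S = B^{-1/2} C B^{-1/2} (illustration on random matrices).
import numpy as np
from scipy.linalg import eigh, sqrtm

rng = np.random.default_rng(0)
n = 20
M = rng.standard_normal((n, n)); C = (M + M.T) / 2
P = rng.standard_normal((n, n)); B = P @ P.T + n * np.eye(n)

Binv_sqrt = np.linalg.inv(sqrtm(B).real)
S = Binv_sqrt @ C @ Binv_sqrt
w_S, V_S = np.linalg.eigh(S)
x_via_S = Binv_sqrt @ V_S[:, -1]        # top eigenvector of S, mapped by B^{-1/2}

w_gen, V_gen = eigh(C, B)               # direct generalized eigensolver
x_gen = V_gen[:, -1]                    # top generalized eigenvector

cos = abs(x_via_S @ x_gen) / (np.linalg.norm(x_via_S) * np.linalg.norm(x_gen))
print(np.allclose(w_S, w_gen), cos)     # same spectrum; cosine close to 1
\end{verbatim}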
The natural convex SDP relaxation of~\eqref{eq:eig} lifts the variable to $Y = xx^\top$ and removes the nonconvex constraint $\mathrm{rank}(Y) = 1$:
\begin{align}
\begin{cases}
\underset{Y \in \RR^{n \times n}}{\min} \text{tr}(CY)\\
\text{tr}(BY) = 1, \quad Y \succeq 0.
\end{cases}
\label{eq:eig-sdp}
\end{align}
Here, we solve~\eqref{eq:eig} directly, since it fits into our template with
\begin{align}
f(x) =& x^\top C x, \quad g(x) = 0\nonumber\\
A(x) =& x^\top B x - 1.
\label{eq:eig-equiv}
\end{align}
We compare our approach against three different methods: the manifold-based Riemannian gradient descent and Riemannian trust-region methods of~\cite{boumal2016global}, and the generalized eigenvector via linear system solver (abbreviated as GenELin) of~\cite{ge2016efficient}. We used the Manopt software package~\cite{manopt} for the manifold-based methods. For GenELin, we utilized Matlab's backslash operator as the linear solver. The results are compiled in Figure~\ref{fig:geig1}.
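For illustration, the following minimal sketch (not the code behind Figure~\ref{fig:geig1}) applies the augmented Lagrangian method to the formulation \eqref{eq:eig-equiv}, using plain gradient descent as the inner solver, and compares the resulting Rayleigh quotient with the smallest generalized eigenvalue returned by SciPy; the step size, penalty schedule, and iteration counts are illustrative choices.
\begin{verbatim}
# Augmented Lagrangian sketch for min x^T C x  s.t.  x^T B x = 1,
# i.e., f(x) = x^T C x, A(x) = x^T B x - 1, g = 0 (illustration only).
import numpy as np
from scipy.linalg import eigh

rng = np.random.default_rng(0)
n = 50
M = rng.standard_normal((n, n)); C = (M + M.T) / 2            # symmetric C
P = rng.standard_normal((n, n)); B = P @ P.T + n * np.eye(n)  # B > 0

x = rng.standard_normal(n)
x /= np.sqrt(x @ B @ x)                  # start on the constraint set
y, beta = 0.0, 1.0
normB, normC = np.linalg.norm(B, 2), np.linalg.norm(C, 2)
for outer in range(20):
    # crude bound on the smoothness constant of L_beta near the feasible set
    lam = 2 * normC + 2 * (abs(y) + 5 * beta) * normB
    step = 0.9 / lam
    for inner in range(500):
        Ax = x @ B @ x - 1.0
        grad = 2 * C @ x + 2 * (y + beta * Ax) * (B @ x)  # grad_x L_beta(x, y)
        x = x - step * grad
    y += beta * (x @ B @ x - 1.0)        # dual update
    beta *= 1.5                          # increasing penalty parameter

rayleigh = (x @ C @ x) / (x @ B @ x)
print(rayleigh, eigh(C, B, eigvals_only=True)[0])  # compare with smallest gen. eigval
\end{verbatim}
The step-size choice mirrors the $\gamma \le 1/\lambda_\b$ rule used in the analysis, with a crude smoothness estimate in the spirit of Lemma~\ref{lem:smoothness}.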
Here, we verify the regularity condition in \eqref{eq:regularity} for problem \eqref{eq:eig}. Note that
\begin{align*}
DA(x) = (2Bx)^\top.
\end{align*}
Therefore,
\begin{align}
\dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 & = \dist\left( -DA(x_k)^\top A(x_k) , \{0\} \right)^2 \nonumber\\
& = \| DA(x_k)^\top A(x_k) \|^2 \nonumber\\
& = \|2Bx_k (x_k^\top Bx_k - 1)\|^2 \nonumber\\
& = 4 (x_k^\top Bx_k - 1)^2\|Bx_k\|^2 \nonumber\\
& = 4\|Bx_k\|^2 \|A(x_k)\|^2 \nonumber \\
& \ge \eta_{\min}^2\|x_k\|^2 \|A(x_k)\|^2 \nonumber \\
& \ge \nu^2 \|A(x_k)\|^2,
\end{align}
where $\eta_{\min}$ denotes the smallest eigenvalue of the positive definite matrix $B$, and the last inequality holds provided that $\|x_k\| \ge \nu /\eta_{\min}$. Therefore, the regularity condition in \eqref{eq:regularity} holds whenever $\|x_k\| \ge \nu /\eta_{\min}$.
%\subsection{Quadratic assginment problem}
%
%Let $K$, $L$ be $n \times n$ symmetric metrices. QAP in its simplest form can be written as
%
%\begin{align}
%\max \text{tr}(KPLP), \,\,\,\,\,\,\, \text{subject to}\,\, P \,\,\text{be a permutation matrix}
%\label{eq:qap1}
%\end{align}
%
%A direct approach for solving \eqref{eq:qap1} involves a combinatorial search.
%To get the SDP relaxation of \eqref{eq:qap1}, We will first lift the QAP to a problem involving a larger
%matrix. Observe that the objective function takes the form
%
%\begin{align*}
%\text{tr}((K\otimes L) (\text{vec}(P)\text{vec}(P^\top))),
%\end{align*}
% where $\otimes$ denotes the Kronecker product. Therefore, we can recast \eqref{eq:qap1} as
%
% \begin{align*}
%\text{tr}((K\otimes L) Y) \,\,\,\,\text{subject to} \,\,\, Y = \text{vec}(P)\text{vec}(P^\top),
%\end{align*}
%where $P$ is a permutation matrix. We can relax the equality constraint to a semidefinite constraint and write it in an equivalent form as
%
% \begin{align*}
%X =
%\begin{bmatrix}
%1 & \text{vec}(P)^\top\\
% \text{vec}(P) & Y
%\end{bmatrix}
%\succeq 0 \,\,\, \text{for a symmetric} X \in \mathbb{S}^{(n^2+1) \times (n^2+1)}
%\end{align*}
% not yet complete
%\begin{table}[]
%\begin{tabular}{|l|l|l|l|l|l|l|}
%\hline
% &rank &Primal feasibility &Feasible upper-bound & Opt. Gap & ADMM & SparseQAP \\ \hline
%\multirow{4}{*}{esc132} &$\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$& & & & & \\ \cline{2-7}
% & $25$ & & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \cline{2-7} \hline
%\multirow{4}{*}{esc64a} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% &$50$ & & & & & \\ \cline{2-7} \hline
%\multirow{4}{*}{esc16a} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16b} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16c} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$& & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% & $50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16d} & $\sqrt{m}$& & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$& & & & & \\ \cline{2-7}
% &$50$ & & & & & \\ \hline
%\multirow{4}{*}{esc16e} & $\sqrt{m}$ & & & & & \\ \cline{2-7}
% & $10$ & & & & & \\ \cline{2-7}
% & $25$ & & & & & \\ \cline{2-7}
% & $50$& & & & & \\ \hline
%\end{tabular}
%\label{tb:qap}
%\caption{QAP bounds for the sparse QAPLIB data.}
%\end{table}
%\begin{figure*}[ht]
%% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
%\begin{center}
%{\includegraphics[width=.7\columnwidth]{figs/qap/r4esc16i}}
%{\includegraphics[width=.7\columnwidth]{figs/qap/r4esc64a}}
%%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
%\label{fig:qap}
%\caption{Convergence of feasibility(left) and feasible upper bound after rounding(right) for sparse quadratic assignment data from QAPLIB.}
%\end{center}
%\end{figure*}
%
%}
