Page MenuHomec4science

appendix.tex
No OneTemporary

File Metadata

Created
Wed, May 22, 09:30

appendix.tex

%!TEX root = ../iALM_main.tex
\section{Proof of Corollary~\ref{cor:first}}
%Denote $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$ for every $k$. Using the convergence proof of the outer algorithm, we have the following bound
Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that
\begin{equation}
\epsilon_{f} = \frac{Q}{\b_{K}},
\label{eq:acc_to_b}
\end{equation}
or, equivalently, $\b_{K} = Q/\epsilon_{f}$.
%where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}.
We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} ad by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as
\begin{align}\label{eq: tk_bound}
T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 x_{\max}^2 }{\epsilon_k} \right) \nonumber\\
& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 x_{\max}^2 \right)
\qquad \text{(Step 1 of Algorithm \ref{Algo:2})}
\nonumber\\
& \leq \mathcal{O} \left(K\beta_{K-1}^3 x_{\max}^2 \right)
\qquad \left( \{\b_k\}_k \text{ is increasing} \right)
\nonumber\\
& \le \mathcal{O}\left( \frac{K Q^{{3}} x_{\max}^2}{\epsilon_{f}^{{3}}} \right).
\qquad \text{(see \eqref{eq:acc_to_b})}
\end{align}
In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed,
\begin{equation}
\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right),
\end{equation}
which, after substituting into~\eqref{eq: tk_bound} gives the final bound in Corollary~\ref{cor:first}.
%\begin{equation}
%T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right),
%\end{equation}
%which completes the proof of Corollary~\ref{cor:first}.
%\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$}
%
%We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$.
%\begin{equation}
%\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}.
%\end{equation}
%
%By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \| A(x_k)\|$, the algorithm bound becomes,
%
%\begin{equation}
%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}.
%\end{equation}
%
%Total iteration number is
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}.
%\end{equation}
%
%We now consider two different relations between $b$ and $e$ to see what is going on.
%
%\textbf{Case 1:} $b\geq e$:
%
%Bound for the algorithm:
%
%\begin{equation}
%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon,
%\end{equation}
%
%which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$.
%Writing down the total number of iterations and plugging in $K$,
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}.
%\end{equation}
%
%To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible.
%Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity.
%
%\textbf{Case 2:} $b\leq e$:
%
%Same calculations yield the following bound on the total number of iterations:
%
%\begin{equation}
%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}.
%\end{equation}
%
%Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible.
%
%To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible.
\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
Note that
\begin{align}
\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
\end{align}
which implies that
\begin{align}
& \nabla_x \mathcal{L}_\beta(x,y) \nonumber\\
& = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \frac{\b}{2} \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
& = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x),
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach
\begin{align}
\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) \nonumber\\
& \qquad +\b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
\end{align}
It follows that
\begin{align}
& \|\nabla_x^2 \mathcal{L}_\beta(x,y)\|\nonumber\\
& \le \| \nabla^2 f(x) \| + \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) \nonumber\\
& \qquad +\beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\
& \le \lambda_h+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F.
\end{align}
For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\|
& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) \nonumber\\
& \qquad + \b \max_{\|x\|\le \rho'}\|DA(x)\|_F^2,
\end{align}
which completes the proof of Lemma \ref{lem:smoothness}.
%\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
%
%Throughout, let
%\begin{align}
%G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
%\end{align}
%for short.
%Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
%\begin{align}
%\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle
%+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\
%& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle
%+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+)
%\label{eq:descent pr 1}
%\end{align}
%Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
%\begin{align}
%\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+).
%\label{eq:opt of prox}
%\end{align}
%By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
%\begin{align}
%\L_\b(x^+,y)+g(x^+) &
%\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
%& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
%& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2,
%\end{align}
%where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.
%
%
%\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
%
%
%Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that
%\begin{equation}
%x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}).
%\label{eq:optimality of uplus}
%\end{equation}
%Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies
%\begin{align}
%& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
% & \le \mathcal{L}_\beta(x,y) + \g \left\langle
%x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y)
%\right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2
%+ g(x_\g^+)
%\nonumber\\
%& = \mathcal{L}_\beta(x,y) + \left\langle
%x - x^+_{\gamma} ,\xi
%\right\rangle
%- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\
%& \le \mathcal{L}_\beta(x,y)
%- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g)
%\qquad (\text{convexity of } g) \nonumber\\
%& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g),
%\qquad \text{(see Definition \ref{def:grad map})}
%\end{align}
%which completes the proof of Lemma \ref{lem:eval Lipsc cte}.
%
%
%
\section{Basis Pursuit \label{sec:verification1}}
We only verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A$ specified in \eqref{eq:bp-equiv}. Note that
\begin{align*}
DA(x) = 2 \overline{B} \text{diag}(x),
\end{align*}
where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as
\begin{align}
& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\
& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \nonumber\\
& = \|DA(x_k)^\top A(x_k) \| \nonumber\\
& =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|.
\label{eq:cnd-bp-pre}
\end{align}
To bound the last line above, let $x_*$ be a solution of Program~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank.
We then rewrite the last line of \eqref{eq:cnd-bp-pre} as
\begin{align}
& \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\
& = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2 \nonumber\\
& = \| \text{diag}(x_k)\overline{B}^\top B (x_k - x_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1})B^\top B (x_k - x_*) \|^2 \nonumber\\
& \qquad + \| \text{diag}(u_{k,2})B^\top B (x_k - x_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (x_k - x_*) \|^2 \nonumber\\
& = \| \text{diag}(|z_k|) B^\top B (x_k - x_*) \|^2 \nonumber\\
& \ge
\eta_n ( B \text{diag}(|z_k|) )^2
\| B(x_k - x_*) \|^2 \nonumber\\
& =
\eta_n ( B \text{diag}(|z_k|) )^2
\| B x_k -b \|^2,
\end{align}
where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. We can therefore ensure that \eqref{eq:regularity} holds by enforcing that
\begin{align}
z_k \in \left\{ z \in \RR^d: \eta_n( B \text{diag}(|z|)) > \nu \right\},
\end{align}
for every iteration $k$.
\section{Clustering \label{sec:verification2}}
We only verify the condition in~\eqref{eq:regularity}.
Note that
\begin{align}
A(x) = VV^\top \mathbf{1}- \mathbf{1},
\end{align}
\begin{align}
DA(x) & =
\left[
\begin{array}{cccc}
w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\
%x_2^\top p& \cdots & x_{2}^\top\\
\vdots\\
w_{n,1}x_{n}^\top & \cdots & w_{n,n}1 x_{n}^\top
\end{array}
\right] \nonumber\\
& = \left[
\begin{array}{ccc}
V & \cdots & V
\end{array}
\right]
+
\left[
\begin{array}{ccc}
x_1^\top & \\
& \ddots & \\
& & x_n^\top
\end{array}
\right],
\label{eq:Jacobian clustering}
\end{align}
%where $I_n\in\RR^{n\times n}$ is the identity matrix,
where $w_{i.i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix above is block-diagonal. For $x_k$, define $V_k$ as in the example and let $x_{k,i}$ be the $i$th row of $V_k$.
Consequently,
\begin{align}
DA(x_k)^\top A(x_k) & =
\left[
\begin{array}{c}
V_k^\top (V_k V_k^\top- I_n ) \mathbf{1}\\
\vdots\\
V_k^\top (V_k V_k^\top- I_n ) \mathbf{1}
\end{array}
\right] \nonumber\\
& \qquad +
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right],
\end{align}
where $I_n\in \RR^{n\times n}$ is the identity matrix.
Let us make a number of simplifying assumptions. First, we assume that $x_k\in \text{relint}(C)$ so that $\partial g(x_k)=\{0\}$, which can be easily enforced in the iterates. Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k V_k^\top \approx I_n$. This can also be easily enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the derivations. Under these assumptions, the squared right-hand side of \eqref{eq:regularity} becomes
\begin{align}
& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\
& = \dist\left( -DA(x_k)^\top A(x_k) , \{0\} \right)^2 \nonumber\\
& = \| DA(x_k)^\top A(x_k) \|^2 \nonumber\\
& =
\left\|
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right\|^2 \nonumber\\
& = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& \ge \min_i \| x_{k,i}\|^2
\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& = \min_i \| x_{k,i}\|^2
\cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2.
\end{align}
We can enforce the iterates to satisfy $\|x_{k,i}\| \ge \nu$, which corresponds again to well-separated clusters, and guarantee \eqref{eq:regularity}. In practice, often $n$ exceeds the number of true clusters and a more fine-tuned analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$.
\section{$\ell_\infty$ denoising with a generative prior}
The authors of \cite{Ilyas2017} have proposed to
project onto the range of a Generative Adversarial network (GAN)
\cite{Goodfellow2014}, as a way to defend against adversarial examples. For a
given noisy observation $x^* + \eta$, they consider a projection in the
$\ell_2$ norm. We instead propose to use our augmented Lagrangian method to
denoise in the $\ell_\infty$ norm, a much harder task:
\begin{align}
\begin{array}{lll}
\underset{x, z}{\text{min}} & & \|x^* + \eta - x\|_\infty \\
\text{s.t. } && x=G(z).
\end{array}
\end{align}
We use a pretrained generator for the MNIST dataset, given by a standard
deconvolutional neural network architecture. We compare the succesful optimizer
Adam against our method. Our algorithm involves two forward/backward passes
through the network, as oposed to Adam that requires only one. For this reason
we let our algorithm run for 4000 iterations, and Adam for 8000 iterations.
For a particular example, we plot the objective value vs iteration count in figure
\ref{fig:comparison_fab}. Our method successfully minimizes the objective value,
while Adam does not succeed.
\begin{figure*}[ht]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
\begin{center}
{\includegraphics[width=.7\columnwidth]{figs/example_denoising_fab.pdf}}
%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
\label{fig:comparison_fab}
\caption{Augmented Lagrangian vs Adam for $\ell_\infty$ denoising (left). $\ell_2$ vs $\ell_\infty$ denoising as defense against adversarial examples}
\end{center}
\end{figure*}

Event Timeline