appendix.tex
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, May 22, 09:30

appendix.tex
View Options

	%!TEX root = ../iALM_main.tex

	\section{Proof of Corollary~\ref{cor:first}}
	%Denote $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \\| A(x_k)\\|$ for every $k$. Using the convergence proof of the outer algorithm, we have the following bound

	Let $K$ denote the number of (outer) iterations of Algorithm~\ref{Algo:2} and let $\epsilon_{f}$ denote the desired accuracy of Algorithm~\ref{Algo:2}, see~(\ref{eq:inclu3}). Recalling Theorem~\ref{thm:main}, we can then write that
	\begin{equation}
	\epsilon_{f} = \frac{Q}{\b_{K}},
	\label{eq:acc_to_b}
	\end{equation}
	or, equivalently, $\b_{K} = Q/\epsilon_{f}$.
	%where $K$ denotes the last outer iterate and $\epsilon$ is the final accuracy we would like to get for the optimality conditions given in~\eqref{eq:inclu3}.
	We now count the number of total (inner) iterations $T$ of Algorithm~\ref{Algo:2} to reach the accuracy $\epsilon_{f}$. From \eqref{eq:smoothness of Lagrangian} and for sufficiently large $k$, recall that $\lambda_{\b_k}\le \lambda'' \b_k$ is the smoothness parameter of the augmented Lagrangian. Then, from \eqref{eq:iter_1storder} ad by summing over the outer iterations, we bound the total number of (inner) iterations of Algorithm~\ref{Algo:2} as
	\begin{align}\label{eq: tk_bound}
	T &= \sum_{k=1}^K\mathcal{O}\left ( \frac{\lambda_{\beta_{k-1}}^2 x_{\max}^2 }{\epsilon_k} \right) \nonumber\\
	& = \sum_{k=1}^K\mathcal{O}\left (\beta_{k-1}^3 x_{\max}^2 \right)
	\qquad \text{(Step 1 of Algorithm \ref{Algo:2})}
	\nonumber\\
	& \leq \mathcal{O} \left(K\beta_{K-1}^3 x_{\max}^2 \right)
	\qquad \left( \{\b_k\}_k \text{ is increasing} \right)
	\nonumber\\
	& \le \mathcal{O}\left( \frac{K Q^{{3}} x_{\max}^2}{\epsilon_{f}^{{3}}} \right).
	\qquad \text{(see \eqref{eq:acc_to_b})}
	\end{align}
	In addition, if we specify $\beta_k=b^k$ for all $k$, we can further refine $T$. Indeed,
	\begin{equation}
	\beta_K = b^K~~ \Longrightarrow~~ K = \log_b \left( \frac{Q}{\epsilon_f} \right),
	\end{equation}
	which, after substituting into~\eqref{eq: tk_bound} gives the final bound in Corollary~\ref{cor:first}.
	%\begin{equation}
	%T \leq \mathcal{O}\left( \frac{Q^{\frac{3}{2}+\frac{1}{2b}} x_{\max}^2}{\epsilon_f^{\frac{3}{2}+\frac{1}{2b}}} \right),
	%\end{equation}
	%which completes the proof of Corollary~\ref{cor:first}.

	%\section{Analysis of different rates for $\beta_k$ and $\epsilon_k$}
	%
	%We check the iteration complexity analysis by decoupling $\beta_k$ and $\epsilon_k$.
	%\begin{equation}
	%\beta_k = k^b, ~~~~ \epsilon_k = k^{-e}.
	%\end{equation}
	%
	%By denoting $B_k = \dist( -\nabla_x \L_{\b_{k-1}}(x_k,y_k),\partial g(x_k)) + \\| A(x_k)\\|$, the algorithm bound becomes,
	%
	%\begin{equation}
	%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e}.
	%\end{equation}
	%
	%Total iteration number is
	%
	%\begin{equation}
	%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1}.
	%\end{equation}
	%
	%We now consider two different relations between $b$ and $e$ to see what is going on.
	%
	%\textbf{Case 1:} $b\geq e$:
	%
	%Bound for the algorithm:
	%
	%\begin{equation}
	%B_k \leq \frac{1}{\beta_k} + \epsilon_k = k^{-b} + k^{-e} \leq \frac{2}{k^e} = \epsilon,
	%\end{equation}
	%
	%which gives the relation $K = \left( \frac{2}{\epsilon}\right)^{1/e}$.
	%Writing down the total number of iterations and plugging in $K$,
	%
	%\begin{equation}
	%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{\frac{2b}{e}+1+\frac{1}{e}}.
	%\end{equation}
	%
	%To get the least number of total iterations for a given accuracy $\epsilon$, one needs to pick $e$ as large as possible and $b$ as small as possible.
	%Since in this case we had $b\geq e$, this suggests picking $b=e$ for the optimal iteration complexity.
	%
	%\textbf{Case 2:} $b\leq e$:
	%
	%Same calculations yield the following bound on the total number of iterations:
	%
	%\begin{equation}
	%T_K = \sum_{k=1}^K \frac{\beta_k^2}{\epsilon_k} \leq K^{2b+e+1} \leq \left(\frac{2}{\epsilon}\right)^{2+\frac{e}{b}+\frac{1}{b}}.
	%\end{equation}
	%
	%Given that $b\leq e$, the bound suggests picking $e$ as small as possible and $b$ as big as possible.
	%
	%To minimize the total number of iterations in both cases with flexible $b$ and $e$, the bounds suggest to pick $b=e=\alpha$ and take this value to be as large as possible.



	\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
	Note that
	\begin{align}
	\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
	\end{align}
	which implies that
	\begin{align}
	& \nabla_x \mathcal{L}_\beta(x,y) \nonumber\\
	& = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \frac{\b}{2} \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
	& = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x),
	\end{align}
	where $DA(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach
	\begin{align}
	\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) \nonumber\\
	& \qquad +\b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
	\end{align}
	It follows that
	\begin{align}
	& \\|\nabla_x^2 \mathcal{L}_\beta(x,y)\\|\nonumber\\
	& \le \\| \nabla^2 f(x) \\| + \max_i \\| \nabla^2 A_i(x)\\| \left (\\|y\\|_1+\b \\|A(x)\\|_1 \right) \nonumber\\
	& \qquad +\beta\sum_{i=1}^m \\|\nabla A_i(x)\\|^2 \nonumber\\
	& \le \lambda_h+ \sqrt{m} \lambda_A \left (\\|y\\|+\b \\|A(x)\\| \right) + \b \\|DA(x)\\|^2_F.
	\end{align}
	For every $x$ such that $\\|A(x)\\|\le \rho$ and $\\|x\\|\le \rho'$, we conclude that
	\begin{align}
	\\|\nabla_x^2 \mathcal{L}_\beta(x,y)\\|
	& \le \lambda_f + \sqrt{m}\lambda_A \left(\\|y\\| + \b\rho \right) \nonumber\\
	& \qquad + \b \max_{\\|x\\|\le \rho'}\\|DA(x)\\|_F^2,
	\end{align}
	which completes the proof of Lemma \ref{lem:smoothness}.

	%\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
	%
	%Throughout, let
	%\begin{align}
	%G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
	%\end{align}
	%for short.
	%Suppose that $\\|A(x)\\|\le \rho$, $\\|x\\|\le \rho$, and similarly $\\|A(x^+)\\|\le \rho$, $\\|x^+\\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
	%\begin{align}
	%\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle
	%+ \frac{\lambda_\b}{2} \\|x^+ - x\\|^2 + g(x^+) \nonumber\\
	%& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle
	%+ \frac{\g^2 \lambda_\b }{2} \\|G\\|^2 + g(x^+)
	%\label{eq:descent pr 1}
	%\end{align}
	%Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
	%\begin{align}
	%\g (G - \nabla_x \L_\b(x,y)) = \xi \in \partial g(x^+).
	%\label{eq:opt of prox}
	%\end{align}
	%By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
	%\begin{align}
	%\L_\b(x^+,y)+g(x^+) &
	%\le \L_\b(x,y) -\g \\|G\\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\\|G\\|^2 + g(x^+) \nonumber\\
	%& = \L_\b(x,y) -\g \\|G\\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\\|G\\|^2 + g(x^+) \nonumber\\
	%& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \\|G\\|^2,
	%\end{align}
	%where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.
	%
	%
	%\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
	%
	%
	%Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that
	%\begin{equation}
	%x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = -\xi \in -\partial g (x^+_{\gamma}).
	%\label{eq:optimality of uplus}
	%\end{equation}
	%Lastly, $\gamma$ by definition in \eqref{eq:defn of gamma line search} satisfies
	%\begin{align}
	%& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
	% & \le \mathcal{L}_\beta(x,y) + \g \left\langle
	%x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y)
	%\right\rangle + \frac{1}{2\gamma}\\|x^+_{\gamma} - x\\|^2
	%+ g(x_\g^+)
	%\nonumber\\
	%& = \mathcal{L}_\beta(x,y) + \left\langle
	%x - x^+_{\gamma} ,\xi
	%\right\rangle
	%- \frac{1}{2\gamma}\\|x^+_{\gamma} - x\\|^2 + g(x_\g^+)\nonumber\\
	%& \le \mathcal{L}_\beta(x,y)
	%- \frac{1}{2\gamma}\\|x^+_{\gamma} - x\\|^2 + g(x) - g(x^+_\g)
	%\qquad (\text{convexity of } g) \nonumber\\
	%& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \\|G_{\beta,\gamma}(x,y)\\|^2 +g(x) - g(x^+_\g),
	%\qquad \text{(see Definition \ref{def:grad map})}
	%\end{align}
	%which completes the proof of Lemma \ref{lem:eval Lipsc cte}.
	%
	%
	%


	\section{Basis Pursuit \label{sec:verification1}}

	We only verify the regularity condition in \eqref{eq:regularity} for \eqref{prob:01} with $f,A$ specified in \eqref{eq:bp-equiv}. Note that
	\begin{align*}
	DA(x) = 2 \overline{B} \text{diag}(x),
	\end{align*}
	where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$. The left-hand side of \eqref{eq:regularity} then reads as
	\begin{align}
	& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\
	& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \nonumber\\
	& = \\|DA(x_k)^\top A(x_k) \\| \nonumber\\
	& =2 \\| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \\|.
	\label{eq:cnd-bp-pre}
	\end{align}
	To bound the last line above, let $x_$ be a solution of Program~\eqref{prob:01} and note that $\overline{B} x_^{\circ 2} = b $ by definition. Let also $z_k,z_\in \RR^d$ denote the vectors corresponding to $x_k,x_$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $\|z_k\| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank.
	We then rewrite the last line of \eqref{eq:cnd-bp-pre} as
	\begin{align}
	& \\| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \\|^2 \nonumber\\
	& = \\| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \\|^2 \nonumber\\
	& = \\| \text{diag}(x_k)\overline{B}^\top B (x_k - x_*) \\|^2 \nonumber\\
	& = \\| \text{diag}(u_{k,1})B^\top B (x_k - x_*) \\|^2 \nonumber\\
	& \qquad + \\| \text{diag}(u_{k,2})B^\top B (x_k - x_*) \\|^2 \nonumber\\
	& = \\| \text{diag}(u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2}) B^\top B (x_k - x_*) \\|^2 \nonumber\\
	& = \\| \text{diag}(\|z_k\|) B^\top B (x_k - x_*) \\|^2 \nonumber\\
	& \ge
	\eta_n ( B \text{diag}(\|z_k\|) )^2
	\\| B(x_k - x_*) \\|^2 \nonumber\\
	& =
	\eta_n ( B \text{diag}(\|z_k\|) )^2
	\\| B x_k -b \\|^2,
	\end{align}
	where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. We can therefore ensure that \eqref{eq:regularity} holds by enforcing that
	\begin{align}
	z_k \in \left\{ z \in \RR^d: \eta_n( B \text{diag}(\|z\|)) > \nu \right\},
	\end{align}
	for every iteration $k$.


	\section{Clustering \label{sec:verification2}}


	We only verify the condition in~\eqref{eq:regularity}.
	Note that
	\begin{align}
	A(x) = VV^\top \mathbf{1}- \mathbf{1},
	\end{align}
	\begin{align}
	DA(x) & =
	\left[
	\begin{array}{cccc}
	w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\
	%x_2^\top p& \cdots & x_{2}^\top\\
	\vdots\\
	w_{n,1}x_{n}^\top & \cdots & w_{n,n}1 x_{n}^\top
	\end{array}
	\right] \nonumber\\
	& = \left[
	\begin{array}{ccc}
	V & \cdots & V
	\end{array}
	\right]
	+
	\left[
	\begin{array}{ccc}
	x_1^\top & \\
	& \ddots & \\
	& & x_n^\top
	\end{array}
	\right],
	\label{eq:Jacobian clustering}
	\end{align}
	%where $I_n\in\RR^{n\times n}$ is the identity matrix,
	where $w_{i.i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix above is block-diagonal. For $x_k$, define $V_k$ as in the example and let $x_{k,i}$ be the $i$th row of $V_k$.
	Consequently,
	\begin{align}
	DA(x_k)^\top A(x_k) & =
	\left[
	\begin{array}{c}
	V_k^\top (V_k V_k^\top- I_n ) \mathbf{1}\\
	\vdots\\
	V_k^\top (V_k V_k^\top- I_n ) \mathbf{1}
	\end{array}
	\right] \nonumber\\
	& \qquad +
	\left[
	\begin{array}{c}
	x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
	\vdots \\
	x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
	\end{array}
	\right],
	\end{align}
	where $I_n\in \RR^{n\times n}$ is the identity matrix.
	Let us make a number of simplifying assumptions. First, we assume that $x_k\in \text{relint}(C)$ so that $\partial g(x_k)=\{0\}$, which can be easily enforced in the iterates. Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k V_k^\top \approx I_n$. This can also be easily enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the derivations. Under these assumptions, the squared right-hand side of \eqref{eq:regularity} becomes
	\begin{align}
	& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\
	& = \dist\left( -DA(x_k)^\top A(x_k) , \{0\} \right)^2 \nonumber\\
	& = \\| DA(x_k)^\top A(x_k) \\|^2 \nonumber\\
	& =
	\left\\|
	\begin{array}{c}
	x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
	\vdots \\
	x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
	\end{array}
	\right\\|^2 \nonumber\\
	& = \sum_{i=1}^n \\| x_{k,i}\\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
	& \ge \min_i \\| x_{k,i}\\|^2
	\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
	& = \min_i \\| x_{k,i}\\|^2
	\cdot \\| V_kV_k^\top \mathbf{1}-\mathbf{1} \\|^2.
	\end{align}
	We can enforce the iterates to satisfy $\\|x_{k,i}\\| \ge \nu$, which corresponds again to well-separated clusters, and guarantee \eqref{eq:regularity}. In practice, often $n$ exceeds the number of true clusters and a more fine-tuned analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$.


	\section{$\ell_\infty$ denoising with a generative prior}
	The authors of \cite{Ilyas2017} have proposed to
	project onto the range of a Generative Adversarial network (GAN)
	\cite{Goodfellow2014}, as a way to defend against adversarial examples. For a
	given noisy observation $x^* + \eta$, they consider a projection in the
	$\ell_2$ norm. We instead propose to use our augmented Lagrangian method to
	denoise in the $\ell_\infty$ norm, a much harder task:
	\begin{align}
	\begin{array}{lll}
	\underset{x, z}{\text{min}} & & \\|x^* + \eta - x\\|_\infty \\
	\text{s.t. } && x=G(z).
	\end{array}
	\end{align}
	We use a pretrained generator for the MNIST dataset, given by a standard
	deconvolutional neural network architecture. We compare the succesful optimizer
	Adam against our method. Our algorithm involves two forward/backward passes
	through the network, as oposed to Adam that requires only one. For this reason
	we let our algorithm run for 4000 iterations, and Adam for 8000 iterations.
	For a particular example, we plot the objective value vs iteration count in figure
	\ref{fig:comparison_fab}. Our method successfully minimizes the objective value,
	while Adam does not succeed.

	\begin{figure*}[ht]
	% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
	\begin{center}
	{\includegraphics[width=.7\columnwidth]{figs/example_denoising_fab.pdf}}
	%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
	\label{fig:comparison_fab}
	\caption{Augmented Lagrangian vs Adam for $\ell_\infty$ denoising (left). $\ell_2$ vs $\ell_\infty$ denoising as defense against adversarial examples}
	\end{center}
	\end{figure*}

appendix.texNo OneTemporaryActions

File Metadata

appendix.texView Options

Event Timeline

appendix.tex
No OneTemporary
Actions

appendix.tex
View Options