diff --git a/ICML19/Arxiv version/sections/experiments.tex b/ICML19/Arxiv version/sections/experiments.tex index 575c118..71acba6 100644 --- a/ICML19/Arxiv version/sections/experiments.tex +++ b/ICML19/Arxiv version/sections/experiments.tex @@ -1,398 +1,398 @@
%!TEX root = ../iALM_main.tex
\section{Numerical evidence \label{sec:experiments}}

We begin with a caveat: it is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for non-convex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBFGS are extremely successful, and we therefore opt for these solvers in this section; the choice of subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver performs well in practice.

\subsection{k-Means Clustering}

Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $. Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as
\begin{align}
\begin{cases}
\underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \\
Y\mathbf{1} = \mathbf{1}, ~\text{tr}(Y) = k,~ Y\succeq 0,~Y \geq 0,
\end{cases}
\label{eq:sdp_svx}
\end{align}
where $k$ is the number of clusters and $Y$ is both positive semidefinite and has nonnegative entries. Standard SDP solvers do not scale well with the number of data points~$n$, since they often require a projection onto the semidefinite cone with complexity $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro splitting, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
\begin{align}
\label{eq:nc_cluster}
\begin{cases}
\underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \\
VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le k, ~~V \geq 0,
\end{cases}
\end{align}
where $\mathbf{1}\in \RR^n$ is the vector of all ones. Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier-to-enforce constraint $V \geq 0$; see~\cite{kulis2007fast} for the reasoning behind this relaxation. The trace constraint translates to a Frobenius norm constraint in the factorized space, and the semidefinite constraint is naturally removed by the factorization $Y=VV^{\top}$.

Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely,
\begin{align*}
x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
\end{align*}
and then set
\begin{align*}
f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle, \qquad g = \delta_C,
\end{align*}
\begin{align}
A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top},
\end{align}
where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{k}$.
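For concreteness, the following is a minimal Python/NumPy sketch of the maps $f$ and $A$ above and of the projection onto $C$ (used by the proximal step for $g=\delta_C$); the function names are illustrative only and this is not the MATLAB code used in our experiments.
\begin{verbatim}
import numpy as np

# Illustrative sketch: V is the n-by-r matrix whose rows are the
# blocks x_1, ..., x_n of x.

def f(V, D):
    # objective tr(D V V^T) = sum_{i,j} D_ij <x_i, x_j>
    return np.trace(D @ V @ V.T)

def A(V):
    # constraint map: i-th entry is <x_i, sum_j x_j> - 1, i.e. V V^T 1 - 1
    return V @ (V.T @ np.ones(V.shape[0])) - 1.0

def project_onto_C(V, k):
    # exact projection onto the positive orthant intersected with the
    # Frobenius-norm ball of radius sqrt(k): clip, then rescale if needed
    W = np.maximum(V, 0.0)
    nrm = np.linalg.norm(W, 'fro')
    if nrm > np.sqrt(k):
        W *= np.sqrt(k) / nrm
    return W
\end{verbatim}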
In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above.

In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of the BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem. We compare our approach against the following convex methods:

$\triangleright$ HCGM: Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional}, which directly solves~\eqref{eq:sdp_svx}. \\[-4mm]

$\triangleright$ SDPNAL+: A second-order augmented Lagrangian method for solving SDPs with nonnegativity constraints~\cite{yang2015sdpnal}.

As for the dataset, our experimental setup is similar to that described in~\cite{mixon2016clustering}. We use the publicly available fashion-MNIST data in \cite{xiao2017/online}, which is released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ gray-scale image, associated with a label from ten classes, labeled from $0$ to $9$. First, we extract meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability of the corresponding class. Finally, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}. We implemented three of the algorithms in MATLAB and used the software package for SDPNAL+, which contains mex files; the convergence of the nonconvex approach would be even faster with a mex implementation.

\begin{figure}[]
\begin{center}
{\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}}
{\includegraphics[width=.4\columnwidth]{figs/cluster_feas_time.pdf}}
\caption{Convergence of different algorithms for k-Means clustering with the fashion-MNIST dataset. The solution rank for the template~\eqref{eq:sdp_svx} is known and is equal to the number of clusters $k$ (Theorem~1 in \cite{kulis2007fast}). As discussed in~\cite{tepper2018clustering}, setting the rank $r>k$ leads to a more accurate reconstruction at the expense of speed. Therefore, we set the rank to 20.}
\label{fig:clustering}
\end{center}
\end{figure}
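The distance matrix used in the clustering experiment above is straightforward to form; the following is a small illustrative NumPy sketch (with hypothetical names) computing $D_{ij}=\|z_i-z_j\|^2$ from the rows of a feature matrix.
\begin{verbatim}
import numpy as np

def squared_distance_matrix(Z):
    # D_ij = ||z_i - z_j||^2 for the rows z_i of Z (here, the length-10
    # probability vectors produced by the two-layer network)
    sq = np.sum(Z ** 2, axis=1)
    D = sq[:, None] + sq[None, :] - 2.0 * Z @ Z.T
    return np.maximum(D, 0.0)  # clip tiny negatives due to round-off
\end{verbatim}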
\subsection{Generalized eigenvalue problem}

The generalized eigenvalue problem has extensive applications in machine learning, statistics and data analysis~\cite{ge2016efficient}. The well-known nonconvex formulation of the problem~\cite{boumal2016non} is
\begin{align}
\begin{cases}
\underset{x\in\mathbb{R}^n}{\min} x^\top C x \\
x^\top B x = 1,
\end{cases}
\label{eq:eig}
\end{align}
where $B, C \in \RR^{n \times n}$ are symmetric matrices and $B$ is positive definite, i.e., $B \succ 0$. The generalized eigenvector computation is equivalent to performing principal component analysis (PCA) of $C$ with respect to the norm induced by $B$. Moreover, it is also equivalent to computing the top eigenvector of the symmetric matrix $S = B^{-1/2}CB^{-1/2}$ and multiplying the resulting vector by $B^{-1/2}$. However, for sufficiently large $n$, computing $B^{-1/2}$ is extremely expensive. The natural convex SDP relaxation of~\eqref{eq:eig} lifts $Y = xx^\top$ and removes the non-convex constraint $\mathrm{rank}(Y) = 1$:
\begin{align}
\begin{cases}
\underset{Y \in \RR^{n \times n}}{\min} \text{tr}(CY)\\
\text{tr}(BY) = 1, \quad Y \succeq 0.
\end{cases}
\label{eq:eig-sdp}
\end{align}
Here, we solve~\eqref{eq:eig} directly since it fits into our template with
\begin{align}
f(x) =& x^\top C x, \quad g(x) = 0\nonumber\\
A(x) =& x^\top B x - 1.
\label{eq:eig-equiv}
\end{align}
We compare our approach against three different methods: the manifold-based Riemannian gradient descent and Riemannian trust-region methods in~\cite{boumal2016global}, and the generalized eigenvector via linear system solver (abbreviated as GenELin) in~\cite{ge2016efficient}. We used the Manopt software package~\cite{manopt} for the manifold-based methods, and for GenELin we used Matlab's backslash operator as the linear solver. The results are compiled in Figure~\ref{fig:geig1}.
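To make the augmented Lagrangian treatment of \eqref{eq:eig-equiv} concrete, the following Python/NumPy sketch (illustrative only; our experiments use MATLAB) evaluates $f$, $A$ and the gradient in $x$ of the standard augmented Lagrangian term $f(x) + y\,A(x) + \tfrac{\beta}{2}A(x)^2$.
\begin{verbatim}
import numpy as np

# C symmetric, B symmetric positive definite; names are illustrative.
def f(x, C):
    # objective x^T C x
    return x @ C @ x

def A(x, B):
    # scalar constraint x^T B x - 1
    return x @ B @ x - 1.0

def augmented_lagrangian_grad(x, C, B, y, beta):
    # gradient of f(x) + y * A(x) + (beta/2) * A(x)^2 with respect to x:
    # 2 C x + (y + beta * A(x)) * 2 B x
    return 2.0 * C @ x + (y + beta * A(x, B)) * 2.0 * B @ x
\end{verbatim}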
\begin{figure}[h!]
\begin{tabular}{l|l|l}
~~~~~~~~~(i) $C:$ Gaussian iid & ~~~~~(ii) $C:$ Polynomial decay & ~~~~~(iii) $C:$ Exponential decay \\
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case1_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case2_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case3_convergence.pdf}\\
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case1_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case2_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case3_eigenvalues.pdf} \\
\hline
~~~~~~~~~~~~~~~~~~~~(iv) & ~~~~~~~~~~~~~~~~~~~~(v) & ~~~~~~~~~~~~~~~~~~~(vi) \\
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case4_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case6_convergence.pdf}&
\includegraphics[width=.3\columnwidth]{figs/gen_eig/gen_eig_case7_convergence.pdf}\\
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case4_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case6_eigenvalues.pdf} &
\includegraphics[width=.31\columnwidth]{figs/gen_eig/gen_eig_case7_eigenvalues.pdf}
\end{tabular}
\caption{{\it{(Top)}} Objective convergence for computing the top generalized eigenvalue and eigenvector of $B$ and $C$. {\it{(Bottom)}} Eigenvalue structure of the matrices. For (i), (ii) and (iii), $C$ is positive semidefinite; for (iv), (v) and (vi), $C$ contains negative eigenvalues. {[(i): Generated by taking the symmetric part of an iid Gaussian matrix. (ii): Generated by randomly rotating diag($1^{-p}, 2^{-p}, \cdots, 1000^{-p}$) ($p=1$). (iii): Generated by randomly rotating diag($10^{-p}, 10^{-2p}, \cdots, 10^{-1000p}$) ($p=0.0025$).]}}
\label{fig:geig1}
\end{figure}

We synthetically generate the real and symmetric matrices $B$ and $C$ with $n = 10^3$. The main assumption in \eqref{eq:eig} is that $B$ is positive definite; therefore, we generated $B$ with positive (repeated) eigenvalues in all the experiments, and we analyze different eigenvalue structures for $C$. The second and fourth rows of Figure~\ref{fig:geig1} depict the corresponding eigenvalue structures, where $\lambda_i^C$ and $\lambda_i^B$ denote the $i$th largest eigenvalues in magnitude of $C$ and $B$, respectively. Our augmented Lagrangian based method for the generalized eigenvalue problem outperforms the first-order Riemannian gradient descent and GenELin in most of the cases. It is promising to see that such a general method can perform well without exploiting any structural property of the constraint set, as manifold-based methods do.

\subsection{Basis Pursuit}

Basis Pursuit (BP) finds the sparsest solution of an under-determined system of linear equations, namely,
\begin{align}
\begin{cases}
\min_{z} \|z\|_1 \\
Bz = b,
\end{cases}
\label{eq:bp_main}
\end{align}
where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A large number of primal-dual convex optimization algorithms have been proposed to solve BP, including, but not limited to,~\cite{tran2018adaptive,chambolle2011first}. There is also a long line of work \cite{beck2009fast} that handles the sparse regression problem via $\ell_1$ regularization.

Here, we take a different approach and cast~\eqref{eq:bp_main} as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power.
Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with
\begin{align}
f(x) =& \|x\|_2^2, \quad g(x) = 0\nonumber\\
A(x) =& \overline{B}x^{\circ 2}- b.
\label{eq:bp-equiv}
\end{align}
In Appendix~\ref{sec:verification1}, we briefly verify that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$.

We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ and its nonzero amplitudes are also drawn from the standard Gaussian distribution. The measurement vector is then created as $b = Bz_* + \epsilon$, where $\epsilon$ is a noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.
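For concreteness, the maps in \eqref{eq:bp-equiv} and the synthetic data described above can be sketched as follows in Python/NumPy; the names and the random seed are illustrative only, and this is not the code used in our experiments.
\begin{verbatim}
import numpy as np

def f(x):
    # ||x||_2^2 = sum(u1^2 + u2^2), i.e. ||z||_1 when u1^2, u2^2 are
    # the positive/negative parts of z
    return np.sum(x ** 2)

def A(x, B_bar, b):
    # constraint residual [B, -B] x^{o2} - b
    return B_bar @ (x ** 2) - b

def generate_data(n, d, k, sigma=1e-3, rng=np.random.default_rng(0)):
    # Gaussian sensing matrix, k-sparse ground truth, small Gaussian noise
    B = rng.standard_normal((n, d))
    z_star = np.zeros(d)
    support = rng.choice(d, size=k, replace=False)
    z_star[support] = rng.standard_normal(k)
    b = B @ z_star + sigma * rng.standard_normal(n)
    return B, z_star, b, np.hstack([B, -B])   # last output is B_bar
\end{verbatim}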
-\begin{figure}[]
+\begin{figure}[h!]
\begin{center}
{\includegraphics[width=.7\columnwidth]{figs/bp_fig1_subsolvers.pdf}}
\caption{Convergence with different subsolvers for the aforementioned non-convex relaxation.}
\label{fig:bp1}
\end{center}
\end{figure}

Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is interesting to see that this type of non-convex relaxation recovers the solution of the convex program and that first-order methods succeed.

\paragraph{Discussion:} The true potential of our reformulation lies in dealing with norms more structured than $\ell_1$, for which computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
\begin{align*}
\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
\end{align*}
where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.

\subsection{Adversarial Denoising with GANs}
\label{exp:ellinf}

Projection onto the range of a neural network $G:\mathbb{R}^s \to \mathbb{R}^d$ has been considered by~\cite{Samangouei2018, Ilyas2017} as a defense mechanism against adversarial examples~\cite{Szegedy2013}. In their settings, samples are denoised by projecting onto the range of a pre-trained \textit{generator}, from either the GAN \cite{Goodfellow2014} or VAE framework \cite{Kingma2013}, before being fed to a classifier. Even though the adversarial noise introduced is typically bounded in the $\ell_\infty$ norm \cite{Madry2018}, the projection is performed in the $\ell_2$ metric. We instead propose to project directly in the $\ell_\infty$ norm that limits the attacker, i.e., we solve the program
\begin{align}
\begin{cases}
\min_{x, z} \|x - x^\natural \|_\infty \\
G(z) = x,
\end{cases}
\label{eq:darn}
\end{align}
with our proposed Algorithm~\ref{Algo:2} (AL). On the optimization side, projections are usually performed by solving, for a particular choice of norm $\| \cdot \|$, the non-convex program
\begin{equation}
\min_{z} \|G(z) - x^\natural \|
\label{eq:darn2}
\end{equation}
using off-the-shelf optimizers like gradient descent (GD) or ADAM \cite{Kingma2014}. We assign an equal computational budget to ADAM and GD for solving \eqref{eq:darn2} with the $\ell_\infty$ norm, and to our algorithm (AL) for solving \eqref{eq:darn}. We choose a test image in the range of the generator $G$ by sampling $z$ at random and setting $x^\natural = G(z) + \eta$ for $\eta$ a random noise bounded in the $\ell_\infty$ norm. For each algorithm, we allow two random restarts.

\textbf{Results and discussion. } For a noise level $\|\eta\|_\infty \leq 0.1$, we plot the objective function value across iterations (for our algorithm we plot the value $\|G(z_t) - x^\natural\|_\infty$). The MNIST images we use are normalized to the range $[0, 1]$, and we plot the mean value across 6 different random images. We observe that our algorithm (AL) is the only one capable of consistently decreasing the value of the objective function, while ADAM only does so slightly. Even after extensive tuning of the GD step size, we could not observe any improvement in its objective value.
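To illustrate how \eqref{eq:darn} can be tackled with an augmented Lagrangian scheme, the following PyTorch sketch runs a generic outer loop with ADAM as the inner solver; the routine, its penalty schedule and all parameter values are placeholders for illustration and do not reproduce the exact configuration of Algorithm~\ref{Algo:2} used in our experiments.
\begin{verbatim}
import torch

def al_linf_projection(G, x_nat, z_dim, beta=1.0,
                       outer_iters=20, inner_iters=200, lr=1e-2):
    # Illustrative augmented Lagrangian loop for
    #   min_{x,z} ||x - x_nat||_inf  s.t.  G(z) = x,
    # where G is any differentiable generator mapping R^{z_dim} -> R^d.
    z = torch.randn(z_dim, requires_grad=True)
    x = x_nat.clone().detach().requires_grad_(True)
    y = torch.zeros_like(x_nat)                 # dual variable for G(z) = x
    for _ in range(outer_iters):
        opt = torch.optim.Adam([x, z], lr=lr)
        for _ in range(inner_iters):            # approximate primal subproblem
            opt.zero_grad()
            r = G(z) - x                        # constraint residual
            loss = ((x - x_nat).abs().max()
                    + (y * r).sum() + 0.5 * beta * (r ** 2).sum())
            loss.backward()
            opt.step()
        with torch.no_grad():
            y = y + beta * (G(z) - x)           # dual ascent step
        beta = beta * 2.0                       # heuristic penalty schedule
    return x.detach(), z.detach()
\end{verbatim}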
\begin{figure}[h] \centering \includegraphics[width=.35\columnwidth]{figs/12000_mnist_-1_denoise_perf_comparison} %%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} \caption{$\ell_\infty$ measurement error vs iterations} \label{fig:attack_error_time} \end{figure} %\begin{figure}[h] % \centering %\includegraphics[width=.4\columnwidth]{figs/attack_error_time_clean.pdf} %%%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}} %\caption{Error rate on denoised adversarial examples vs time} %\label{fig:attack_error_time} %\end{figure} %\cite{•}. %\subsection{Latent Group Lasso \label{sec:latent lasso}} % %For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to %\begin{align} %\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|. %\end{align} %Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as %\begin{align} %\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}. %\label{eq:group_lasso} %\end{align} %Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ??? % %We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that %\begin{align} %\| z^++z^- \|_{\Omega,1} %& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2, %\end{align} %Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set %\begin{align*} %f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2, %\end{align*} %\begin{align} %g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}. %\end{align} %We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above. % % %\paragraph{Convergence rate.} %Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. 
%Note that
%\begin{align*}
%DA(x) & =
%\left[
%\begin{array}{ccc}
%I_p & I_p & -2\text{diag}(u)
%\end{array}
%\right]\in \RR^{d\times 3d},
%\end{align*}
%\begin{align*}
%-DA(x)^\top A(x) =
%\left[
%\begin{array}{c}
%-z^+-z^-+u^{\circ 2} \\
%-z^+-z^-+u^{\circ 2}\\
%2\text{diag}(u)( z^++z^--u^{\circ 2})
%\end{array}
%\right],
%\end{align*}
%\begin{align*}
%& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\
%& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\
%& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\
%& \ge \sqrt{2} \| A(x)\|,
%\end{align*}
%namely, \eqref{eq:regularity} holds with $\nu=1$.