experiments.tex
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Mon, Aug 19, 14:17

experiments.tex
View Options

	%!TEX root = ../iALM_main.tex

	\section{Numerical evidence \label{sec:experiments}}

	We first begin with a caveat: It is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we have used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for non-convex problems~\cite{cartis2012complexity}.

	Empirically, however, BFGS and lBGFS are extremely successful and we have also opted for those solvers in this section since the subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver can perform in practice.

	\subsection{k-Means Clustering}

	Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{nxn}$ are $ D_{i, j} = \left\\| z_i - z_j\right\\|^2 $.
	Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide a SDP relaxation of the clustering problem, specified as
	\begin{align}
	\begin{cases}
	\underset{Y \in \RR^{nxn}}{\min} \text{tr}(DY) \\
	Y\mathbf{1} = \mathbf{1},
	~\text{tr}(Y) = k,~
	Y\succeq 0,~Y \geq 0,
	\end{cases}
	\label{eq:sdp_svx}
	\end{align}
	where $k$ is the number of clusters and $Y $ is both positive semidefinite and has nonnegative entries.
	Standard SDP solvers do not scale well with the number of data points~$n$, since they often require projection onto the semidefinite cone with the complexity of $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro splitting, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
	\begin{align}
	\label{eq:nc_cluster}
	\begin{cases}
	\underset{V \in \RR^{nxr}}{\min} \text{tr}(DVV^{\top}) \\
	VV^{\top}\mathbf{1} = \mathbf{1},~~ \\|V\\|_F^2 \le k,
	~~V \geq 0,
	\end{cases}
	\end{align}
	where $\mathbf{1}\in \RR^n$ is the vector of all ones.
	Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier to enforce constraint $V \geq 0$ constraint above, see~\cite{kulis2007fast} for the reasoning behind this relaxation.
	%. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$.
	%See~\citep{kulis2007fast} for the details of the relaxation.
	Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely,
	\begin{align*}
	x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
	\end{align*}
	and then set
	\begin{align*}
	f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle,
	\qquad g = \delta_C,
	\end{align*}
	\begin{align}
	A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top},
	\end{align}
	where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{k}$. In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above.


	In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem.
	We compare our approach against the following convex methods:
	\begin{itemize}
	\item HCGM: Homotopy-based Conditional Gradient Method in\cite{yurtsever2018conditional} which directly solves~\eqref{eq:sdp_svx}.
	\item SDPNAL+: A second-order augmented Lagrangian method for solving SDP's with nonnegativity constraints~\cite{yang2015sdpnal}.
	\end{itemize}

	As for the dataset, our experimental setup is similar to that described by~\cite{mixon2016clustering}. We use the publicly-available fashion-MNIST data in \cite{xiao2017/online}, which is released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ gray-scale image, associated with a label from ten classes, labeled from $0$ to $9$.
	First, we extract the meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this neural network to 1000 test samples from the same dataset, which gives us a vector of length $10$ for each data point, where each entry represents the posterior probability for each class. Then, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}.
	We implemented 3 algorithms on MATLAB and used the software package for SDPNAL+ which contains mex files. Convergence of the nonconvex approach will be much faster once mex implementation is used.

	\begin{figure}[]
	% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
	\begin{center}
	{\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}}
	{\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_iter_linearbeta_last.pdf}}
	%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
	\caption{Convergence of different algorithms for k-Means clustering with fashion MNIST dataset.
	%Here, we set the rank to be equal to 20 for the non-convex approaches.
	The solution rank for the template~\eqref{eq:sdp_svx} is known and it is equal to number of clusters $k$ (Theorem 1. \cite{kulis2007fast}). As discussed in~\cite{tepper2018clustering}, setting rank $r>k$ leads more accurate reconstruction in expense of speed. Therefore, we set the rank to 20.}
	\label{fig:clustering}
	%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])}
	\end{center}
	\end{figure}


	\subsection{Basis Pursuit}
	Basis Pursuit (BP) finds sparsest solutions of an under-determined system of linear equations, namely,
	\begin{align}
	\begin{cases}
	\min_{z} \\|z\\|_1 \\
	Bz = b,
	\end{cases}
	\label{eq:bp_main}
	\end{align}
	where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A huge number of primal-dual convex optimization algorithms are proposed to solve BP, including, but not limited to~\cite{tran2018adaptive,chambolle2011first}. There also exists many line of works \cite{beck2009fast} to handle sparse regression problem via regularization with $\ell_1$ norm.

	%\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.}



	Here, we take a different approach and cast~(\ref{eq:bp_main}) as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with
	\begin{align}
	f(x) =& \\|x\\|_2^2, \quad g(x) = 0\nonumber\\
	A(x) =& \overline{B}x^{\circ 2}- b.
	\label{eq:bp-equiv}
	\end{align}
	In Appendix~\ref{sec:verification1}, we verify with minimal detail that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$.




	%Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely,
	%\begin{align}
	%\mu = \max_{i,j} \| \langle B_i, B_j \rangle \|,
	%\end{align}
	%where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_\in \RR^d$ denote the vectors corresponding to $x_k,x_$. We rewrite the last line of \eqref{eq:cnd-bp-pre} as
	%\begin{align}
	%& 2 \\| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \\| \nonumber\\
	%& = 2 \\| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \\| \nonumber\\
	%& = 2 \\| \text{diag}(x_k) {B}^\top B(x_k- x_*) \\| \nonumber\\
	%& = 2 \\| \text{diag}(x_k) {B}^\top B(x_k- x_*) \\|_{\infty} \nonumber\\
	%& \ge 2 \\| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \\|_{\infty}\nonumber\\
	%& \qquad - 2 \\| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \\|_{\infty},
	%\end{align}
	%where $\text{hollow}$ returns the hollow part of th
	%
	% and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as
	%\begin{align}
	%& 2 \\| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \\| \nonumber\\
	%& = 2 \\| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \\| \nonumber\\
	%& \ge 2 \\| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \\|_{\infty} \nonumber\\
	%& = \max_i \|x_{k,i}\| \cdot \| U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\
	%&
	%\end{align}
	%
	%where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$.
	%\begin{align*}
	%-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)),
	%\end{align*}
	%where $\odot$ denotes hadamard product.
	%\begin{align*}
	%& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\
	%& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\
	%& = \left\\| -DA(x)^\top (A(x)-b) \right\\| \nonumber\\
	%& \ge ?????,
	%\end{align*}
	%Hence, this completes the proof for regularity condition.

	We draw the entries of $B$ independently from a zero-mean and unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ and its nonzero amplitudes are also drawn from the standard Gaussian distribution.
	%We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$.
	%We then assign values from normal distribution to those entries.
	Then the measurement vector is created as $b = Bz + \epsilon$, where $\epsilon$ is the noise vector with entries drawn independently from the zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.

	\begin{figure}[]
	\begin{center}
	{\includegraphics[width=1\columnwidth]{figs/bp_fig1_subsolvers.pdf}}
	\caption{Convergence with different subsolvers for the aforementioned non-convex relaxation.
	}
	\label{fig:bp1}
	%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])}
	\end{center}
	\end{figure}
	%\vspace{-3mm}

	Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is, indeed, interesting to see that these type of non-convex relaxations gives the solution of convex one and first order methods succeed.

	\paragraph{Discussion:}
	The true potential of our reformulation is in dealing with more structured norms rather than $\ell_1$, where computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
	\begin{align*}
	\\|z\\|_{\Omega} = \sum_{i=1}^I \\| z_{\Omega_i} \\|,
	\end{align*}
	where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.


	\subsection{Adversarial Denoising with GANs}
	\label{exp:ellinf}
	Projection onto the range of a neural network $G:\mathbb{R}^s \to \mathbb{R}^d$ has been considered
	by~\cite{Samangouei2018, Ilyas2017} as a defense mechanism against adversarial
	examples~\cite{Szegedy2013}. In their settings, samples are denoised by projecting on
	the range of a pre-trained \textit{generator}, from either the GAN \cite{Goodfellow2014}
	or VAE framework \cite{Kingma2013}, before being fed to a classifier. Even though the adversarial noise
	introduced is typically bounded in $\ell_\infty$ norm \cite{Madry2018}, the
	projection is performed in the $\ell_2$ metric. We instead propose to directly
	project using the $\ell_\infty$ norm that limits the attacker, i.e. we solve the program
	\begin{align}
	\begin{cases}
	\min_{x, z} \\|x - x^\natural \\|_\infty \\
	G(z) = x,
	\end{cases}
	\label{eq:darn}
	\end{align}
	with our proposed algorithm \eqref{Algo:2} (AL). On the optimization side, the $\ell_2$
	projection is usually performed by solving the non-convex program
	\begin{equation}
	\min_{z} \\|G(z) - x^\natural \\|_2^2 \\
	\label{eq:darn2}
	\end{equation}
	via off-the-shelf optimizers like gradient descent (GD) or ADAM
	\cite{Kingma2014}. Indeed, we assign equal computational budget to ADAM and GD
	for solving \eqref{eq:darn2}, and to our algorithm (AL) for solving
	\eqref{eq:darn}. We evaluate on a test set of 100 adversarial examples from the
	MNIST dataset, obtained with the Projected Gradient Algorithm of
	\cite{Madry2018} with 30 iterations, stepsize 0.01 and attack size 0.2. For the
	classifier, we use a standard convolutional network trained on clean MNIST
	samples.

	\textbf{Results and discussion. }
	We keep track of the denoised images along the trajectory of the algorithms,
	sampling 1500 equally spaced points in time, and we plot the test error
	achieved in Figure~\ref{fig:attack_error_time}. We observe that the
	$\ell_\infty$ denoising is a superior defense mechanism against $\ell_\infty$
	bounded attacks, and achieves a lower error and does so faster.

	\begin{figure}[h]
	\centering
	\includegraphics[width=.4\columnwidth]{figs/attack_error_time_clean.pdf}
	%%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
	\caption{Error rate on denoised adversarial examples vs time}
	\label{fig:attack_error_time}
	\end{figure}


	%This relaxation transformed a non-smooth objective into a smooth one while loosing the linearity on the equality constraint.


	%\subsection{Latent Group Lasso \label{sec:latent lasso}}
	%
	%For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to
	%\begin{align}
	%\\|z\\|_{\Omega,1} = \sum_{i=1}^I \\| z_{\Omega_i} \\|.
	%\end{align}
	%Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as
	%\begin{align}
	%\min_{z\in \RR^d} \frac{1}{2}\\| Bz - b\\|_2^2+ \lambda \\| z\\|_{\Omega,1}.
	%\label{eq:group_lasso}
	%\end{align}
	%Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\\|\cdot\\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ???
	%
	%We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that
	%\begin{align}
	%\\| z^++z^- \\|_{\Omega,1}
	%& = \\| u^{\circ 2} \\|_{\Omega,1 } =: \\|u\\|_{\Omega,2}^2,
	%\end{align}
	%Unlike $\\|\cdot\\|_{\Omega,1}$, note that $\\|\cdot\\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\\|u\\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set
	%\begin{align*}
	%f(x) = \frac{1}{2}\\| Bz^+-Bz^- -b\\|_1+ \\|u\\|_{\Omega,2}^2,
	%\end{align*}
	%\begin{align}
	%g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}.
	%\end{align}
	%We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above.
	%
	%
	%\paragraph{Convergence rate.}
	%Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that
	%\begin{align*}
	%DA(x) & =
	%\left[
	%\begin{array}{ccc}
	%I_p & I_p & -2\text{diag}(u)
	%\end{array}
	%\right]\in \RR^{d\times 3d},
	%\end{align*}
	%\begin{align*}
	%-DA(x)^\top A(x) =
	%\left[
	%\begin{array}{c}
	%-z^+-z^-+u^{\circ 2} \\
	%-z^+-z^-+u^{\circ 2}\\
	%2\text{diag}(u)( z^++z^--u^{\circ 2})
	%\end{array}
	%\right],
	%\end{align*}
	%\begin{align*}
	%& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\
	%& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\
	%& = \left\\| -DA(x)^\top A(x) \right\\| \nonumber\\
	%& \ge \sqrt{2} \\| A(x)\\|,
	%\end{align*}
	%namely, \eqref{eq:regularity} holds with $\nu=1$.

experiments.texNo OneTemporaryActions

File Metadata

experiments.texView Options

Event Timeline

experiments.tex
No OneTemporary
Actions

experiments.tex
View Options