%!TEX root = ../main.tex
\section{Numerical Evidence \label{sec:experiments}}
We begin with a caveat: it is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for nonconvex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, our analysis in Section~\ref{sec:cvg rate} uses the trust region method as the second-order solver, which is well-studied for nonconvex problems~\cite{cartis2012complexity}. Empirically, however, BFGS and lBFGS are extremely successful, and we therefore opt for these solvers in this section; the choice of subroutine does not affect Theorem~\ref{thm:main} as long as the subsolver performs well in practice.
\subsection{Clustering}
Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $.
Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as
\begin{align}
\underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \subjto Y\mathbf{1} = \mathbf{1},
~\text{tr}(Y) = s,~
Y\succeq 0,~Y \geq 0,
\label{eq:sdp_svx}
\end{align}
where $s$ is the number of clusters and $Y$ is required to be both positive semidefinite and entrywise nonnegative.
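For concreteness, here is a minimal Python sketch of~\eqref{eq:sdp_svx} using the \texttt{cvxpy} modeling package; the toy data, solver choice, and variable names are illustrative assumptions on our part and not part of the experimental setup described below.
\begin{verbatim}
# Minimal sketch of the clustering SDP on two well-separated toy clusters.
import numpy as np
import cvxpy as cp

n, s = 20, 2                           # number of points and clusters (toy sizes)
rng = np.random.default_rng(0)
z = np.vstack([rng.normal(0.0, 0.1, (10, 2)),
               rng.normal(3.0, 0.1, (10, 2))])
D = np.linalg.norm(z[:, None] - z[None, :], axis=-1) ** 2  # squared distances

Y = cp.Variable((n, n), PSD=True)             # Y symmetric positive semidefinite
constraints = [Y @ np.ones(n) == np.ones(n),  # Y 1 = 1
               cp.trace(Y) == s,              # tr(Y) = s
               Y >= 0]                        # entrywise nonnegativity
cp.Problem(cp.Minimize(cp.trace(D @ Y)), constraints).solve()
\end{verbatim}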
Standard SDP solvers do not scale well with the number of data points~$n$, since they often require a projection onto the semidefinite cone, with $\mathcal{O}(n^3)$ complexity. We instead use the Burer--Monteiro (BM) factorization to solve \eqref{eq:sdp_svx}, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
\begin{align}
\label{eq:nc_cluster}
\underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \subjto
VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le s,
~~V \geq 0,
\end{align}
where $\mathbf{1}\in \RR^n$ is the vector of all ones.
Note that the constraint $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier-to-enforce constraint $V \geq 0$ in \eqref{eq:nc_cluster}; see~\cite{kulis2007fast} for the reasoning behind this relaxation.
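As a quick illustration, the following Python sketch (function names are our own) evaluates the objective and the equality-constraint residual of~\eqref{eq:nc_cluster} without ever forming the $n\times n$ matrix $VV^{\top}$:
\begin{verbatim}
import numpy as np

def cluster_objective(V, D):
    # tr(D V V^T) = sum_{i,j} D_ij <v_i, v_j>, computed in O(n^2 r) time.
    return np.sum((D @ V) * V)

def equality_residual(V):
    # V V^T 1 - 1: violation of the row-sum constraint; this is also the
    # constraint map A defined below.
    ones = np.ones(V.shape[0])
    return V @ (V.T @ ones) - ones
\end{verbatim}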
Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely,
$
x := [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
$
and then set
\begin{align*}
f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle,
\qquad g = \delta_C, \qquad
A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top},
\end{align*}
where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{s}$. In Appendix~\ref{sec:verification2}, we verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above.
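Since $C$ is the intersection of the nonnegative orthant with a Euclidean ball centered at the origin, the projection onto $C$ (and hence the proximal map of $g=\delta_C$) has a closed form: clip, then rescale. A minimal sketch, with our own naming:
\begin{verbatim}
import numpy as np

def project_C(x, s):
    # Euclidean projection onto {x >= 0 : ||x||_2 <= sqrt(s)}. Because the
    # ball is centered at the origin and the orthant is a cone, projecting
    # onto the orthant first and then radially onto the ball is exact.
    x = np.maximum(x, 0.0)
    norm = np.linalg.norm(x)
    if norm > np.sqrt(s):
        x = x * (np.sqrt(s) / norm)
    return x
\end{verbatim}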
In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for nonconvex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of the BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages second-order information about the problem.
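For intuition, the following schematic Python sketch runs an inexact augmented Lagrangian loop with plain projected gradient standing in for APGM; the step size, dual update, and penalty schedule are illustrative assumptions and do not reproduce the exact updates of Algorithm~\ref{Algo:2}.
\begin{verbatim}
import numpy as np

def alm(x0, grad_f, A, jacT_A, project, beta=1.0, outer=20,
        inner=200, lr=1e-3):
    # Each outer step approximately minimizes the augmented Lagrangian
    # f(x) + <y, A(x)> + (beta/2)||A(x)||^2 over x in C, then updates the
    # dual variable y and the penalty beta. jacT_A(x, v) returns DA(x)^T v.
    x, y = x0.copy(), np.zeros_like(A(x0))
    for _ in range(outer):
        for _ in range(inner):
            g = grad_f(x) + jacT_A(x, y + beta * A(x))
            x = project(x - lr * g)    # prox step for g = delta_C
        y = y + beta * A(x)            # dual ascent
        beta *= 2.0                    # illustrative penalty schedule
    return x
\end{verbatim}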
We compare our approach against the following convex methods:
\begin{itemize}
\item HCGM: the Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional}, which directly solves~\eqref{eq:sdp_svx}.
\item SDPNAL+: a second-order augmented Lagrangian method for solving SDPs with nonnegativity constraints~\cite{yang2015sdpnal}.
\end{itemize}
As for the dataset, our experimental setup is similar to that described in~\cite{mixon2016clustering}. We use the publicly available fashion-MNIST data in \cite{xiao2017/online}, which was released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ grayscale image, associated with a label from ten classes, indexed from $0$ to $9$.
First, we extract meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. We then apply this network to 1000 test samples from the same dataset, which yields a vector of length $10$ for each data point, where each entry represents the posterior probability of the corresponding class. Finally, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors.
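For completeness, a short sketch of this last step, where \texttt{probs} denotes the $1000\times 10$ array of network outputs (the name and layout are our assumptions):
\begin{verbatim}
import numpy as np

def distance_matrix(probs):
    # D[i, j] = ||p_i - p_j||^2 via the identity
    # ||p_i - p_j||^2 = ||p_i||^2 + ||p_j||^2 - 2 <p_i, p_j>.
    sq = np.sum(probs ** 2, axis=1)
    return sq[:, None] + sq[None, :] - 2.0 * probs @ probs.T
\end{verbatim}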
The solution rank for the template~\eqref{eq:sdp_svx} is known and equals the number of clusters $s$ \cite[Theorem~1]{kulis2007fast}. As discussed in~\cite{tepper2018clustering}, setting the rank $r>s$ leads to more accurate reconstructions at the expense of speed.
We therefore set the rank to $r=20$.
The results are depicted in Figure~\ref{fig:clustering}.
We implemented three of the algorithms in MATLAB and used the software package provided for SDPNAL+, which contains MEX files.
The performance of our nonconvex approach would likely improve further if it were also implemented with MEX files.
\begin{figure}[ht]
\begin{center}
{\includegraphics[width=.4\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}}
{\includegraphics[width=.4\columnwidth]{figs/cluster_feas_time.pdf}}
\caption{Clustering: running time comparison.}
\label{fig:clustering}
\end{center}
\end{figure}
\subsection{Additional demonstrations}
We provide several additional experiments in Appendix~\ref{sec:adexp}.
Section~\ref{sec:bp} discusses a novel nonconvex relaxation of the standard basis pursuit template which performs comparably to state-of-the-art convex solvers.
In Section~\ref{sec:geig}, we provide fast numerical solutions to the generalized eigenvalue problem.
In Section~\ref{sec:gan}, we give a contemporary application example to which our template applies, namely, denoising with generative adversarial networks.
Finally, we provide improved bounds for sparse quadratic assignment problem instances in Section~\ref{sec:qap}.
