
%!TEX root = ../main.tex
\begin{figure*}[ht]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
\begin{center}
{\includegraphics[width=.7\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}}
{\includegraphics[width=.7\columnwidth]{figs/clustering_fig4_iter_linearbeta_last.pdf}}
%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
\caption{Convergence of different algorithms for k-Means clustering on the fashion-MNIST dataset. The solution rank for the template~\eqref{eq:sdp_svx} is known and is equal to the number of clusters $k$ (Theorem~1 in \cite{kulis2007fast}). As discussed in~\cite{tepper2018clustering}, setting the rank $r>k$ leads to more accurate reconstruction at the expense of speed; we therefore set the rank to 20 for the non-convex approaches.}
\label{fig:clustering}
%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])}
\end{center}
\end{figure*}
\section{Numerical evidence \label{sec:experiments}}
We begin with a caveat: it is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we have used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for non-convex problems~\cite{cartis2012complexity}.
Empirically, however, BFGS and lBFGS are extremely successful, and we opt for these solvers in this section since the choice of subroutine does not affect Theorem~\ref{thm:main}.
\subsection{k-Means Clustering}
Given data points $\{z_i\}_{i=1}^n$, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $.
Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as
\begin{align}
\begin{cases}
\underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \\
Y\mathbf{1} = \mathbf{1},
~\text{tr}(Y) = k,~
Y\succeq 0,~Y \geq 0,
\end{cases}
\label{eq:sdp_svx}
\end{align}
where $k$ is the number of clusters and $Y$ is required to be both positive semidefinite and entrywise nonnegative.
Standard SDP solvers do not scale well with the number of data points~$n$, since they often require a projection onto the semidefinite cone, whose complexity is $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro splitting, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
\begin{align}
\label{eq:nc_cluster}
\begin{cases}
\underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \\
VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le k,
~~V \geq 0,
\end{cases}
\end{align}
where $\mathbf{1}\in \RR^n$ is the vector of all ones.
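To see how the constraints of~\eqref{eq:sdp_svx} translate under the factorization $Y = VV^{\top}$, note that the semidefinite constraint is satisfied automatically and the trace constraint becomes a Frobenius-norm constraint,
\begin{align*}
VV^{\top} \succeq 0, \qquad \text{tr}(Y) = \text{tr}(VV^{\top}) = \|V\|_F^2,
\end{align*}
where the latter is relaxed to the inequality $\|V\|_F^2 \le k$ in~\eqref{eq:nc_cluster}.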
Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger, but easier to enforce, constraint $V \geq 0$; see~\citep{kulis2007fast} for the reasoning behind this relaxation.
%. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$.
%See~\citep{kulis2007fast} for the details of the relaxation.
Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely,
\begin{align*}
x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
\end{align*}
and then set
\begin{align*}
f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle,
\qquad g = \delta_C,
\end{align*}
\begin{align}
A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top},
\end{align}
where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{k}$. In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above.
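For concreteness, the following Python snippet is a minimal sketch, with our own variable names, of how $f$, $A$, and the projection onto $C$ can be evaluated for a factorized iterate $x$; it is not the implementation used for the reported experiments.
\begin{verbatim}
import numpy as np

def f(x, D, n, r):
    # tr(D V V^T) = sum_{i,j} D_ij <x_i, x_j>, with x_i the i-th row of V
    V = x.reshape(n, r)
    return float(np.trace(D @ V @ V.T))

def A(x, n, r):
    # i-th entry: x_i^T (sum_j x_j) - 1, the residual of V V^T 1 = 1
    V = x.reshape(n, r)
    return V @ V.sum(axis=0) - 1.0

def proj_C(x, k):
    # project onto the positive orthant intersected with the
    # Euclidean ball of radius sqrt(k)
    x = np.maximum(x, 0.0)
    nrm = np.linalg.norm(x)
    return x if nrm <= np.sqrt(k) else x * (np.sqrt(k) / nrm)
\end{verbatim}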
In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of the BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem.
We compare our approach against the following convex methods:
\begin{itemize}
\item HCGM: Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional}, which directly solves~\eqref{eq:sdp_svx}.
\item SDPNAL+: A second-order augmented Lagrangian method for solving SDPs with nonnegativity constraints~\cite{yang2015sdpnal}.
\end{itemize}
As for the dataset, our experimental setup is similar to that described in~\cite{mixon2016clustering}. We use the publicly available fashion-MNIST data in \cite{xiao2017/online}, which was released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ grayscale image, associated with a label from one of ten classes, labeled from $0$ to $9$.
First, we extract meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this network to 1000 test samples from the same dataset, which gives a vector of length $10$ for each data point, whose entries represent the posterior probability of each class. Finally, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}.
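As an aside, forming $D$ from the probability vectors takes only a few lines; the sketch below assumes the $n$ vectors are stacked as the rows of a matrix \texttt{P} (our own notation, not necessarily the exact preprocessing code we used).
\begin{verbatim}
import numpy as np

def distance_matrix(P):
    # D_ij = ||p_i - p_j||^2 for the rows p_i of P, computed without loops
    sq = np.sum(P ** 2, axis=1)
    D = sq[:, None] + sq[None, :] - 2.0 * P @ P.T
    return np.maximum(D, 0.0)  # clamp tiny negative round-off errors
\end{verbatim}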
\subsection{Basis Pursuit}
Basis Pursuit (BP) finds the sparsest solution of an under-determined system of linear equations, namely,
\begin{align}
\begin{cases}
\min_{z} \|z\|_1 \\
Bz = b,
\end{cases}
\label{eq:bp_main}
\end{align}
where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics, and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A large number of primal-dual convex optimization algorithms have been proposed to solve BP, including, but not limited to,~\cite{tran2018adaptive,chambolle2011first}. There is also a long line of work, e.g.,~\cite{beck2009fast}, that handles the sparse regression problem via $\ell_1$-norm regularization.
%\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.}
Here, we take a different approach and cast~\eqref{eq:bp_main} as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^- = u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes the element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with
\begin{align}
f(x) =& \|x\|_2^2, \quad g(x) = 0\nonumber\\
A(x) =& \overline{B}x^{\circ 2}- b.
\label{eq:bp-equiv}
\end{align}
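To see this equivalence, note that every feasible $x$ of~\eqref{prob:01} with the above $f,g,A$ yields a feasible point $z = u_1^{\circ 2} - u_2^{\circ 2}$ of~\eqref{eq:bp_main}, since $\overline{B}x^{\circ 2} = Bu_1^{\circ 2} - Bu_2^{\circ 2} = Bz$, and its objective value dominates the BP objective,
\begin{align*}
\|z\|_1 = \|u_1^{\circ 2} - u_2^{\circ 2}\|_1 \le \mathbf{1}^\top (u_1^{\circ 2} + u_2^{\circ 2}) = \|x\|_2^2,
\end{align*}
with equality when $u_1^{\circ 2} = z^+$ and $u_2^{\circ 2} = z^-$. Conversely, any feasible $z$ of~\eqref{eq:bp_main} corresponds to such an $x$ with $\|x\|_2^2 = \|z\|_1$, so the two programs share the same optimal value.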
In Appendix~\ref{sec:verification1}, we verify with minimal detail that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$.
%Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely,
%\begin{align}
%\mu = \max_{i,j} | \langle B_i, B_j \rangle |,
%\end{align}
%where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. We rewrite the last line of \eqref{eq:cnd-bp-pre} as
%\begin{align}
%& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \|_{\infty} \nonumber\\
%& \ge 2 \| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \|_{\infty}\nonumber\\
%& \qquad - 2 \| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \|_{\infty},
%\end{align}
%where $\text{hollow}$ returns the hollow part of th
%
% and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as
%\begin{align}
%& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\
%& \ge 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \|_{\infty} \nonumber\\
%& = \max_i |x_{k,i}| \cdot | U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) | \nonumber\\
%&
%\end{align}
%
%where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$.
%\begin{align*}
%-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)),
%\end{align*}
%where $\odot$ denotes hadamard product.
%\begin{align*}
%& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\
%& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\
%& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\
%& \ge ?????,
%\end{align*}
%Hence, this completes the proof for regularity condition.
We draw the entries of $B$ independently from a zero-mean, unit-variance Gaussian distribution. For a fixed sparsity level $k$, we choose $k$ indices as the support of $z_*\in \RR^d$ and draw its nonzero amplitudes from the standard Gaussian distribution.
%We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$.
%We then assign values from normal distribution to those entries.
The measurement vector is then created as $b = Bz_* + \epsilon$, where $\epsilon$ is a noise vector whose entries are drawn independently from a zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.
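The snippet below is a minimal Python sketch of this data-generation procedure; the sizes $n$, $d$, and $k$ are illustrative placeholders rather than the values used in our experiments.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d, k = 100, 1000, 10                 # illustrative sizes only
B = rng.standard_normal((n, d))         # i.i.d. zero-mean, unit-variance entries
z_star = np.zeros(d)
support = rng.choice(d, size=k, replace=False)   # k indices for the support
z_star[support] = rng.standard_normal(k)         # Gaussian amplitudes
b = B @ z_star + np.sqrt(1e-6) * rng.standard_normal(n)  # noisy measurements
\end{verbatim}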
\begin{figure}[ht]
\begin{center}
{\includegraphics[width=1\columnwidth]{figs/bp_fig1_subsolvers.pdf}}
\caption{Here, we show convergence with respect to the number of iterations because our complexity results are stated in those terms. One \textsc{lBFGS} iteration corresponds to one \textsc{lBFGS} update, whereas one iteration of the \textsc{TrustRegion} method corresponds to one Hessian evaluation. All the other methods are first-order, and one iteration corresponds to one algorithmic step. We show the effect of using different subsolvers for the aforementioned non-convex relaxation.
}
\label{fig:bp1}
%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])}
\end{center}
\end{figure}
\vspace{-3mm}
Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is indeed interesting to see that this type of nonconvex relaxation can work well in practice, even with first-order solvers.
%This relaxation transformed a non-smooth objective into a smooth one while loosing the linearity on the equality constraint.
The true potential of our reformulation lies in dealing with norms more structured than $\ell_1$, for which computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
\begin{align*}
\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
\end{align*}
where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$; a small example of the coupling induced by overlapping groups is given below. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.
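For instance, take $d=3$ with the overlapping groups $\Omega_1=\{1,2\}$ and $\Omega_2=\{2,3\}$; then
\begin{align*}
\|z\|_{\Omega} = \sqrt{z_1^2+z_2^2} + \sqrt{z_2^2+z_3^2},
\end{align*}
and the shared coordinate $z_2$ couples the two groups, so the proximal operator of $\|\cdot\|_{\Omega}$ no longer splits into independent group-wise problems.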
%\subsection{Latent Group Lasso \label{sec:latent lasso}}
%
%For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to
%\begin{align}
%\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|.
%\end{align}
%Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as
%\begin{align}
%\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}.
%\label{eq:group_lasso}
%\end{align}
%Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ???
%
%We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that
%\begin{align}
%\| z^++z^- \|_{\Omega,1}
%& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2,
%\end{align}
%Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set
%\begin{align*}
%f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2,
%\end{align*}
%\begin{align}
%g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}.
%\end{align}
%We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above.
%
%
%\paragraph{Convergence rate.}
%Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that
%\begin{align*}
%DA(x) & =
%\left[
%\begin{array}{ccc}
%I_p & I_p & -2\text{diag}(u)
%\end{array}
%\right]\in \RR^{d\times 3d},
%\end{align*}
%\begin{align*}
%-DA(x)^\top A(x) =
%\left[
%\begin{array}{c}
%-z^+-z^-+u^{\circ 2} \\
%-z^+-z^-+u^{\circ 2}\\
%2\text{diag}(u)( z^++z^--u^{\circ 2})
%\end{array}
%\right],
%\end{align*}
%\begin{align*}
%& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\
%& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\
%& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\
%& \ge \sqrt{2} \| A(x)\|,
%\end{align*}
%namely, \eqref{eq:regularity} holds with $\nu=1$.
