
%!TEX root = ../main.tex
\begin{figure*}[ht]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
\begin{center}
{\includegraphics[width=.7\columnwidth]{figs/clustering_fig4_times_linearbeta_last.pdf}}
{\includegraphics[width=.7\columnwidth]{figs/clustering_fig4_iter_linearbeta_last.pdf}}
%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
\caption{Convergence of different algorithms for k-Means clustering on the fashion-MNIST dataset. The solution rank for the template~\eqref{eq:sdp_svx} is known and is equal to the number of clusters $k$ (Theorem~1 in \cite{kulis2007fast}). As discussed in~\cite{tepper2018clustering}, setting the rank $r>k$ leads to more accurate reconstruction at the expense of speed; we therefore set the rank to 20 for the non-convex approaches.}
\label{fig:clustering}
%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])}
\end{center}
\end{figure*}
\section{Numerical evidence \label{sec:experiments}}
We begin with a caveat: it is known that quasi-Newton methods, such as BFGS and lBFGS, might not converge for non-convex problems~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we have used the trust region method as the second-order solver in our analysis in Section~\ref{sec:cvg rate}, which is well-studied for non-convex problems~\cite{cartis2012complexity}.
Empirically, however, BFGS and lBFGS are extremely successful, and we opt for these solvers in this section since the choice of subroutine does not affect Theorem~\ref{thm:main}.
\subsection{k-Means Clustering}
Given data points $\{z_i\}_{i=1}^n$, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $.
Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as
\begin{align}
\begin{cases}
\underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \\
Y\mathbf{1} = \mathbf{1},
~\text{tr}(Y) = k,~
Y\succeq 0,~Y \geq 0,
\end{cases}
\label{eq:sdp_svx}
\end{align}
where $k$ is the number of clusters and $Y$ is required to be both positive semidefinite and entrywise nonnegative.
Standard SDP solvers do not scale well with the number of data points~$n$, since they often require a projection onto the semidefinite cone, whose complexity is $\mathcal{O}(n^3)$. We instead use the Burer-Monteiro splitting, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
\begin{align}
\label{eq:nc_cluster}
\begin{cases}
\underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \\
VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le k,
~~V \geq 0,
\end{cases}
\end{align}
where $\mathbf{1}\in \RR^n$ is the vector of all ones.
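To see how the constraints of~\eqref{eq:sdp_svx} translate under the factorization $Y = VV^{\top}$, note that the semidefinite constraint is satisfied automatically and the trace constraint becomes a Frobenius-norm constraint,
\begin{align*}
VV^{\top} \succeq 0, \qquad \text{tr}(Y) = \text{tr}(VV^{\top}) = \|V\|_F^2,
\end{align*}
where the latter is relaxed to the inequality $\|V\|_F^2 \le k$ in~\eqref{eq:nc_cluster}.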
Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger, but easier to enforce, constraint $V \geq 0$; see~\citep{kulis2007fast} for the reasoning behind this relaxation.
%. Trace constraint translates to a Frobenius norm constraint in factorized space. Semidefinite constraint is naturally removed due to factorization $Y=VV^{\top}$.
%See~\citep{kulis2007fast} for the details of the relaxation.
Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely,
\begin{align*}
x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
\end{align*}
and then set
\begin{align*}
f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle,
\qquad g = \delta_C,
\end{align*}
\begin{align}
A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top},
\end{align}
where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{k}$. In Appendix~\ref{sec:verification2}, we somewhat informally verify that Theorem~\ref{thm:main} applies to~\eqref{prob:01} with $f,g,A$ specified above.
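For concreteness, the following Python snippet is a minimal sketch, with our own variable names, of how $f$, $A$, and the projection onto $C$ can be evaluated for a factorized iterate $x$; it is not the implementation used for the reported experiments.
\begin{verbatim}
import numpy as np

def f(x, D, n, r):
    # tr(D V V^T) = sum_{i,j} D_ij <x_i, x_j>, with x_i the i-th row of V
    V = x.reshape(n, r)
    return float(np.trace(D @ V @ V.T))

def A(x, n, r):
    # i-th entry: x_i^T (sum_j x_j) - 1, the residual of V V^T 1 = 1
    V = x.reshape(n, r)
    return V @ V.sum(axis=0) - 1.0

def proj_C(x, k):
    # project onto the positive orthant intersected with the
    # Euclidean ball of radius sqrt(k)
    x = np.maximum(x, 0.0)
    nrm = np.linalg.norm(x)
    return x if nrm <= np.sqrt(k) else x * (np.sqrt(k) / nrm)
\end{verbatim}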
In our simulations, we use two different solvers for Step~2 of Algorithm~\ref{Algo:2}, namely, APGM and lBFGS. APGM is a solver for non-convex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. lBFGS is a limited-memory version of the BFGS algorithm in~\cite{fletcher2013practical} that approximately leverages the second-order information of the problem.
We compare our approach against the following convex methods:
\begin{itemize}
\item HCGM: Homotopy-based Conditional Gradient Method in~\cite{yurtsever2018conditional}, which directly solves~\eqref{eq:sdp_svx}.
\item SDPNAL+: A second-order augmented Lagrangian method for solving SDPs with nonnegativity constraints~\cite{yang2015sdpnal}.
\end{itemize}
As for the dataset, our experimental setup is similar to that described in~\cite{mixon2016clustering}. We use the publicly available fashion-MNIST data in \cite{xiao2017/online}, which was released as a possible replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ grayscale image, associated with a label from one of ten classes, labeled from $0$ to $9$.
First, we extract meaningful features from this dataset using a simple two-layer neural network with a sigmoid activation function. Then, we apply this network to 1000 test samples from the same dataset, which gives a vector of length $10$ for each data point, whose entries represent the posterior probability of each class. Finally, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors. The results are depicted in Figure~\ref{fig:clustering}.
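As an aside, forming $D$ from the probability vectors takes only a few lines; the sketch below assumes the $n$ vectors are stacked as the rows of a matrix \texttt{P} (our own notation, not necessarily the exact preprocessing code we used).
\begin{verbatim}
import numpy as np

def distance_matrix(P):
    # D_ij = ||p_i - p_j||^2 for the rows p_i of P, computed without loops
    sq = np.sum(P ** 2, axis=1)
    D = sq[:, None] + sq[None, :] - 2.0 * P @ P.T
    return np.maximum(D, 0.0)  # clamp tiny negative round-off errors
\end{verbatim}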
\subsection{Basis Pursuit}
Basis Pursuit (BP) finds the sparsest solution of an under-determined system of linear equations, namely,
\begin{align}
\begin{cases}
\min_{z} \|z\|_1 \\
Bz = b,
\end{cases}
\label{eq:bp_main}
\end{align}
where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$. BP has found many applications in machine learning, statistics, and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. A large number of primal-dual convex optimization algorithms have been proposed to solve BP, including, but not limited to,~\cite{tran2018adaptive,chambolle2011first}. There is also a long line of work, e.g.,~\cite{beck2009fast}, that handles the sparse regression problem via $\ell_1$-norm regularization.
%\textbf{AE: Fatih, maybe mention a few other algorithms including asgard and also say why we can't use fista and nista.}
Here, we take a different approach and cast~\eqref{eq:bp_main} as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^- = u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes the element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with
\begin{align}
f(x) =& \|x\|_2^2, \quad g(x) = 0\nonumber\\
A(x) =& \overline{B}x^{\circ 2}- b.
\label{eq:bp-equiv}
\end{align}
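To see this equivalence, note that every feasible $x$ of~\eqref{prob:01} with the above $f,g,A$ yields a feasible point $z = u_1^{\circ 2} - u_2^{\circ 2}$ of~\eqref{eq:bp_main}, since $\overline{B}x^{\circ 2} = Bu_1^{\circ 2} - Bu_2^{\circ 2} = Bz$, and its objective value dominates the BP objective,
\begin{align*}
\|z\|_1 = \|u_1^{\circ 2} - u_2^{\circ 2}\|_1 \le \mathbf{1}^\top (u_1^{\circ 2} + u_2^{\circ 2}) = \|x\|_2^2,
\end{align*}
with equality when $u_1^{\circ 2} = z^+$ and $u_2^{\circ 2} = z^-$. Conversely, any feasible $z$ of~\eqref{eq:bp_main} corresponds to such an $x$ with $\|x\|_2^2 = \|z\|_1$, so the two programs share the same optimal value.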
In Appendix~\ref{sec:verification1}, we verify with minimal detail that Theorem~\ref{thm:main} indeed applies to~\eqref{prob:01} with the above $f,A$.
%Let $\mu(B)$ denote the \emph{coherence} of ${B}$, namely,
%\begin{align}
%\mu = \max_{i,j} | \langle B_i, B_j \rangle |,
%\end{align}
%where $B_i\in \RR^n$ is the $i$th column of $B$. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. We rewrite the last line of \eqref{eq:cnd-bp-pre} as
%\begin{align}
%& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x^{\circ 2} -x_*^{\circ 2}) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) {B}^\top B(x_k- x_*) \|_{\infty} \nonumber\\
%& \ge 2 \| \text{diag}(x_k) \text{diag}({B}^\top B) (x_k- x_*) \|_{\infty}\nonumber\\
%& \qquad - 2 \| \text{diag}(x_k) \text{hollow}({B}^\top B) (x_k- x_*) \|_{\infty},
%\end{align}
%where $\text{hollow}$ returns the hollow part of th
%
% and let $\overline{B}=U\Sigma V^\top$ be its thin singular value decomposition, where $U\in \RR^{n\times n}, V\in \RR^{d\times n}$ have orthonormal columns and the diagonal matrix $\Sigma\in \RR^{n\times n}$ contains the singular values $\{\sigma_i\}_{i=1}^r$. Let also $x_{k,i},U_i$ be the $i$th entry of $x_k$ and the $i$th row of $U_i$, respectively. Then we bound the last line above as
%\begin{align}
%& 2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x^{\circ 2} -b) \| \nonumber\\
%& = 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \| \nonumber\\
%& \ge 2 \| \text{diag}(x_k) U \Sigma^2 U^\top ( x^{\circ 2} - x_*^{\circ 2}) \|_{\infty} \nonumber\\
%& = \max_i |x_{k,i}| \cdot | U_i ^\top \Sigma \cdot \Sigma U^\top ( x^{\circ 2} - x_*^{\circ 2}) | \nonumber\\
%&
%\end{align}
%
%where $\tilde{b}_{i,j} = (\tilde{B})_{ij},~ i\in [1:n]$ and $j \in [1:2d]$.
%\begin{align*}
%-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)),
%\end{align*}
%where $\odot$ denotes hadamard product.
%\begin{align*}
%& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\
%& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\
%& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\
%& \ge ?????,
%\end{align*}
%Hence, this completes the proof for regularity condition.
We draw the entries of $B$ independently from a zero-mean, unit-variance Gaussian distribution. For a fixed sparsity level $k$, we choose $k$ indices as the support of $z_*\in \RR^d$ and draw its nonzero amplitudes from the standard Gaussian distribution.
%We then pick a sparsity level $k$ and choose $k$ indexes, i.e, $\Omega \subset [1:d]$, which are corresponding to nonzero entries of $z$.
%We then assign values from normal distribution to those entries.
The measurement vector is then created as $b = Bz_* + \epsilon$, where $\epsilon$ is a noise vector whose entries are drawn independently from a zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.
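The snippet below is a minimal Python sketch of this data-generation procedure; the sizes $n$, $d$, and $k$ are illustrative placeholders rather than the values used in our experiments.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d, k = 100, 1000, 10                 # illustrative sizes only
B = rng.standard_normal((n, d))         # i.i.d. zero-mean, unit-variance entries
z_star = np.zeros(d)
support = rng.choice(d, size=k, replace=False)   # k indices for the support
z_star[support] = rng.standard_normal(k)         # Gaussian amplitudes
b = B @ z_star + np.sqrt(1e-6) * rng.standard_normal(n)  # noisy measurements
\end{verbatim}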
\begin{figure}[ht]
\begin{center}
{\includegraphics[width=1\columnwidth]{figs/bp_fig1_subsolvers.pdf}}
\caption{Here, we show convergence with respect to the number of iterations because our complexity results are stated in those terms. One \textsc{lBFGS} iteration corresponds to one \textsc{lBFGS} update, whereas one iteration of the \textsc{TrustRegion} method corresponds to one Hessian evaluation. All the other methods are first-order, and one iteration corresponds to one algorithmic step. We show the effect of using different subsolvers for the aforementioned non-convex relaxation.
}
\label{fig:bp1}
%(left:$[n=100$, $d=10^4]$, right:$[n=34$, $d=10^2$])}
\end{center}
\end{figure}
\vspace{-3mm}
Figure~\ref{fig:bp1} compiles our results for the proposed relaxation. It is indeed interesting to see that this type of nonconvex relaxation can work well in practice, even with first-order solvers.
%This relaxation transformed a non-smooth objective into a smooth one while loosing the linearity on the equality constraint.
The true potential of our reformulation lies in dealing with norms more structured than $\ell_1$, for which computing the proximal operator is often intractable. One such case is the latent group lasso norm~\cite{obozinski2011group}, defined as
\begin{align*}
\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
\end{align*}
where $\{\Omega_i\}_{i=1}^I$ are (not necessarily disjoint) index sets of $\{1,\cdots,d\}$; a small example of the coupling induced by overlapping groups is given below. Although not studied here, we believe that the non-convex framework presented in this paper can serve to solve more complicated problems, such as the latent group lasso. We leave this research direction for future work.
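For instance, take $d=3$ with the overlapping groups $\Omega_1=\{1,2\}$ and $\Omega_2=\{2,3\}$; then
\begin{align*}
\|z\|_{\Omega} = \sqrt{z_1^2+z_2^2} + \sqrt{z_2^2+z_3^2},
\end{align*}
and the shared coordinate $z_2$ couples the two groups, so the proximal operator of $\|\cdot\|_{\Omega}$ no longer splits into independent group-wise problems.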
%\subsection{Latent Group Lasso \label{sec:latent lasso}}
%
%For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to
%\begin{align}
%\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|.
%\end{align}
%Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as
%\begin{align}
%\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}.
%\label{eq:group_lasso}
%\end{align}
%Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ???
%
%We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that
%\begin{align}
%\| z^++z^- \|_{\Omega,1}
%& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2,
%\end{align}
%Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set
%\begin{align*}
%f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2,
%\end{align*}
%\begin{align}
%g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}.
%\end{align}
%We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above.
%
%
%\paragraph{Convergence rate.}
%Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that
%\begin{align*}
%DA(x) & =
%\left[
%\begin{array}{ccc}
%I_p & I_p & -2\text{diag}(u)
%\end{array}
%\right]\in \RR^{d\times 3d},
%\end{align*}
%\begin{align*}
%-DA(x)^\top A(x) =
%\left[
%\begin{array}{c}
%-z^+-z^-+u^{\circ 2} \\
%-z^+-z^-+u^{\circ 2}\\
%2\text{diag}(u)( z^++z^--u^{\circ 2})
%\end{array}
%\right],
%\end{align*}
%\begin{align*}
%& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\
%& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\
%& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\
%& \ge \sqrt{2} \| A(x)\|,
%\end{align*}
%namely, \eqref{eq:regularity} holds with $\nu=1$.
