diff --git a/ICML19/Drafts/figs/bp_fig1_new.pdf b/ICML19/Drafts/figs/bp_fig1_new.pdf
new file mode 100644
index 0000000..4c5c6c7
Binary files /dev/null and b/ICML19/Drafts/figs/bp_fig1_new.pdf differ
diff --git a/ICML19/Drafts/figs/bp_fig2_small.pdf b/ICML19/Drafts/figs/bp_fig2_small.pdf
new file mode 100644
index 0000000..b983612
Binary files /dev/null and b/ICML19/Drafts/figs/bp_fig2_small.pdf differ
diff --git a/ICML19/Drafts/sections/experiments.tex b/ICML19/Drafts/sections/experiments.tex
index ba89bb1..2265a9a 100644
--- a/ICML19/Drafts/sections/experiments.tex
+++ b/ICML19/Drafts/sections/experiments.tex
@@ -1,229 +1,230 @@
%!TEX root = ../main.tex
\section{Experiments \label{sec:experiments}}
It has been shown in the literature that quasi-Newton methods such as BFGS and L-BFGS need not converge for nonconvex objectives~\cite{dai2002convergence, mascarenhas2004bfgs}. For this reason, we used the trust region method in our theoretical analysis in Section~\ref{sec:cvg rate}, since it is well understood for nonconvex objectives. Nevertheless, BFGS methods have proven very successful in practice, which is why we use BFGS as the second-order subroutine in this section. Note that the choice of subroutine does not affect our results in Theorem~\ref{thm:main}.

\subsection{Basis Pursuit}
Given an undersampled set of linear equations, basis pursuit corresponds to the following sparse regression problem:
\begin{align}
\min_{z} &\quad \|z\|_1 \nonumber\\
\text{s.t.}& \quad Bz = b,
\label{eq:bp_main}
\end{align}
where $B \in \RR^{n \times d}$, $b \in \RR^{n}$ and $n < d$. It has found many applications in machine learning, statistics and signal processing \cite{chen2001atomic, candes2008introduction, arora2018compressed}. Program~\eqref{eq:bp_main} seeks the sparsest solution satisfying the linear equations $Bz = b$. \cite{tran2018adaptive} and \cite{chambolle2011first} propose primal-dual nonsmooth convex methods to solve this program.

\begin{figure*}[ht]
% \includegraphics[width=0.4\textwidth,natwidth=1300,natheight=1300]{bp_fig1.pdf}
-%\begin{center}
-\centerline{\includegraphics[width=1.5\columnwidth]{figs/bp_fig1.pdf}}
-\label{fig:bp1}
-\caption{Comparison of different algorithms for solving basis pursuit template. ($n=250$, $d=10^4$)}
-%\end{center}
+\begin{center}
+\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig1_new.pdf}\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
+%\centerline{\includegraphics[width=1\columnwidth]{figs/bp_fig2_small.pdf}}
+\caption{Comparison of different algorithms for solving the basis pursuit template (left: $n=100$, $d=10^4$; right: $n=34$, $d=10^2$).}
+\label{fig:bp1}
+\end{center}
\end{figure*}

Here, we take a different approach and derive an equivalent formulation of the above program. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+$ and $z^-$ have only nonnegative entries. More precisely,
\begin{align*}
z^+ = \max\{z, 0\} =: u_1^{\circ 2},\quad
z^- =
\begin{cases}
|z_i|,& \text{if}~~z_i < 0\\
0, & \text{otherwise}
\end{cases}
=: u_2^{\circ 2},
\end{align*}
%are going to solve the given problem with a nonconvex relaxation.
where $u_1, u_2 \in \RR^d$ and $u^{\circ 2}$ denotes the entrywise square of a vector $u$. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\tilde{B} := [B, -B] \in \RR^{n \times 2d}$. We finally arrive at the following relaxation, which fits into the template \eqref{prob:01} with
\begin{align*}
f(x) = \|x\|_2^2, \qquad g(x) = 0, \qquad A(x) = \tilde{B}x^{\circ 2},
\end{align*}
and the constraint $A(x) = b$. Note that $A$ is nonlinear due to the entrywise square operation.
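As a quick sanity check on this reformulation (not part of the proposed algorithm), the following Python sketch builds $\tilde{B}$ and verifies numerically that $\|x\|_2^2 = \|z\|_1$ and $\tilde{B}x^{\circ 2} = Bz$ under the splitting above; the dimensions and variable names are illustrative only.
\begin{verbatim}
import numpy as np

# Illustrative check of the basis pursuit relaxation (toy dimensions).
rng = np.random.default_rng(0)
n, d = 5, 20
B = rng.standard_normal((n, d))
z = rng.standard_normal(d)

u1 = np.sqrt(np.maximum(z, 0.0))    # u1^{o2} = z^+
u2 = np.sqrt(np.maximum(-z, 0.0))   # u2^{o2} = z^-
x = np.concatenate([u1, u2])        # x = [u1; u2] in R^{2d}
B_tilde = np.hstack([B, -B])        # [B, -B] in R^{n x 2d}

assert np.isclose(np.sum(x**2), np.abs(z).sum())  # f(x) = ||z||_1
assert np.allclose(B_tilde @ (x**2), B @ z)       # A(x) = Bz
\end{verbatim}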
\paragraph{Convergence rate:} \textbf{TODO(mfs): maybe add some discussion here about the smoothness of $A$, $f$, etc.}
%We created the
Although the nonconvex relaxation does not bring any computational advantage for basis pursuit itself, it becomes useful for more structured norms, as discussed at the end of this subsection. We next verify the regularity condition in \eqref{eq:regularity}. The Jacobian of $A$ is
\begin{align*}
DA(x) = 2\left[
\begin{array}{ccc}
\tilde{b}_{1,1} x_1 & \cdots & \tilde{b}_{1,2d}\,x_{2d}\\
\vdots & & \vdots\\
\tilde{b}_{n,1} x_1 & \cdots & \tilde{b}_{n,2d}\,x_{2d}
\end{array}
\right],
\end{align*}
where $\tilde{b}_{i,j} = (\tilde{B})_{ij}$ for $i\in [1:n]$ and $j \in [1:2d]$. Therefore,
\begin{align*}
-DA(x)^\top(A(x) - b) = -2x \odot (\tilde{B}^\top (A(x) - b)),
\end{align*}
where $\odot$ denotes the Hadamard product. Consequently,
\begin{align*}
& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\
& = \text{dist} \left( -DA(x)^\top (A(x)-b), \{0\} \right) \nonumber\\
& = \left\| -DA(x)^\top (A(x)-b) \right\| \nonumber\\
& \ge ?????,
\end{align*}
which completes the verification of the regularity condition.

We generate the matrix $B$ synthetically, with entries drawn i.i.d.\ from the standard Gaussian distribution $\mathcal{N}(0, 1)$. We then pick a sparsity level $k$, choose $k$ indices $\Omega \subset [1:d]$ corresponding to the nonzero entries of $z$, and assign to those entries values drawn from the standard normal distribution. The measurement vector is created as $b = Bz + \epsilon$, where $\epsilon$ is a noise vector with $\epsilon_i \sim \mathcal{N}(0, \sigma)$; we take $\sigma=10^{-3}$ in our experiments (see the sketch at the end of this subsection). We compare our algorithm against state-of-the-art primal-dual convex methods, namely Chambolle-Pock~\cite{chambolle2011first}, ASGARD~\cite{tran2018smooth} and ASGARD-DL~\cite{tran2018adaptive}.

The results are compiled in Figure~\ref{fig:bp1}. These plots show that the proposed relaxation, solved with Algorithm~\ref{Algo:2} and a second-order method as the inner solver, works well in practice for the basis pursuit problem and is competitive with the state-of-the-art methods. The relaxation turns a nonsmooth objective into a smooth one, at the cost of losing the linearity of the equality constraint. It becomes much more meaningful for more structured norms whose proximal operators are hard to compute. One such case is the latent group lasso norm~\cite{obozinski2011group}, a group norm with overlapping groups, namely,
\begin{align*}
\|z\|_{\Omega} = \sum_{i=1}^I \| z_{\Omega_i} \|,
\end{align*}
where the sets $\Omega_i$ are not necessarily disjoint. Although not studied here, we believe that the results presented in this paper for the basis pursuit problem can serve as a basis for more involved relaxations of such group norms; this is left as future work.
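For concreteness, the synthetic setup described above can be generated as in the following Python sketch; the dimensions match the left panel of Figure~\ref{fig:bp1}, while the sparsity level and random seed are arbitrary choices for illustration.
\begin{verbatim}
import numpy as np

# Synthetic basis pursuit data: Gaussian B, k-sparse z, noisy b.
rng = np.random.default_rng(1)
n, d, k, sigma = 100, 10000, 10, 1e-3

B = rng.standard_normal((n, d))               # entries ~ N(0, 1)
z = np.zeros(d)
support = rng.choice(d, size=k, replace=False)
z[support] = rng.standard_normal(k)           # nonzero entries ~ N(0, 1)
b = B @ z + sigma * rng.standard_normal(n)    # b = Bz + noise
\end{verbatim}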
%\subsection{Latent Group Lasso \label{sec:latent lasso}}
%
%For a collection of subsets $\Omega=\{\Omega_i\}_{i=1}^{I}\subset [1:p]$, the latent group Lasso norm on $\RR^p$ takes $z\in \RR^p$ to
%\begin{align}
%\|z\|_{\Omega,1} = \sum_{i=1}^I \| z_{\Omega_i} \|.
%\end{align}
%Note that we do not require $\Omega_i$ to be disjoint. For $B\in \RR^{n\times p}$, $b\in \RR^n$, and $\lambda>0$, consider the latent group lasso as
%\begin{align}
%\min_{z\in \RR^d} \frac{1}{2}\| Bz - b\|_2^2+ \lambda \| z\|_{\Omega,1}.
%\label{eq:group_lasso}
%\end{align}
%Because $\Omega$ is not a partition of $[1:p]$, computing the proximal operator of $\|\cdot\|_{\Omega}$ is often intractable, ruling out proximal gradient descent and similar algorithms for solving Program~\eqref{eq:group_lasso}. Instead, often Program~\eqref{eq:group_lasso} is solved by Alternating Direction Method of Multipliers (ADMM). More concretely, ???
%
%We take a radically different approach here and cast Program~\eqref{eq:group_lasso} as an instance of Program~\eqref{prob:01}. More specifically, let $z^+,z^-\in \RR^p$ be positive and negative parts of $z$, so that $z=z^+-z^-$. Let us introduce the nonnegative $u\in \RR^p$ such that $z^++z^- = u^{\circ 2}$, where we used $\circ$ notation to show entrywise power. We may now write that
%\begin{align}
%\| z^++z^- \|_{\Omega,1}
%& = \| u^{\circ 2} \|_{\Omega,1 } =: \|u\|_{\Omega,2}^2,
%\end{align}
%Unlike $\|\cdot\|_{\Omega,1}$, note that $\|\cdot\|_{\Omega,2}^2$ is differentiable and, in fact, there exists a positive semidefinite matrix $Q\in \RR^{d\times d}$ such that $\|u\|_{\Omega,2}^2=u^\top Q_\Omega u$. Let us form $x=[(z^+)^\top,(z^-)^\top, u^\top]^\top\in \RR^{3d}$ and set
%\begin{align*}
%f(x) = \frac{1}{2}\| Bz^+-Bz^- -b\|_1+ \|u\|_{\Omega,2}^2,
%\end{align*}
%\begin{align}
%g(x) = 0, \qquad A(x) = z^++z^--u^{\circ 2}.
%\end{align}
%We can now apply Algorithm~\ref{Algo:2} to solve Program~\eqref{prob:01} with $f,g,A$ specified above.
%
%
%\paragraph{Convergence rate.}
%Clearly, $f,A$ are strongly smooth in the sense that \eqref{eq:smoothness basic} holds with proper $\lambda_f,\lambda_A$. Likewise, both $f$ and the Jacobian $DA$ are continuous and the restricted Lipschitz constants $\lambda'_f,\lambda'_A$ in \eqref{eq:defn_lambda'} are consequently well-defined and finite for a fixed $\rho'>0$. We next verify the key regularity condition in Theorem~\ref{thm:main}, namely, \eqref{eq:regularity}. Note that
%\begin{align*}
%DA(x) & =
%\left[
%\begin{array}{ccc}
%I_p & I_p & -2\text{diag}(u)
%\end{array}
%\right]\in \RR^{d\times 3d},
%\end{align*}
%\begin{align*}
%-DA(x)^\top A(x) =
%\left[
%\begin{array}{c}
%-z^+-z^-+u^{\circ 2} \\
%-z^+-z^-+u^{\circ 2}\\
%2\text{diag}(u)( z^++z^--u^{\circ 2})
%\end{array}
%\right],
%\end{align*}
%\begin{align*}
%& \text{dist} \left( -DA(x)^\top A(x), \frac{\partial g(x)}{\b} \right) \nonumber\\
%& = \text{dist} \left( -DA(x)^\top A(x), \{0\} \right) \nonumber\\
%& = \left\| -DA(x)^\top A(x) \right\| \nonumber\\
%& \ge \sqrt{2} \| A(x)\|,
%\end{align*}
%namely, \eqref{eq:regularity} holds with $\nu=1$.

\subsection{k-Means Clustering}
Given data points $\{y_i\}_{i=1}^n$, the Euclidean distance matrix $D \in \RR^{n\times n}$ is defined as $(D)_{i, j} = \left\| y_i - y_j\right\|^2$. Clustering then amounts to finding a co-association matrix $Y$ such that $(Y)_{ij} = 1$ if points $i$ and $j$ belong to the same cluster and $(Y)_{ij} = 0$ otherwise. A convex semidefinite programming (SDP) relaxation of the k-means clustering problem was proposed in~\cite{Peng2007}:
\begin{align}
\min_{Y \in \RR^{n\times n}} \text{Tr}(DY) ~~ \text{s.t.} ~~ &Y\mathbf{1} = \mathbf{1},~\text{Tr}(Y) = k,~\nonumber\\
& Y\succeq 0,~Y \geq 0,
\label{eq:sdp_svx}
\end{align}
where $k$ is the number of clusters, $Y \succeq 0$ requires $Y$ to be positive semidefinite, and $Y \geq 0$ requires all entries of $Y$ to be nonnegative. Standard SDP solvers cannot handle large data sets, since they typically require a projection onto the semidefinite cone, whose complexity is of order $\mathcal{O}(n^3)$; in other words, the time required to solve the problem grows cubically with the number of data points.
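For small instances, the convex relaxation~\eqref{eq:sdp_svx} can be prototyped directly with an off-the-shelf SDP modeling tool. The following CVXPY sketch is only an illustration of the template (it is not one of the baselines used in our experiments) and is practical only for small $n$.
\begin{verbatim}
import cvxpy as cp
import numpy as np

# Illustrative CVXPY prototype of the convex k-means SDP (small n only).
def kmeans_sdp(D: np.ndarray, k: int) -> np.ndarray:
    n = D.shape[0]
    Y = cp.Variable((n, n), PSD=True)             # Y symmetric and PSD
    constraints = [Y @ np.ones(n) == np.ones(n),  # Y 1 = 1
                   cp.trace(Y) == k,              # Tr(Y) = k
                   Y >= 0]                        # entrywise nonnegativity
    cp.Problem(cp.Minimize(cp.trace(D @ Y)), constraints).solve()
    return Y.value
\end{verbatim}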
To remedy this, we adopt the Burer-Monteiro factorization approach, sacrificing convexity to reduce the computational complexity. The resulting program is
\begin{align}
\label{eq:nc_cluster}
\min_{V \in \RR^{n\times r}} \text{Tr}(DVV^{\top}) ~~ \text{s.t.} ~~ &VV^{\top}\mathbf{1} = \mathbf{1},~\|V\|_F^2 = k,~\nonumber\\
~&V \geq 0.
\end{align}
Note that the constraint $Y \geq 0$ is replaced by the stronger, but easier to enforce, constraint $V \geq 0$. The trace constraint translates into a Frobenius norm constraint in the factorized space, and the semidefinite constraint is automatically satisfied by the factorization $Y=VV^{\top}$. See~\citep{kulis2007fast} for the details of this relaxation.

We can now cast program~\eqref{eq:nc_cluster} as an instance of program~\eqref{prob:01}. For every $i \in [1:n]$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We form $x \in \RR^d$ with $d = nr$ by stacking the rows of the factorized variable $V$,
\begin{align*}
x = [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
\end{align*}
and set
\begin{align*}
f(x) =\sum_{i,j} D_{i, j} \left\langle x_i, x_j \right\rangle,
\end{align*}
\begin{align*}
g(x) = 1_C(x), ~~~ A(x) = [x_1^{\top}\sum_j x_j, \cdots, x_n^{\top}\sum_j x_j]^{\top},
\end{align*}
with the constraint $A(x) = b$, where $C$ is the intersection of the positive orthant with the $\ell_2$-norm ball of radius $\sqrt{k}$ and $b$ is the all-ones vector, i.e., $b=\mathbf{1} \in \RR^n$ (see the sketch at the end of this subsection). To verify the condition in~\eqref{eq:regularity}, let us write down the Jacobian of the nonlinear operator $A$. With $\bar{x} := \sum_{j=1}^n x_j$, the $(i,j)$th $1\times r$ block of $DA(x) \in \RR^{n \times nr}$ is
\begin{align}
\left(DA(x)\right)_{i,j} =
\begin{cases}
(x_i + \bar{x})^\top, & j = i,\\
x_i^\top, & j \ne i.
\end{cases}
\label{eq:Jacobian clustering}
\end{align}
Consequently, the $i$th $r$-dimensional block of $-DA(x)^\top (A(x)-b)$ is
\begin{align*}
\left(-DA(x)^\top (A(x)-b)\right)_i = -\sum_{j=1}^n (A(x)-b)_j\, x_j \;-\; (A(x)-b)_i\, \bar{x}.
\end{align*}
Therefore,
\begin{align*}
& \text{dist} \left( -DA(x)^\top (A(x)-b), \frac{\partial g(x)}{\b} \right) \nonumber\\
& = \left\| P_{T_C(x)}\left(-DA(x)^\top (A(x)-b)\right) \right\| \nonumber\\
& \ge ??,
\end{align*}
where the first equality directly follows from~\eqref{eq:dist_subgrad}.

We solve the inner problems in the second step of Algorithm~\ref{Algo:2} with two types of solvers, namely L-BFGS (or trust regions) and APGM. L-BFGS is a limited-memory version of the BFGS algorithm~\cite{fletcher2013practical} that approximates the second-order information. APGM is a method for solving nonconvex problems of the form~\eqref{e:exac} with convergence guarantees to first-order stationarity, as discussed in Section~\ref{sec:cvg rate}. We compare our approach against two convex methods:
\begin{itemize}
\item HCGM: the homotopy-based conditional gradient method in~\cite{yurtsever2018conditional}, which applies to~\eqref{eq:sdp_svx};
\item SDPNAL+: an augmented Lagrangian method for semidefinite programming~\cite{yang2015sdpnal}.
\end{itemize}
The results are depicted in Figure~\ref{fig:clustering}. {\bf Todo(mfs)}: add figure, table and discussion of results here.
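For reference, the factorized ingredients above can be implemented as in the following Python sketch; these are illustrative helper functions written directly from the definitions of $f$, $A$, and $C$, not the exact code used in the experiments.
\begin{verbatim}
import numpy as np

# x stores the rows of V (n x r) stacked into a vector of length d = n*r.
def f(x, D, n, r):
    V = x.reshape(n, r)
    return np.trace(D @ (V @ V.T))   # sum_{i,j} D_ij <x_i, x_j>

def A(x, n, r):
    V = x.reshape(n, r)
    return V @ (V.T @ np.ones(n))    # i-th entry: x_i^T sum_j x_j

def project_C(x, k):
    # C: positive orthant intersected with the l2 ball of radius sqrt(k)
    x = np.maximum(x, 0.0)
    nrm = np.linalg.norm(x)
    return x if nrm <= np.sqrt(k) else x * (np.sqrt(k) / nrm)
\end{verbatim}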