\section{Experiments \label{sec:experiments}}
\paragraph{Recall the condition.} In problem~(\ref{prob:01}) with $m\le d$, for $\rho,\rho'>0$ and subspace $S\subset \RR^{d}$, let
\begin{align}
\nu(g,A,S,\rho,\rho') :=
\begin{cases}
\underset{x\in \mathbb{R}^d}{\min} \, \frac{\left\| P_S P_{\cone(\partial g(x))^*} ( \D A(x)^\top \cdot A(x)) \right\|}{\|A(x)\|} \\
\text{subject to } 0< \|A(x)\|\le \rho,
\, \|x\|\le \rho',
\end{cases}
\label{eq:new slater defn}
\end{align}
where $\cone(\partial g(x))$ is the conic hull of the subdifferential $\partial g(x)$, and $P_{\cone(\partial g(x))^*}$ is the projection onto the polar of this cone; see \eqref{eq:polar} and \eqref{eq:conic}. Moreover, $\D A(x)$ is the Jacobian of $A$. We say that (\ref{prob:01}) satisfies geometric regularity if $\nu(g,A,S,\rho,\rho') >0$.
In particular, when $S = \mathbb{R}^{d}$, the Moreau decomposition~\cite{moreau1962decomposition} allows us to simplify \eqref{eq:new slater defn} as
\begin{align}
\nu(g,A,S,\rho,\rho') :=
\begin{cases}
\underset{x}{\min} \, \frac{ \dist\left( \D A(x)^\top \cdot A(x) , \cone(\partial g(x))\right) }{\|A(x)\|} \\
\text{subject to } 0< \|A(x)\|\le \rho, \|x\|\le \rho',
\end{cases}
\label{eq:defn_nu_A}
\end{align}
where $\dist(\cdot,\text{cone}(\partial g(x)))$ returns the Euclidean distance to $\cone(\partial g(x))$.
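As a practical illustration (not part of the analysis), the quantity in \eqref{eq:defn_nu_A} can be estimated by random search when $g\equiv 0$, in which case the distance reduces to $\|\D A(x)^\top A(x)\|$. In the sketch below, \texttt{A} and \texttt{DA} are user-supplied callables, and the sampled minimum only upper-bounds the true $\nu$:
\begin{verbatim}
import numpy as np

def estimate_nu(A, DA, d, rho, rho_prime, trials=100000, seed=0):
    # heuristic random-search estimate of nu in eq. (defn_nu_A), g = 0;
    # returns inf if no sampled point is feasible
    rng = np.random.default_rng(seed)
    best = np.inf
    for _ in range(trials):
        x = rng.standard_normal(d)             # random direction ...
        x *= rho_prime * rng.random() ** (1.0 / d) / np.linalg.norm(x)
        norm_Ax = np.linalg.norm(A(x))         # ... uniform in the ball
        if 0.0 < norm_Ax <= rho:               # feasibility constraints
            best = min(best, np.linalg.norm(DA(x).T @ A(x)) / norm_Ax)
    return best
\end{verbatim}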
\section{Maxcut}
Given an undirected graph $G = (V, E)$ with vertex set $V$ and edge set $E$, the maximum-cut (max-cut) problem asks for a partition of the vertices into two sets that maximizes the total weight of the edges crossing the cut. It is a fundamental combinatorial problem in computer science and is NP-hard. Let $C$ denote the symmetric graph Laplacian matrix, whose entries encode the edge weights of the graph. We solve the following SDP relaxation of the problem:
\begin{align}
\min_{X \in \mathbb{R}^{n \times n}} &\quad \frac{1}{4} \langle C, X \rangle \quad
\text{s.t.} \quad \text{diag}(X) = \mathbf{1}, \quad \text{trace}(X) = n, \quad X \succeq 0,
\label{eq:maxcutsdp}
\end{align}
where $\mathbf{1} \in \mathbb{R}^{n}$ is the vector of all ones.
We solve the following Burer--Monteiro (BM) factorization of the problem, obtained via the change of variables $X = YY^{\top}$:
\begin{align}
\min_{Y \in \mathbb{R}^{n \times r}} \quad \frac{1}{4} \text{trace}(Y^{\top}CY) \quad
\text{s.t.} \quad \text{diag}(YY^{\top}) = \mathbf{1}, \quad \|Y\|_F^2 \leq n.
\label{eq:maxcutnc}
\end{align}
Note here that we relaxed the equality constraint $\text{trace}(X) = n$ to the inequality constraint $\|Y\|_F^2 \leq n$ while doing the factorization. Although the two constraints are not equivalent, every $Y$ that satisfies $\text{diag}(YY^{\top}) = \mathbf{1}$ automatically satisfies $\|Y\|_F^2 = n$, so the relaxation is tight at feasible points.
For every $i$, let $x_i\in\mathbb{R}^r$ denote the $i$th row of $Y$. Let us form $x\in\mathbb{R}^d$ with $d=nr$ by vectorizing $Y$, namely,
\begin{equation}
x = [x_1^\top \cdots x_{n}^\top]^\top.
\end{equation}
We can therefore cast \eqref{eq:maxcutnc} in the form of the original problem~\eqref{prob:01} with
\begin{align}
f(x) = \frac{1}{4}\sum_{i,j} C_{i,j} \langle x_i, x_j \rangle,\quad A: x \mapsto [\|x_1\|^2 - 1, \cdots, \|x_{n}\|^2 - 1]^\top.
\end{align}
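In NumPy, this splitting reads as follows (a minimal sketch in our notation, where \texttt{Y} is the $n\times r$ factor with rows $x_i$ and \texttt{C} is the Laplacian):
\begin{verbatim}
import numpy as np

def maxcut_f(Y, C):
    # f(x) = (1/4) trace(Y^T C Y) = (1/4) sum_{ij} C_ij <x_i, x_j>
    return 0.25 * np.trace(Y.T @ C @ Y)

def maxcut_A(Y):
    # A(x) = [||x_1||^2 - 1, ..., ||x_n||^2 - 1]
    return np.sum(Y**2, axis=1) - 1.0
\end{verbatim}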
\paragraph{Condition verification.}
First, we compute the Jacobian of the nonlinear operator $A$:
\begin{align}
DA(x) =
2\left[
\begin{array}{ccc}
x_1^\top & \cdots & 0\\
\vdots & \ddots & \vdots\\
0 & \cdots & x_{n}^\top
\end{array}
\right]\in \mathbb{R}^{n\times d}.
\end{align}
In particular, if we take $S=\mathbb{R}^d$ and $\rho <1$, then the projection $P_{\text{cone}(\partial g(x))^*}$ reduces to the identity map, and thus
\begin{align}
\nu(g,A,S,\rho,\rho') & \ge
\begin{cases}
\min_x \, \eta_{\min}\left(
DA(x)
\right)\\
\text{subject to } 0 < \|A(x)\| \le \rho, \, \|x\|\le \rho'
\end{cases} \nonumber\\
& =
\begin{cases}
\min_x \, 2\min_i \|x_i\| \\
\text{subject to }\|x\|\le \rho',\quad \|A(x)\| > 0, \quad \left| \|x_i\|^2 - 1 \right| \le \rho
\qquad \forall i,
\end{cases}
\nonumber\\
& \ge 2\sqrt{1- \rho} >0.
\end{align}
Above, $\eta_{\min}(\cdot)$ returns the smallest singular value of its input; for the block-diagonal Jacobian above, $\eta_{\min}(DA(x)) = 2\min_i \|x_i\|$, and the constraint $\|A(x)\|\le \rho$ forces $\|x_i\|^2 \ge 1-\rho$ for every $i$. Consequently, the condition in \eqref{eq:new slater defn} holds for the max-cut problem.
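The identity $\eta_{\min}(DA(x)) = 2\min_i\|x_i\|$ is easy to sanity-check numerically; the following NumPy sketch (our own illustration, toy dimensions) does so on a random instance:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, r = 5, 3
Y = rng.standard_normal((n, r))

# assemble the block-diagonal Jacobian DA(x), one row per constraint
DA = np.zeros((n, n * r))
for i in range(n):
    DA[i, i * r:(i + 1) * r] = 2.0 * Y[i]

eta_min = np.linalg.svd(DA, compute_uv=False)[-1]
print(np.isclose(eta_min, 2.0 * np.linalg.norm(Y, axis=1).min()))  # True
\end{verbatim}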
We solve the max-cut problem on the Gset random graphs in \cite{??} and compare our algorithm against the first-order method of Manopt. The results are compiled in Table~\ref{table:maxcut}.
\section{Clustering}
Given data points $\{z_i\}_{i=1}^n $, the entries of the corresponding Euclidean distance matrix $D \in \RR^{n\times n}$ are $ D_{i, j} = \left\| z_i - z_j\right\|^2 $.
Clustering is then the problem of finding a co-association matrix $Y\in \RR^{n\times n}$ such that $Y_{ij} = 1$ if points $z_i$ and $z_j$ are within the same cluster and $Y_{ij} = 0$ otherwise. In~\cite{Peng2007}, the authors provide an SDP relaxation of the clustering problem, specified as
\begin{align}
%\begin{cases}
\underset{Y \in \RR^{n\times n}}{\min} \text{tr}(DY) \subjto Y\mathbf{1} = \mathbf{1},
~\text{tr}(Y) = s,~
Y\succeq 0,~Y \geq 0,
\label{eq:sdp_svx}
\end{align}
where $s$ is the number of clusters and $Y $ is both positive semidefinite and has nonnegative entries.
Standard SDP solvers do not scale well with the number of data points~$n$, since they often require a projection onto the semidefinite cone, which has $\mathcal{O}(n^3)$ complexity. We instead use the BM factorization to solve \eqref{eq:sdp_svx}, sacrificing convexity to reduce the computational complexity. More specifically, we solve the program
\begin{align}
\label{eq:nc_cluster}
\underset{V \in \RR^{n\times r}}{\min} \text{tr}(DVV^{\top}) \subjto
VV^{\top}\mathbf{1} = \mathbf{1},~~ \|V\|_F^2 \le s,
~~V \geq 0,
\end{align}
where $\mathbf{1}\in \RR^n$ is the vector of all ones.
Note that $Y \geq 0$ in \eqref{eq:sdp_svx} is replaced above by the much stronger but easier-to-enforce constraint $V \geq 0$ in \eqref{eq:nc_cluster}, see~\cite{kulis2007fast} for the reasoning behind this relaxation.
Now, we can cast~\eqref{eq:nc_cluster} as an instance of~\eqref{prob:01}. Indeed, for every $i\le n$, let $x_i \in \RR^r$ denote the $i$th row of $V$. We next form $x \in \RR^d$ with $d = nr$ by expanding the factorized variable $V$, namely,
$
x := [x_1^{\top}, \cdots, x_n^{\top}]^{\top} \in \RR^d,
$
and then set
\begin{align*}
f(x) =\sum_{i,j=1}^n D_{i, j} \left\langle x_i, x_j \right\rangle,
\qquad g = \delta_C, \qquad
A(x) = [x_1^{\top}\sum_{j=1}^n x_j -1, \cdots, x_n^{\top}\sum_{j=1}^n x_j-1]^{\top},
\end{align*}
where $C$ is the intersection of the positive orthant in $\RR^d$ with the Euclidean ball of radius $\sqrt{s}$. In the sequel, we verify that the regularity condition is satisfied with the $f,g,A$ specified above; a code sketch of this instance follows.
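For concreteness, here is a minimal NumPy sketch of this instance (our own illustration; the projection formula uses the fact that $C$ is the intersection of a closed convex cone with a ball centered at the origin, so one may clip first and rescale afterwards):
\begin{verbatim}
import numpy as np

def cluster_f(V, D):
    # f(x) = tr(D V V^T) = sum_{ij} D_ij <x_i, x_j>
    return np.trace(D @ V @ V.T)

def cluster_A(V):
    # A(x) = V V^T 1 - 1
    return V @ (V.T @ np.ones(V.shape[0])) - 1.0

def project_C(V, s):
    # prox of g = delta_C: clip to the nonnegative orthant,
    # then rescale into the Frobenius ball of radius sqrt(s)
    W = np.maximum(V, 0.0)
    nrm = np.linalg.norm(W)
    return W if nrm <= np.sqrt(s) else W * (np.sqrt(s) / nrm)
\end{verbatim}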
\paragraph{Condition verification.}
Note that
\begin{align}
A(x) = VV^\top \mathbf{1}- \mathbf{1},
\end{align}
\begin{align}
DA(x) & =
\left[
\begin{array}{cccc}
w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\
\vdots\\
w_{n,1}x_{n}^\top & \cdots & w_{n,n} x_{n}^\top
\end{array}
\right] \nonumber\\
& = \left[
\begin{array}{ccc}
V & \cdots & V
\end{array}
\right]
+
\left[
\begin{array}{ccc}
x_1^\top & \\
& \ddots & \\
& & x_n^\top
\end{array}
\right],
\label{eq:Jacobian clustering}
\end{align}
where $w_{i,i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the second matrix is block-diagonal. For the $k$th iterate $x_k$, define $V_k$ accordingly and let $x_{k,i}$ be the $i$th row of $V_k$.
Consequently,
\begin{align}
DA(x_k)^\top A(x_k) & =
\left[
\begin{array}{c}
(V_k^\top V_k - I_r) V_k^\top \mathbf{1}\\
\vdots\\
(V_k^\top V_k - I_r) V_k^\top \mathbf{1}
\end{array}
\right] \nonumber\\
& \qquad +
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right],
\end{align}
where $I_r\in \RR^{r\times r}$ is the identity matrix.
Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$ (which can be enforced in the iterates by replacing $C$ with $(1-\epsilon)C$ for a small positive $\epsilon$ in the subproblems). Under this assumption, it follows that
\begin{align}
(\partial g(x_k))_i =
\begin{cases}
0 & (x_k)_i > 0\\
\{a: a\le 0\} & (x_k)_i = 0,
\end{cases}
\qquad i\le d.
\label{eq:exp-subgrad-cluster}
\end{align}
Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_r$. This too can be enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they help us simplify the presentation here. Under these assumptions,
\begin{align}
& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\
& = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2
\qquad (a_+ = \max(a,0))
\nonumber\\
& =
\left\|
\left[
\begin{array}{c}
x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
\vdots \\
x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
\end{array}
\right]
\right\|^2
\qquad (x_k\in C \Rightarrow x_k\ge 0)
\nonumber\\
& = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& \ge \min_i \| x_{k,i}\|^2
\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
& = \min_i \| x_{k,i}\|^2
\cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2.
\label{eq:final-cnd-cluster}
\end{align}
Therefore, given a prescribed $\nu$, ensuring $\min_i \|x_{k,i}\| \ge \nu$ guarantees \eqref{eq:regularity}. When the algorithm is initialized close enough to the constraint set, there is indeed no need to separately enforce \eqref{eq:final-cnd-cluster}. In practice, often $n$ exceeds the number of true clusters and a more intricate analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$.
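For completeness, the coordinate-wise distance to the subdifferential in \eqref{eq:exp-subgrad-cluster} is cheap to evaluate; a small NumPy sketch (our own illustration, with \texttt{v} standing for $-DA(x_k)^\top A(x_k)$; the scaling by $\b_{k-1}$ is immaterial since each coordinate set is $\{0\}$ or a ray):
\begin{verbatim}
import numpy as np

def dist2_to_subdiff(v, x, tol=1e-12):
    # squared distance from v to the set in eq. (exp-subgrad-cluster):
    # coordinates with x_i > 0 contribute v_i^2        (the set is {0});
    # coordinates with x_i = 0 contribute max(v_i,0)^2 (the set is R_-)
    pos = x > tol
    return np.sum(v[pos]**2) + np.sum(np.maximum(v[~pos], 0.0)**2)
\end{verbatim}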
\paragraph{Experimental setup.}
In our simulations, we compare our algorithm against iALM~\cite{sahin2019inexact} and two convex solvers: the homotopy-based conditional gradient method (HCGM)~\cite{yurtsever2018conditional}, and SDPNAL+, a second-order augmented Lagrangian method for solving SDPs with nonnegativity constraints~\cite{yang2015sdpnal}.
At each iteration of iALM, a subproblem must be solved to a user-specified accuracy; we use the first-order APGM solver for these subproblems.
\paragraph{Dataset.} Our experimental setup is similar to that of~\cite{mixon2016clustering}. We use the publicly available Fashion-MNIST dataset~\cite{xiao2017/online}, released as a drop-in replacement for the MNIST handwritten digits. Each data point is a $28\times 28$ grayscale image, associated with one of ten classes, labeled from $0$ to $9$.
First, we extract meaningful features from this dataset using a simple two-layer neural network with sigmoid activations. We then apply this network to 1000 test samples from the same dataset, which yields a vector of length $10$ for each data point, whose entries are the posterior probabilities of the ten classes. Finally, we form the $\ell_2$ distance matrix ${D}$ from these probability vectors.
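Concretely, the distance matrix can be formed from the probability vectors with a few lines of NumPy; in the sketch below, \texttt{P} is assumed to be the $1000\times 10$ matrix of class posteriors produced by the network:
\begin{verbatim}
import numpy as np

def distance_matrix(P):
    # D_ij = ||p_i - p_j||^2 via the Gram-matrix identity
    sq = np.sum(P**2, axis=1)
    D = sq[:, None] + sq[None, :] - 2.0 * P @ P.T
    return np.maximum(D, 0.0)   # clip tiny negatives from round-off
\end{verbatim}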
The solution rank for the template~\eqref{eq:sdp_svx} is known and is equal to the number of clusters $s$ \cite[Theorem~1]{kulis2007fast}. As discussed in~\cite{tepper2018clustering}, setting the factorization rank $r > s$ leads to more accurate reconstruction at the expense of speed.
We therefore set the rank to 20.
The results are depicted in Figure~\ref{fig:clustering}.
We implemented three of the algorithms in MATLAB and used the software package provided for SDPNAL+, which contains MEX files.
We expect that the performance of our nonconvex approach would improve further with a MEX implementation.
Note that iALM and our HoLAL algorithm perform similarly. However, the rates provided in \cite{sahin2019inexact} for iALM apply not to the constrained problem but to the augmented Lagrangian subproblem.
\begin{figure}[]
\begin{center}
{\includegraphics[width=.4\columnwidth]{../figures/journal_obj_res.pdf}}
{\includegraphics[width=.4\columnwidth]{../figures/journal_feas.pdf}}
\caption{Clustering running time comparison. }
\label{fig:clustering}
\end{center}
\end{figure}
\section{Basis Pursuit}
Basis Pursuit (BP) finds the sparsest solution of an under-determined system of linear equations by solving
\begin{align}
\min_{z} \|z\|_1 \subjto
Bz = b,
\label{eq:bp_main}
\end{align}
where $B \in \RR^{n \times d}$ and $b \in \RR^{n}$.
Various primal-dual convex optimization algorithms are available in the literature to solve BP. We compare our algorithm against state-of-the-art primal-dual convex methods for solving \eqref{eq:bp_main}, namely, Chambolle--Pock~\cite{chambolle2011first}, ASGARD~\cite{tran2018smooth}, and ASGARD-DL~\cite{tran2018adaptive}.
Here, we take a different approach and cast~(\ref{eq:bp_main}) as an instance of~\eqref{prob:01}. Note that any $z \in \RR^d$ can be decomposed as $z = z^+ - z^-$, where $z^+,z^-\in \RR^d$ are the positive and negative parts of $z$, respectively. Then consider the change of variables $z^+ = u_1^{\circ 2}$ and $z^-= u_2^{\circ 2} \in \RR^d$, where $\circ$ denotes element-wise power. Next, we concatenate $u_1$ and $u_2$ as $x := [ u_1^{\top}, u_2^{\top} ]^{\top} \in \RR^{2d}$ and define $\overline{B} := [B, -B] \in \RR^{n \times 2d}$. Then, \eqref{eq:bp_main} is equivalent to \eqref{prob:01} with
\begin{align}
f(x) = \|x\|^2, \quad g(x) = 0, \quad
A(x) = \overline{B}x^{\circ 2}- b.
\label{eq:bp-equiv}
\end{align}
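The change of variables above is easy to validate numerically; the following sketch (our own illustration, toy dimensions) checks that $f(x) = \|x\|^2$ recovers $\|z\|_1$ and that $\overline{B}x^{\circ 2} = Bz$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d = 10, 30
B = rng.standard_normal((n, d))
z = rng.standard_normal(d)

# z = z^+ - z^-, with z^+ = u1 .^ 2 and z^- = u2 .^ 2
u1 = np.sqrt(np.maximum(z, 0.0))
u2 = np.sqrt(np.maximum(-z, 0.0))
x = np.concatenate([u1, u2])       # x = [u1; u2] in R^{2d}
B_bar = np.hstack([B, -B])         # [B, -B] in R^{n x 2d}

print(np.isclose(np.sum(x**2), np.abs(z).sum()))   # f(x) = ||z||_1
print(np.allclose(B_bar @ x**2, B @ z))            # A(x) + b = B z
\end{verbatim}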
We draw the entries of $B$ independently from a zero-mean, unit-variance Gaussian distribution. For a fixed sparsity level $k$, the support of $z_*\in \RR^d$ is chosen at random and its nonzero amplitudes are drawn from the standard Gaussian distribution.
The measurement vector is then created as $b = Bz_* + \epsilon$, where the entries of the noise vector $\epsilon$ are drawn independently from a zero-mean Gaussian distribution with variance $\sigma^2=10^{-6}$.
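For reference, the synthetic data described above can be generated as follows (a sketch with illustrative dimensions; only $\sigma^2 = 10^{-6}$ is taken from the text):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d, k = 50, 200, 10                      # illustrative sizes
B = rng.standard_normal((n, d))            # Gaussian sensing matrix
z_star = np.zeros(d)
support = rng.choice(d, size=k, replace=False)
z_star[support] = rng.standard_normal(k)   # k-sparse signal
b = B @ z_star + 1e-3 * rng.standard_normal(n)   # noise variance 1e-6
\end{verbatim}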
\paragraph{Condition verification.}
Note that
\begin{align}
DA(x) = 2 \overline{B} \text{diag}(x),
\label{eq:jacob-bp}
\end{align}
where $\text{diag}(x)\in\RR^{2d\times 2d}$ is the diagonal matrix formed by $x$.
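As a quick sanity check of \eqref{eq:jacob-bp}, the following NumPy sketch (our own illustration, toy dimensions) compares the analytic Jacobian with central finite differences:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n, d = 5, 8
Bbar = rng.standard_normal((n, 2 * d))
b = rng.standard_normal(n)
A = lambda x: Bbar @ x**2 - b

x0 = rng.standard_normal(2 * d)
J = 2 * Bbar * x0        # = 2 * Bbar @ np.diag(x0), via broadcasting
eps = 1e-6
J_fd = np.column_stack([(A(x0 + eps * e) - A(x0 - eps * e)) / (2 * eps)
                        for e in np.eye(2 * d)])
print(np.allclose(J, J_fd, atol=1e-5))   # expected: True
\end{verbatim}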
Therefore,
\begin{align}
& \text{dist} \left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{\b_{k-1}} \right) \nonumber\\
& = \text{dist} \left( -DA(x_k)^\top A(x_k) , \{0\} \right) \qquad (g\equiv 0)\nonumber\\
& = \|DA(x_k)^\top A(x_k) \| \nonumber\\
& =2 \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|.
\qquad \text{(see \eqref{eq:jacob-bp})}
\label{eq:cnd-bp-pre}
\end{align}
To bound the last line above, let $x_*$ be a solution of~\eqref{prob:01} and note that $\overline{B} x_*^{\circ 2} = b $ by definition. Let also $z_k,z_*\in \RR^d$ denote the vectors corresponding to $x_k,x_*$. Corresponding to $x_k$, also define $u_{k,1},u_{k,2}$ naturally and let $|z_k| = u_{k,1}^{\circ 2} + u_{k,2}^{\circ 2} \in \RR^d$ be the vector of amplitudes of $z_k$. To simplify matters, let us assume also that $B$ is full-rank.
We then rewrite the norm in the last line of \eqref{eq:cnd-bp-pre} as
\begin{align}
& \| \text{diag}(x_k) \overline{B}^\top ( \overline{B}x_k^{\circ 2} -b) \|^2 \nonumber\\
& = \| \text{diag}(x_k) \overline{B}^\top \overline{B} (x_k^{\circ 2} -x_*^{\circ 2}) \|^2
\qquad (\overline{B} x_*^{\circ 2} = b)
\nonumber\\
& = \| \text{diag}(x_k)\overline{B}^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(u_{k,1})B^\top B (z_k - z_*) \|^2 \nonumber\\
& \qquad + \| \text{diag}(u_{k,2})B^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}\big( (u_{k,1}^{\circ 2}+ u_{k,2}^{\circ 2})^{\circ \frac{1}{2}} \big) B^\top B (z_k - z_*) \|^2 \nonumber\\
& = \| \text{diag}(|z_k|^{\circ \frac{1}{2}}) B^\top B (z_k - z_*) \|^2 \nonumber\\
& \ge
\eta_n \big( B \text{diag}(|z_k|^{\circ \frac{1}{2}}) \big)^2
\| B(z_k - z_*) \|^2 \nonumber\\
& =
\eta_n \big( B \text{diag}(|z_k|^{\circ \frac{1}{2}}) \big)^2
\| B z_k -b \|^2
\qquad ( Bz_* = \ol{B} x^{\circ2}_* = b) \nonumber\\
& \ge \min_{|T|=n}\eta_n(B_T)^2\cdot |z_{k,(n)}| \cdot \|Bz_k - b\|^2,
\end{align}
where $\eta_n(\cdot)$ returns the $n$th largest singular value of its argument. In the last line above, $B_T$ is the restriction of $B$ to the columns indexed by the set $T$ of size $n$. Moreover, $z_{k,(n)}$ is the $n$th largest entry of $z_k$ in magnitude.
Given a prescribed $\nu$, \eqref{eq:regularity} therefore holds if
\begin{align}
|z_{k,(n)} | \ge \frac{\nu^2}{4 \min_{|T|=n} \eta_n(B_T)^2} ,
\label{eq:final-bp-cnd}
\end{align}
for every iteration $k$. This implies that, if the algorithm is initialized close enough to the solution $z_*$ and the entries of $z_*$ are sufficiently large in magnitude, there is no need to directly enforce \eqref{eq:final-bp-cnd}.
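The combinatorial quantity $\min_{|T|=n}\eta_n(B_T)$ entering \eqref{eq:final-bp-cnd} can be evaluated by brute force on very small instances; the following sketch (our own illustration, toy dimensions) does exactly that:
\begin{verbatim}
import itertools
import numpy as np

# brute-force min over all column subsets T of size n of the smallest
# singular value of B_T; feasible only for tiny d (binomial growth)
rng = np.random.default_rng(2)
n, d = 3, 6
B = rng.standard_normal((n, d))
min_sv = min(np.linalg.svd(B[:, list(T)], compute_uv=False)[-1]
             for T in itertools.combinations(range(d), n))
print(min_sv)
\end{verbatim}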
