\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
If $A_i$ denotes the $i^{\text{th}}$ component of the map $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$, note that
\begin{align}
\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
\end{align}
which implies that
\begin{align}
\nabla_x \mathcal{L}_\beta(x,y) & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \b \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
& = \nabla f(x) + \D A(x)^\top y + \b \D A(x)^\top A(x),
\end{align}
where $\D A(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach
\begin{align}
\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) + \b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
\end{align}
It follows that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \| \nabla^2 f(x) \|+ \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) + \beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\
& \le \lambda_f+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|\D A(x)\|^2_F.
\end{align}
For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\|
& \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b \max_{\|x\|\le \rho'}\|\D A(x)\|_F^2,
\end{align}
which completes the proof of Lemma \ref{lem:smoothness}.
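
As a quick sanity check on this bound (not part of the proof), one may take $d=m=1$, $f(x)=x^2/2$, and $A(x)=x^2-1$, for which the Hessian above specializes to
\begin{align*}
\nabla_x^2 \mathcal{L}_\beta(x,y) = 1 + 2\left( y + \b (x^2-1) \right) + 4\b x^2,
\end{align*}
so that $|\nabla_x^2 \mathcal{L}_\beta(x,y)| \le 1 + 2(|y|+\b\rho) + 4\b \rho'^2$ whenever $|A(x)|\le \rho$ and $|x|\le \rho'$, in agreement with the general bound (here $\lambda_f=1$, $\lambda_A=2$, $m=1$, and $\max_{|x|\le \rho'}\|\D A(x)\|_F^2 = 4\rho'^2$).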
\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
Throughout, let
\begin{align}
G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
\end{align}
for short.
Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho'$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
\begin{align}
\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle
+ \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\
& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle
+ \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+).
\label{eq:descent pr 1}
\end{align}
Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
\begin{align}
G - \nabla_x \L_\b(x,y) = \xi \in \partial g(x^+).
\label{eq:opt of prox}
\end{align}
By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
\begin{align}
\L_\b(x^+,y)+g(x^+) &
\le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2,
\end{align}
where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.
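
As a side note, in the unconstrained smooth case $g\equiv 0$ (so that $P_g$ is the identity, $x^+ = x - \g \nabla_x \L_\b(x,y)$, $G=\nabla_x \L_\b(x,y)$, and $\xi=0$), the chain of inequalities above reduces to the classical descent inequality for gradient descent,
\begin{align*}
\L_\b(x^+,y) & \le \L_\b(x,y) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|\nabla_x \L_\b(x,y)\|^2 \\
& \le \L_\b(x,y) - \frac{\g}{2} \|\nabla_x \L_\b(x,y)\|^2,
\end{align*}
where the last step again uses $\g\le 1/\lambda_\b$.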
\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
By optimality of $x^+_{\gamma}$ in \eqref{eq:defn of x+gamma}, we note that
\begin{equation}
x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = - \g\xi \in -\g \partial g (x^+_{\gamma}).
\label{eq:optimality of uplus}
\end{equation}
By definition in \eqref{eq:defn of gamma line search}, $\gamma$ also satisfies
\begin{align}
& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
& \le \mathcal{L}_\beta(x,y) + \left\langle
x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y)
\right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2
+ g(x_\g^+)
\nonumber\\
& = \mathcal{L}_\beta(x,y) + \left\langle
x - x^+_{\gamma} ,\xi
\right\rangle
- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\
& \le \mathcal{L}_\beta(x,y)
- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g)
\qquad (\text{convexity of } g) \nonumber\\
& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g),
\qquad \text{(see Definition \ref{def:grad map})}
\end{align}
which completes the proof of Lemma \ref{lem:eval Lipsc cte} since \eqref{eq:moreover} follows directly from \eqref{eq:defn of gamma line search}.
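
As a side remark (not needed for the proof), the inequality that $\gamma$ satisfies by \eqref{eq:defn of gamma line search} holds automatically once $\g\le 1/\lambda_\b$, provided $x$ and $x^+_\g$ remain in the region where Lemma \ref{lem:smoothness} applies: in that case,
\begin{align*}
\mathcal{L}_\beta(x^+_\g,y)
& \le \mathcal{L}_\beta(x,y) + \left\langle x^+_\g - x, \nabla_x \mathcal{L}_\beta(x,y) \right\rangle + \frac{\lambda_\b}{2}\|x^+_\g - x\|^2 \\
& \le \mathcal{L}_\beta(x,y) + \left\langle x^+_\g - x, \nabla_x \mathcal{L}_\beta(x,y) \right\rangle + \frac{1}{2\g}\|x^+_\g - x\|^2,
\qquad (\g\le 1/\lambda_\b)
\end{align*}
and adding $g(x^+_\g)$ to both sides recovers the inequality used at the beginning of this proof, which suggests that a backtracking choice of $\g$ need never shrink far below $1/\lambda_\b$.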
\section{Proof of Lemma \ref{lem:bnd bnd Ak} \label{sec:proof of bnd Ak}}
By assumption, we have that
\begin{align}
\max_{k\in K} \|A(x_k)\| \le \rho,
\qquad \max_{k\in K} \| x_k\| \le \rho'.
%\qquad \max_{k\in K} \| x_{k+1}-x_k \| \le \rho''.
\label{eq:bndness in slater proof}
\end{align}
From the primal update in \eqref{eq:update uk recall} and the definition of the proximal operator $P_g$~\cite{parikh2014proximal}, it follows that
\begin{align}
x_{k+1}-x_k + \g_k \nabla f(x_k) + \g_k \D A(x_k)^\top (y_k + \b_k A(x_k) ) \in - \g_k \partial g(x_{k+1}),
\end{align}
where $\partial g(x_{k+1})$ is the subdifferential of $g$ at $x_{k+1}$.
After recalling the definition of the gradient mapping in \eqref{eq:grad map recall} and dividing both sides by $\b_k \g_k$, the above inclusion can be written as
\begin{align}
-\frac{ G_k}{\b_k} + \frac{ \nabla f(x_k)}{\b_k}
+ \frac{ \D A(x_k)^\top y_k }{\b_k} + \D A(x_k)^\top A(x_k) \in - \frac{\partial g(x_{k+1})}{\b_k}.
\label{eq:to be projected}
\end{align}
Let $\cone(\partial g(x))^*$ denote the polar of
\begin{equation}
\cone(\partial g(x)) = \bigcup_{\alpha \ge 0} \alpha \cdot \partial g(x) \subseteq \RR^d.
\label{eq:defn of dual cone}
\end{equation}
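
For illustration only (this particular $g$ is not assumed in the lemma), if $g=1_C$ is the indicator function of $C=\RR^d_+$ and $x\in C$, then $\partial g(x)$ is the normal cone $\{v: v_i\le 0 \text{ if } x_i=0,\ v_i=0 \text{ if } x_i>0\}$, which is already a cone, and
\begin{align*}
\cone(\partial g(x))^* = \left\{ u\in\RR^d : u_i \ge 0 \text{ whenever } x_i = 0 \right\},
\end{align*}
so that $P_{\cone(\partial g(x))^*}$ replaces $u_i$ with $\max(u_i,0)$ on the coordinates where $x_i=0$ and leaves the remaining coordinates unchanged.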
By projecting both sides of \eqref{eq:to be projected} onto $\cone(\partial g(x_{k+1}))^*$, we find that
\begin{align}
& P_{\cone(\partial g(x_{k+1}))^* } \left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k} + \D A(x_k)^\top A(x_k) \right) \nonumber\\
& \in P_{\cone(\partial g(x_{k+1}))^* } \left( - \frac{\partial g(x_{k+1}) }{\b_k } \right) = \{ 0\},
\label{eq:before Sk}
\end{align}
where the equality above follows from the duality of $\cone(\partial g(x_{k+1}))^*$ and $\cone(\partial g(x_{k+1}))$.
Recall also that the subspace $S\subseteq \RR^d$ by assumption satisfies
\begin{align}
S \supseteq \bigcup_{k\in K} P_{ \cone( \partial g( x_{k+1}) )^* }\left( \D A(x_{k+1})^\top A(x_{k+1}) \right),
\label{eq:defn of S}
\end{align}
and project both sides of \eqref{eq:before Sk} onto $S$ to reach
\begin{align}
& P_{S}P_{\cone(\partial g(x_{k+1}))^*}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k} + \D A(x_k)^\top A(x_k) \right) = 0.
\label{eq:pre norm}
\end{align}
By taking the norm and then applying the triangle inequality above, we argue that
\begin{align}
& \left\|
P_{S} P_{\cone(\partial g(x_{k+1}))^*}( \D A(x_k)^\top A(x_k) )
\right\| \nonumber\\
& \le
\left\| P_{S} P_{\cone(\partial g(x_{k+1}))^*} \left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k}
\right) \right\|
\qquad \text{(see \eqref{eq:pre norm})}.
\label{eq:post norm}
\end{align}
Because the projections $P_{S}$ and $P_{\cone(\partial g(x_{k+1}))^*}$ are non-expansive and $P_{S}P_{\cone(\partial g(x_{k+1}))^*}(0) = 0$, we may upper bound the last line above as
\begin{align}
& \left\|
P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_k)^\top A(x_k) )
\right\| \nonumber\\
& \le
\left\| - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k}
\right\| \nonumber\\
& \le \frac{1}{\b_k} \left( \|G_k\| + \|\nabla f(x_k) \| + \|\D A(x_k)^\top y_k\| \right)
\qquad \text{(triangle inequality)} \nonumber\\
& \le \frac{1}{\b_k} \left( \|G_k\| +\lambda'_f+ \lambda'_A \|y_k\| \right),
\label{eq:slater proof 0}
\end{align}
where
\begin{align}
\lambda'_f := \max_{\|x\| \le \rho'} \| \nabla f(x)\|,
\qquad
\lambda'_A := \max_{\|x\| \le \rho'} \| \D A(x)\|,
\end{align}
are the local Lipschitz constants of $f$ and $A$, respectively.
To lower bound the first line of \eqref{eq:slater proof 0}, we invoke the restricted injectivity in Section~\ref{sec:slater}.
%
%
%
%must impose a restricted injectivity as follows.
%We let $S_{K}$ with orthonormal columns denote a basis for the subspace $S_K$ in \eqref{eq:defn of S}, to simplify the notation. We then assume that $ \nu_A>0$, where
%\begin{align}
%\nu_A :=
%\begin{cases}
%\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x_{k+1})} ( DA(x)^\top v) \right\|}{\|v\|} \\
%\|v\|\le \rho\\
%\|x\|\le \rho'.
%\end{cases}
%\label{eq:new slater proof}
%\end{align}
Indeed, recalling \eqref{eq:new slater defn} and the first bound in \eqref{eq:bndness in slater proof}, for every $k\in K $, we write that
\begin{align}
& \left\|
P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_{k})^\top A(x_k))
\right\| \nonumber\\
& \ge \left\|
P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_{k+1})^\top A(x_{k}))
\right\|
- \left\|
(\D A(x_{k+1}) - \D A(x_{k})) ^\top A(x_{k})
\right\| \nonumber\\
& \ge \nu(g,A,S,\rho,\rho') \|A(x_k)\| - \| \D A(x_{k+1})-\D A(x_k) \| \|A(x_k) \|,
\qquad \text{(see \eqref{eq:new slater defn})}
\label{eq:slater proof 1}
\end{align}
where the second line above again uses the non-expansiveness of $P_{S}$ and $P_{\cone(\partial g(x_{k+1}))^*}$. The remaining term in \eqref{eq:slater proof 1} is bounded as
\begin{align}
\| \D A(x_{k+1})-\D A(x_k) \| & \le \lambda_A \|x_{k+1}-x_k\| = \lambda_A \g_k \|G_k\|.
\qquad \text{(see (\ref{eq:smoothness basic},\ref{eq:bndness in slater proof}))}
\end{align}
Assuming that
\begin{align}
\nu(g,A,S,\rho,\rho') \ge 2\lambda_A \max_{k\in K} \g_k \|G_k\|,
\end{align}
we can simplify the last line of \eqref{eq:slater proof 1} as
\begin{align}
\left\|
P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_{k})^\top A(x_k))
\right\| \ge \frac{\nu(g,A,S,\rho,\rho') }{2} \|A(x_k)\|,
\end{align}
which, after substituting in \eqref{eq:slater proof 0}, yields that
\begin{align}
\|A(x_k)\|
& \le \frac{2}{\b_k\nu(g,A,S,\rho,\rho')} \left( \|G_k\| + \lambda'_f + \lambda'_A \|y_k\| \right),
\end{align}
and completes the proof of Lemma \ref{lem:bnd bnd Ak}.
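
As a rough reading of this bound (under the additional assumption, not made in the lemma, that $\|G_k\|$ and $\|y_k\|$ remain bounded over $K$), the feasibility gap decays at the rate at which the penalty parameter grows, namely,
\begin{align*}
\|A(x_k)\| \le \frac{2}{\b_k \nu(g,A,S,\rho,\rho')} \left( \sup_{k\in K}\|G_k\| + \lambda'_f + \lambda'_A \sup_{k\in K}\|y_k\| \right) = O\left( \frac{1}{\b_k} \right).
\end{align*}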
%\section{Clustering: Verifying the Geometric Regularity}
%
%Note that
%\begin{align}
%A(x) = VV^\top \mathbf{1}- \mathbf{1},
%\end{align}
%\begin{align}
%DA(x) & =
%\left[
%\begin{array}{cccc}
%w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\
%%x_2^\top p& \cdots & x_{2}^\top\\
%\vdots\\
%w_{n,1}x_{n}^\top & \cdots & w_{n,n}1 x_{n}^\top
%\end{array}
%\right] \nonumber\\
%& = \left[
%\begin{array}{ccc}
%V & \cdots & V
%\end{array}
%\right]
%+
%\left[
%\begin{array}{ccc}
%x_1^\top & \\
%& \ddots & \\
%& & x_n^\top
%\end{array}
%\right],
%\label{eq:Jacobian clustering}
%\end{align}
%%where $I_n\in\RR^{n\times n}$ is the identity matrix,
%where $w_{i.i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix above is block-diagonal. For $x_k$, define $V_k$ accordingly and let $x_{k,i}$ be the $i$th row of $V_k$.
%Consequently,
%\begin{align}
%DA(x_k)^\top A(x_k) & =
%\left[
%\begin{array}{c}
%(V_k^\top V_k - I_n) V_k^\top \mathbf{1}\\
%\vdots\\
%(V_k^\top V_k - I_n) V_k^\top \mathbf{1}
%\end{array}
%\right] \nonumber\\
%& \qquad +
%\left[
%\begin{array}{c}
%x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
%\vdots \\
%x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
%\end{array}
%\right],
%\end{align}
%where $I_n\in \RR^{n\times n}$ is the identity matrix.
%Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$ (which can be enforced in the iterates by replacing $C$ with $(1-\epsilon)C$ for a small positive $\epsilon$ in the subproblems). Under this assumption, it follows that
%\begin{align}
%(\partial g(x_k))_i =
%\begin{cases}
%0 & (x_k)_i > 0\\
%\{a: a\le 0\} & (x_k)_i = 0,
%\end{cases}
%\qquad i\le d.
%\label{eq:exp-subgrad-cluster}
%\end{align}
%Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_n$. This can also be enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the presentation here. Under these assumptions, the (squared) right-hand side of \eqref{eq:regularity} becomes
%\begin{align}
%& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\
%& = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2
%\qquad (a_+ = \max(a,0))
% \nonumber\\
%& =
%\left\|
%\left[
%\begin{array}{c}
%x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\
%\vdots \\
%x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n
%\end{array}
%\right]
%\right\|^2
%\qquad (x_k\in C \Rightarrow x_k\ge 0)
%\nonumber\\
%& = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
%& \ge \min_i \| x_{k,i}\|^2
%\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\
%& = \min_i \| x_{k,i}\|^2
%\cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2.
%\label{eq:final-cnd-cluster}
%\end{align}
%Therefore, given a prescribed $\nu$, ensuring $\min_i \|x_{k,i}\| \ge \nu$ guarantees \eqref{eq:regularity}. When the algorithm is initialized close enough to the constraint set, there is indeed no need to separately enforce \eqref{eq:final-cnd-cluster}. In practice, often $n$ exceeds the number of true clusters and a more intricate analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$.
\section{Proof of Proposition \ref{prop:convex} \label{sec:proof of prop}}
To prove the first claim of the proposition, suppose that Slater's condition does not hold, namely, that
\begin{equation}
\Null(A)\cap \relint(C) = \emptyset,
\label{eq:no feas}
\end{equation}
where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively.
We have assumed that \eqref{prob:01} is feasible, namely, there exists $x\in C$ such that $Ax=0$. It follows from \eqref{eq:no feas} that $x\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x$, namely,
$A x'\ge 0$, for every $x'\in C$. (The inequality $Ax'\ge 0$ applies to each entry of the vector $Ax'$.)
Consequently, $\Null(A) \cap T_C(x) \ne \{0\}$, where $T_C(x)$ is the tangent cone of the set $C$ at $x$.
Equivalently, it holds that
$\row(A)\cap N_C(x) \ne \{0\}$, where $\row(A)$ is the row space of the matrix $A$ and $N_C(x)$ is the normal cone to $C$ at $x$. That is, there exists a unit-norm vector $v$ such that
$P_{T_C(x)}A^\top v=0$
and, consequently, $P_S P_{T_C(x)}A^\top v=0$. Let us take $\rho'=\|x\|$ in~\eqref{eq:nu for cvx}. We then conclude that
$$
\widetilde{\nu}(g,A,S,1,\|x\|)=\widetilde{\nu}(g,A,S,\infty,\|x\|)=0,
$$
namely, the geometric regularity also does not hold for any $\rho\ge 0$ and $\rho'=\|x\|$. The first identity above follows from the homogeneity of the right-hand side of \eqref{eq:nu for cvx}.
Because increasing $\rho'$ cannot increase the right-hand side of \eqref{eq:nu for cvx}, we find that $\widetilde{\nu}(g,A,S,\infty,\max_{x\in C} \|x\|)=0$, which proves the first claim in Proposition~\ref{prop:convex}.
For the converse, we can verify that it suffices to take $\text{row}(A) \subseteq S$.
Next, recall \eqref{eq:nu for cvx} and suppose that there exists $x\in \RR^d$ such that
\begin{align}
\eta_{\min} ( P_S P_{T_C(x)} A^\top ) =0.
\label{eq:counter-assump}
\end{align}
Throughout, we assume without loss of generality that $x\in C$. Indeed, otherwise $x$ would not be feasible for problem \eqref{prob:01} with $g=1_C$ and could not be used to study Slater's condition in \eqref{prob:01}. Note that, by assumption in Proposition~\ref{prop:convex}, \eqref{eq:counter-assump} can be rewritten as
\begin{align}
\eta_{\min}(P_S P_{T_C(x)} A^\top)
& = \eta_{\min}( P_{T_C(x)} A^\top) =0,
\qquad (S = \text{aff}(C))
\label{eq:counter-assump-rewritten}
\end{align}
where $\text{aff}(C)$ is the affine hull of $C$.
Then, since $\|x\| \le \rho'<\infty$ in \eqref{eq:nu for cvx}, we can assume throughout that $\text{boundary}(C) \cap B_{\rho'}\ne \emptyset$ and, moreover, $x\in \text{boundary}(C)$. (Here, $B_{\rho'}=\{z : \|z\|\le \rho' \}$ is the ball of radius $\rho'$ centered at the origin.) Indeed, if instead $x\in \relint(C)$, then $T_C(x)=S$ and thus
\begin{align*}
\eta_{\min}(P_{S}P_{T_C(x)} A^\top) & = \eta_{\min}( P_{T_C(x)} A^\top)
\qquad (S = \text{aff}(C) )
\nonumber\\
& = \eta_{\min}( A^\top)
\qquad ( \text{row}(A) \subseteq S) \nonumber\\
& >0,
\end{align*}
which contradicts \eqref{eq:counter-assump}. The last line above holds because, by assumption in Proposition~\ref{prop:convex}, $A$ is full-rank. To reiterate, we assume that $x\in \text{boundary}(C)$. Therefore, by \eqref{eq:counter-assump-rewritten}, there exists a unit-norm $u\in \row(A)$ such that $u\in N_C(x)$. In turn, this implies that $\Null(A) \cap \text{int}(C)=\emptyset$. Indeed, otherwise any vector $v\in \Null(A) \cap \text{int}(C)$ satisfies $ \langle u, v \rangle <0$, which is impossible because $u\in \row(A)$ and $v\in \Null(A)$ are orthogonal vectors. That is, Slater's condition does not hold, which proves the second claim in Proposition~\ref{prop:convex}.
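
As a minimal illustration of the first claim (the example is ours and not part of the proposition), take $d=2$, $C=[0,1]^2$, $A=[0 \;\; 1]$, and $S=\RR^2$. Then $\Null(A)$ is the horizontal axis, which meets $C$ only along its lower edge, so $\Null(A)\cap \relint(C)=\emptyset$ and Slater's condition fails. At the feasible point $x=(1/2,0)$, we have $T_C(x)=\{u: u_2\ge 0\}$, and the unit-norm vector $v=-1$ satisfies
\begin{align*}
P_S P_{T_C(x)} A^\top v = P_{T_C(x)} \left[
\begin{array}{c}
0\\ -1
\end{array}
\right] = 0,
\end{align*}
so that $\eta_{\min}(P_S P_{T_C(x)} A^\top)=0$ and the geometric regularity in \eqref{eq:nu for cvx} fails as well, in agreement with the proposition.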
