diff --git a/Journal/sections/ADMM.tex b/Journal/sections/ADMM.tex index 2999519..83f1866 100644 --- a/Journal/sections/ADMM.tex +++ b/Journal/sections/ADMM.tex @@ -1,240 +1,240 @@ \section{Linearized Alternating Direction Method of Multipliers} In convex optimization, whenever applicable, the Alternating Direction Method of Multipliers (ADMM) \cite{Glowinski1975,Gabay1976,Boyd2011} often outperforms the augmented Lagrangian method. In addition, ADMM often handles non-smooth terms more efficiently. In light of the success of ADMM in convex optimization, in this section we develop and study a (linearized) ADMM variant of Algorithm~\ref{Algo:2}. More specifically, consider the program \begin{align} \begin{cases} \underset{x,z}{\min} \,\, f(x)+g(x)+h(z)+l(z)\\ A(x)+B(z) = 0, \end{cases} \label{eq:admm 2} \end{align} where $f,h:\RR^d\rightarrow\RR$ and $A,B:\RR^d\rightarrow\RR^m$ are smooth in the sense described later in this section. %\begin{align*} %\| \nabla f(x) - \nabla f(x') \| \le \lambda_f \|x-x'\|, %\,\, %\| \D A(x) - \D A(x') \| \le \lambda_A \|x-x'\|, %\end{align*} %\begin{align} %\| \nabla h(z) - \nabla h(z') \| \le \lambda_h \|z-z'\|, %\,\, %\| \D B(z) - \D B(z') \| \le \lambda_B \|z-z'\|, %\label{eq:smoothness basics admm} %\end{align} %for every $x,x',z,z'\in \RR^d$. Above, $g,l:\RR^d\rightarrow\RR$ are proximal-friendly convex functions which might not be differentiable. %For example, when $C,D\subset \RR^d$ and $g=1_C, l=1_D$ are the indicator functions on $C,D$, respectively, Program~\eqref{eq:admm 2} becomes %\begin{align} %\begin{cases} %\underset{x,z}{\min} \,\, f(x)+h(z)\\ %A(x)+B(z) = 0\\ %x\in C,\qquad %z\in D. %\end{cases} %\label{eq:admm 2} %\end{align} For penalty weight $\b\ge 0$, the augmented Lagrangian corresponding to problem~\eqref{eq:admm 2} is \begin{align} \L_{\b}(x,z,y) & = f(x)+h(z) + \langle A(x)+B(z),y \rangle + \frac{\b}{2}\| A(x)+B(z) \|^2, \label{eq:lagrangian admm} \end{align} and problem \eqref{eq:admm 2} is therefore equivalent to the minimax program \begin{align} \underset{x,z}{\min} \underset{y}{\max}\,\, \L_\b(x,z,y). \label{eq:admm main} \end{align} To solve the equivalent formulation in \eqref{eq:admm main}, we propose the linearized ADMM in Algorithm~\ref{Algo:3}. Most remarks about Algorithm~\ref{Algo:2} apply to Algorithm~\ref{Algo:3} as well and, in particular, note that Algorithm~\ref{Algo:3} performs two consecutive primal updates, one on $x$ and then one on $z$. To parse the details of Algorithm~\ref{Algo:3}, we need to slightly change the gradient map in Definition~\ref{def:grad map} and the line search procedure in Lemma~\ref{lem:eval Lipsc cte} to match the new augmented Lagrangian $\L_\b$ in \eqref{eq:lagrangian admm}. More specifically, the corresponding gradient maps are defined as \begin{equation} G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma}, \qquad H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota}, \label{eq:grad map admm} \end{equation} where \begin{align} x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y)), \qquad z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y)), \end{align} and $\g,\iota>0$ are the primal step sizes. Above, $P_g$ and $P_l$ are the corresponding proximal operators.
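To make the two linearized primal updates concrete, here is a minimal NumPy sketch (an illustration only, not part of the formal development) that evaluates the gradient maps in \eqref{eq:grad map admm}; the oracle names \texttt{grad\_f}, \texttt{grad\_h}, \texttt{A}, \texttt{B}, \texttt{DA}, \texttt{DB}, \texttt{prox\_g}, \texttt{prox\_l} are hypothetical placeholders for the gradients of $f,h$, the constraint maps, their Jacobians, and the proximal operators of $g,l$, and the gradients of $\L_\b$ are expanded from \eqref{eq:lagrangian admm}.
\begin{verbatim}
import numpy as np

def admm_gradient_maps(x, z, y, beta, gamma, iota,
                       grad_f, grad_h, A, B, DA, DB, prox_g, prox_l):
    # Hypothetical oracles: grad_f, grad_h return gradients; A, B evaluate the
    # constraint maps; DA, DB return Jacobians; prox_g(v, t), prox_l(v, t) are
    # the proximal operators of g and l with step size t.
    r = A(x) + B(z)                                # constraint residual A(x)+B(z)
    grad_x = grad_f(x) + DA(x).T @ (y + beta * r)  # nabla_x L_beta(x, z, y)
    grad_z = grad_h(z) + DB(z).T @ (y + beta * r)  # nabla_z L_beta(x, z, y)
    x_plus = prox_g(x - gamma * grad_x, gamma)
    z_plus = prox_l(z - iota * grad_z, iota)
    G = (x - x_plus) / gamma                       # G_{beta,gamma}(x, z, y)
    H = (z - z_plus) / iota                        # H_{beta,iota}(x, z, y)
    return x_plus, z_plus, G, H
\end{verbatim}
In Algorithm~\ref{Algo:3} below, the step sizes $\g,\iota$ passed to such a routine are produced by the line search described next.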
The line search procedure is likewise similar to Lemma~\ref{lem:eval Lipsc cte} and we set \begin{align*} x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)), \end{align*} \begin{align} z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)), \end{align} \begin{align} \gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\ & \qquad \qquad \le \mathcal{L}_\beta (x,z,y) + \left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}, \label{eq:line search x admm} \end{align} \begin{align} \iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\ & \qquad \qquad \le \mathcal{L}_\beta (x,z,y) + \left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}. \label{eq:defn of gamma line search admm} \end{align} The analysis of Algorithm~\ref{Algo:3} is similar to that of Algorithm~\ref{Algo:2}, involving also a similar version of Lemma~\ref{lem:bnd bnd Ak}. The convergence rate of Algorithm~\ref{Algo:3} is detailed below and proved in Appendix~\ref{sec:ADMM theory}. To present the result, let us introduce some additional notation. We let \begin{align} u=\left[ \begin{array}{cc} x^\top & z^\top \end{array} \right]^\top, \qquad p(u) = f(x)+h(z), \nonumber\\ q(u) = g(x)+l(z), \qquad D(u) = A(x)+B(z), \label{eq:new-notation-admm} \end{align} for short and assume that $p,D$ are smooth in the sense that \begin{align} \| \nabla p(u) - \nabla p(u') \| \le \lambda_p \|u-u'\|, \nonumber\\ \| \D D(u) - \D D(u') \| \le \lambda_D \|u-u'\|, \label{eq:smoothness basics admm} \end{align} for every $u,u'\in \RR^{2d}$. \begin{algorithm} \label{Algo:3} Input: Parameters $\beta_1,\s_1,\rho,\tau> 0$, primal initialization $x_{1},z_1\in \RR^d$ with $\|A(x_1)+B(z_1)\|\le \rho$, dual initialization $y_1\in \RR^m$. \\ For $k=1,2,\ldots$, execute\\ \begin{enumerate} \item \textbf{(Update penalty weight)} $ \beta_k \leftarrow \beta_{1} \sqrt{k} \log(k+1)/\log 2. $ \item \textbf{(Line search in $x$)} Use the line search procedure in \eqref{eq:line search x admm} by replacing $x=x_k,z=z_k,y=y_k,\b=\b_k$ and let $\g_k \leftarrow \g$ be the output.\\ \item \textbf{(Descent step in $x$)} $ x_{k+1} \leftarrow P_{g }(x_{k} - \gamma_{k} \nabla_x \mathcal{L}_{\beta_{k}}(x_{k},z_k,y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in (\ref{eq:lagrangian admm},\ref{eq:defn of prox}), respectively.\\ \item \textbf{(Line search in $z$)} Use the line search procedure in \eqref{eq:defn of gamma line search admm} by replacing $x=x_{k+1},z=z_k,y=y_k,\b=\b_k$ and let $\iota_k \leftarrow \iota$ be the output.\\ \item \textbf{(Descent step in $z$)} $ z_{k+1} \leftarrow P_{l }(z_{k} - \iota_{k} \nabla_z \mathcal{L}_{\beta_{k}}(x_{k+1},z_k,y_{k}))$, where $P_l$ denotes the proximal operator for $l$.\\ % %\item \textbf{(Control)} Properly modify $x_{k+1},z_{k+1}$ to ensure that $\|x_{k+1}\|,\|z_{k+1}\|\le \rho'$ or $\|x_{k+1}-x_k\|\le \rho''$.\\ \item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2+ \iota_k \|H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2 + \s_k \|A(x_k)+B(z_k)\|^2 \le \tau$, then quit and return $x_{k+1},z_{k+1}$. % as approximate minimizers of Program \eqref{prob:01}. See \eqref{eq:grad map admm} for the definition of the gradient mapping. 
\\ \item \textbf{(Update dual step size)} $\s_{k+1} \leftarrow \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}}, \frac{\|A(x_1)+B(z_1)\|}{\|A(x_{k+1})+B(z_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right). $\\ \item \textbf{(Dual ascent step)} $y_{k+1} \leftarrow y_{k} + \sigma_{k+1}(A(x_{k+1})+B(z_{k+1}))$. \end{enumerate} \caption{Linearized ADMM for solving problem \eqref{eq:admm main}} \end{algorithm} %\begin{theorem}[Linearized ADMM] %\label{thm:main admm} %For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that %\begin{align} % \inf_{k\ge k_0} f(x_k) + g(x_k)+ h(z_k) + l(z_k) + \langle A(x_k)+B(z_k) ,y_{k_0}\rangle %>- \infty, %\end{align} %and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds . For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ quantified in (\ref{eq:smoothness basic}), let us define %\begin{align} %\lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, %\qquad %\lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|, %\qquad %\lambda'_B = \max_{\|x\|\le \rho'} \|DB(x)\|, %\end{align} %to be the (restricted) Lipschitz constants of $f,A,B$, respectively. %Moreover, suppose that the nonconvex Slater's condition, namely, Lemma \ref{lem:bnd bnd Ak}, is in force, and %\begin{align} %\nu_A \ge 2\lambda_A \rho''. %\end{align} %Then the output of linearized ADMM in Algorithm~\ref{Algo:2} satisfies \textbf{we need a better expression below to better show the dependence on various parameters.} %\begin{align} %& \min_{k\in K}\,\, \min(\overline{\g},\overline{\iota})( \|G_k\|^2 + \|H_k\|^2) \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \nonumber\\ %& \qquad + \|A(x_k)\|^2 %\lesssim \frac{1}{k_1-k_0}, %\end{align} %where $\overline{\gamma},\overline{\iota}$ are independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:raw bnd on sum admm},\ref{eq:lower_band_iotas_admm}). % %\end{theorem} \begin{theorem}[Convergence rate of linearized ADMM] \label{thm:main admm} For sufficiently large integers $k_0< k_1$, consider the interval $K=[k_0:k_1]$,\footnote{The requirement for $k_0$ to be sufficiently large is merely to simplify the analysis. If desired, one can set $k_1=\infty$ in Theorem~\ref{thm:main admm}.} and consider the output sequence $\{x_k,z_k,y_k\}_{k\in K}$ of Algorithm~\ref{Algo:3} with $\kappa_k:= \min(\g_k,\iota_k)$ for short. After recalling (\ref{eq:new-notation-admm}), suppose that \begin{align*} \mu := - \min(0, \inf_{k} p(u_k) + q(u_k) + \langle D(u_k) ,y_{k_0}\rangle ) < \infty. \end{align*} For $\rho'>0$, in addition to the strong smoothness of $p$ and $D$ quantified in (\ref{eq:smoothness basics admm}), let us define \begin{align} \lambda'_p = \max_{\|u\|\le \rho'} \|\nabla p(u)\|, \qquad \lambda'_D = \max_{\|u\| \le \rho'} \|\D D(u)\|, \end{align} to be the (restricted) Lipschitz constants of $p$ and $D$, respectively. 
Suppose also that problem~(\ref{eq:admm 2}) satisfies the geometric regularity in Definition~\ref{defn:nonconvex slater} with \begin{align} \nu(q,D,S,\rho,\rho') \gtrsim \max\left(\lambda_D \max_{k\in K} \sqrt{\kappa_k\mu} , \frac{{\lambda'_p+\lambda'_D}}{\sqrt{\mu}} \right), \label{eq:low-bnd-on-regularity-admm} \end{align} where \begin{itemize} \item $\rho \gtrsim \sqrt{\mu}$, %\item $\rho \gtrsim -\sqrt{\mu}$, \item $\rho'\ge \max_{k\in K} \|u_k\|$, \item $S \supseteq \bigcup_{k\in K} P_{\cone(\partial q (u_{k+1}) )^*}\left( \D D(u_{k+1})^\top D(u_{k+1}) \right).$ \end{itemize} Then the output sequence of Algorithm~\ref{Algo:3} satisfies \begin{align} \label{eq:rates-thm-admm} & \min_{k\in K} \frac{1}{\lambda_p+ \sqrt{m} \lambda_D \rho + d \lambda'^2_D} \cdot \frac{\|G_{\b_K,\g_k}(x_k,z_k,y_k)\|^2 + \|H_{\b_K,\iota_k}(x_{k+1},z_k,y_k)\|^2 }{\sqrt{k_1}\log(k_1+1)} \nonumber\\ & + \|A(x_k)+B(z_k)\|^2 \lesssim \frac{1}{k_1-k_0}\left( \frac{\lambda_p'^2+\lambda_D'^2}{\nu(q,D,S,\rho,\rho')^2} + \mu\right), \end{align} where $\lesssim,\gtrsim$ above suppress the dependence on constants and less important parameters, for the sake of clarity. %The exact expressions %%in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0},\rho, \rho', \nu(g,A,S,\rho,\rho'),y_0$ %are found in (\ref{eq:exact dependences},\ref{eq:suff cnd on nu-1},\ref{eq:cnd-on-nu-proof}). \end{theorem} \noindent Most of the remarks after Theorem \ref{thm:main} apply to Theorem~\ref{thm:main admm} too. \section*{Acknowledgments} -AE is extremely grateful to Nadav Hallak for his thoughtful and constructive comments, which improved this manuscript. \ No newline at end of file +AE is extremely grateful to Nadav Hallak and Yura Malitsky for their thoughtful and constructive comments, which improved this manuscript. \ No newline at end of file diff --git a/Journal/sections/ADMM_theory.tex b/Journal/sections/ADMM_theory.tex index 762c6ed..77623f2 100644 --- a/Journal/sections/ADMM_theory.tex +++ b/Journal/sections/ADMM_theory.tex @@ -1,244 +1,244 @@ \section{Proof of Theorem \ref{thm:main admm} \label{sec:ADMM theory}} For completeness, let us repeat the technical lemmas and definitions of Section \ref{sec:preliminaries}, slightly adjusted here for the augmented Lagrangian of problem~\eqref{eq:admm main}, defined in \eqref{eq:lagrangian admm}. These standard results are stated below without proof. \begin{lemma}[Smoothness]\label{lem:smoothness admm} Given $\rho,\rho'\ge 0$, it holds that \begin{align*} \| \nabla_x \mathcal{L}_{\beta}(x, z,y)- \nabla_x \mathcal{L}_{\beta}(x', z, y) \| \le \lambda_{\b,x} \|x-x'\|, \end{align*} \begin{align} \| \nabla_z \L_\b(x,z,y) - \nabla_z \L_\b(x,z',y) \| \le \lambda_{\b,z} \|z-z'\|, \end{align} for every $(x,z),(x',z),(x,z')\in \X_{\rho,\rho'}$ and $y\in\RR^m$, where \begin{align} \X_{\rho,\rho'}:= \{ (x'',z''): \|A(x'')+B(z'')\| \le \rho, \|x''\|\le \rho', \|z''\| \le \rho'\}, \end{align} \begin{align*} \lambda_{\beta,x} & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A, \end{align*} \begin{align} \lambda_{\beta,z} & \le \lambda_h + \sqrt{m}\lambda_B \left(\|y\| + \b\rho \right) + \b d \lambda'^2_B, \label{eq:smoothness of Lagrangian admm} \end{align} \begin{align} \lambda'_A := \max_{\|x\|\le \rho'}\|\D A(x)\|, \qquad \lambda'_B := \max_{\|z\|\le \rho'}\|\D B(z)\|, \label{eq:local-lipschitz-admm} \end{align} and $\lambda_f,\lambda_A,\lambda_h,\lambda_B$ denote the Lipschitz constants of $\nabla f,\D A,\nabla h,\D B$, respectively, in analogy with (\ref{eq:smoothness basics admm}). 
\end{lemma} \begin{definition}\label{def:grad map admm}\textbf{{(Gradient Mapping)}} Given $x,z\in \RR^d$ and $\gamma,\iota >0$, the gradient mappings $G_{\beta,\gamma}(\cdot,z, y), H_{\b,\iota}(x,\cdot,y): \RR^d\rightarrow\RR^d$ take, respectively, $x,z\in \RR^d$ to \begin{equation} G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma}, \qquad H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota}, \label{eq:grad map admm} \end{equation} where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y))$ and $z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y))$. \end{definition} \begin{lemma}[Descent lemma]\label{lem:11 admm} For $x,z\in \RR^d$ and $y\in \RR^m$, let $x^+,z^+$ be as in Definition \ref{def:grad map admm} with $\g<1/\lambda_{\b,x}$ and $\iota<1/\lambda_{\b,z}$. For $\rho,\rho'\ge 0$, suppose that \begin{align} (x,z),(x^+,z),(x,z^+) \in \X_{\rho,\rho'}. %= \{ (x',z'): \|A(x')+B(z')\| \le \rho, \|x'\| \le \rho', \|z'\| \le \rho' \}. \end{align} Then it holds that \begin{equation*} - \label{e:deslem} +% \label{e:deslem} \| G_{\beta,\gamma}(x,z,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,z,y) + g (x)- \mathcal{L}_\beta(x^+,z,y)- g(x^+) ), \end{equation*} \begin{align} \| H_{\beta,\iota}(x,z,y)\|^{2}\leq \frac{2}{\iota} (\mathcal{L}_\beta(x,z,y) + l (z)- \mathcal{L}_\beta(x,z^+,y)- l(z^+) ). \label{eq:deslem admm} \end{align} \end{lemma} \begin{lemma}[Line search] \label{lem:eval Lipsc cte admm} Fix $\theta \in (0,1)$ and ${\gamma}_0,\iota_0>0$. For $\gamma',\iota'>0$, let \begin{align} x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)), \qquad z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)), \end{align} and define \begin{align*} \gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\ & \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y) + \left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}, \end{align*} \begin{align} \iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\ & \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y) + \left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle + \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}. \label{eq:defn of gamma line search admm} \end{align} Then, (\ref{eq:deslem admm}) holds and, moreover, we have that \begin{align} \gamma \ge \frac{\theta}{\lambda_{\beta,x}}, \qquad \iota \ge \frac{\theta}{\lambda_{\beta,z}}. \label{eq:moreover admm} \end{align} \end{lemma} For the reader's convenience, let us also recall the updates of Algorithm~\ref{Algo:3} in iteration $k$ as \begin{align} x_{k+1}& =P_g(x_k -\g_k \nabla_x \L_{\b_k} (x_k,z_k,y_k)),\nonumber\\ z_{k+1}& =P_l(z_k - \i_k \nabla_z \L_{\b_k} (x_{k+1},z_k,y_k)),\nonumber\\ y_{k+1}& = y_k+\s_{k+1} (A(x_{k+1})+B(z_{k+1})). \label{eq:ADMM updates recall} \end{align} For every $k\in K=[k_0:k_1]$, recall that the primal step sizes $\g_k,\i_k$ are determined by line search in Lemma \ref{lem:eval Lipsc cte admm}. Moreover, the penalty weights and dual step sizes are set as \begin{align} \b_k & = \b_{k_0} \sqrt{\frac{k\log^2(k+1)}{k_0 \log^2(k_0+1)}}, \nonumber\\ \s_k & = \s_{k_0}\min\left( \sqrt{\frac{k_0}{k}} , \frac{\|A(x_{k_0})+B(z_{k_0})\|}{\|A(x_k)+B(z_k)\|} \cdot \frac{k_0 \log^2 (k_0+1)}{k \log^2 (k+1)} \right). 
\end{align} For every $k\in K$, let us set \begin{align*} G_k = G_{\b_k,\g_k}(x_k,z_k,y_k) = \frac{x_k-x_{k+1}}{\g_k}, \end{align*} \begin{align} H_k = H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)= \frac{z_k-z_{k+1}}{\i_k}, \end{align} for short. The convergence analysis of Algorithm~\ref{Algo:3} only slightly differs from the one in the proof of Theorem~\ref{thm:main} and we therefore only present the proof sketch, somewhat informally. Similar to the proof of Theorem~\ref{thm:main}, two applications of Lemma~\ref{lem:11 admm} yield that \begin{align} \frac{\g_k \|G_k\|^2}{2} \le \L_{\b_k}(x_k,z_k,y_k) + g(x_k) - \L_{\b_k}(x_{k+1},z_k,y_k) - g(x_{k+1}), \nonumber\\ \frac{\i_k \|H_k\|^2}{2} \le \L_{\b_k}(x_{k+1},z_k,y_k) + l(z_k) - \L_{\b_k}(x_{k+1},z_{k+1},y_k) - l(z_{k+1}), \label{eq:decrease twice} \end{align} for every $k$. By setting \begin{align*} u_k = [ \begin{array}{cc} x_k^\top & z_k^\top \end{array} ]^\top \in \RR^{2d}, \qquad Q_k = [ \begin{array}{cc} G_k^\top & H_k^\top \end{array} ]^\top \in \RR^{2d}, \qquad p(u) = f(x)+h(z), \end{align*} \begin{align} q(u) = g(x) + l(z), \qquad D(u) = A(x)+B(z), \qquad \kappa_k = \min(\g_k,\iota_k), \label{eq:defn of w} \end{align} for every $k\in K$ and after summing up the two inequalities in \eqref{eq:decrease twice}, we reach \begin{align} \frac{\kappa_k \|Q_k\|^2}{2} & \le \L_{\b_k}(u_k,y_k) + q(u_k) - \L_{\b_k}(u_{k+1},y_k) - q(u_{k+1}), \qquad \forall k\in K. \end{align} By following the same steps as in the proof of Theorem \ref{thm:main}, we find that \begin{align} \sum_{k=k_0}^{k_1} \frac{\kappa_k \|Q_k\|^2}{2} & \le \mu+ 2\sum_{k=k_0}^{k_1} \| D(u_k) \|^2 , \end{align} where \begin{align} \mu & := \max\Bigg( \sup_{k} \Big( p(u_{k_0})+q(u_{k_0}) - p(u_{k})-q(u_k) \nonumber\\ & + \langle D(u_{k_0}) -D(u_k) ,y_{k_0}\rangle + \frac{\beta_{k_0}}{2} \|A(x_{k_0})+B(z_{k_0})\|^2 \Big) , 0 \Bigg) < \infty. \end{align} On the other hand, the $x$ and $z$ updates in \eqref{eq:ADMM updates recall} imply that \begin{align} & G_k - \nabla f(x_k) - \D A(x_k)^\top y_k \nonumber\\ & \qquad - \b_k \D A(x_k)^\top (A(x_k) + B(z_k) ) \in \frac{\partial g(x_{k+1})}{\gamma_k} \subseteq \frac{\partial g(x_{k+1})}{\kappa_k} , \end{align} \begin{align} & H_k - \nabla h(z_k) - \D B(z_k)^\top y_k \nonumber\\ & \qquad - \b_k \D B(z_k)^\top (A(x_{k+1})+B(z_k)) \in \frac{\partial l(z_{k+1})}{\iota_k} \subseteq \frac{\partial l(z_{k+1})}{\kappa_k} , \end{align} which can be more compactly written as \begin{align} & Q_k - \nabla p(u_k) - \D D(u_k)^\top y_k - \b_k \D D(u_k)^\top D(u_k) \nonumber\\ & \qquad + \b_k \left[ \begin{array}{c} 0\\ \D B(z_k)^\top (A(x_k) - A(x_{k+1})) \end{array} \right] \in \frac{\partial q(u_{k+1})}{\kappa_k}. \label{eq:to-be-projected-admm} \end{align} Note that \eqref{eq:to-be-projected-admm} is similar to \eqref{eq:to be projected}, except for its last term, which satisfies \begin{align} & \b_k \left\| \left[ \begin{array}{c} 0\\ \D B(z_k)^\top (A(x_k) - A(x_{k+1})) \end{array} \right] \right\| \nonumber\\ & = \b_k \| \D B(z_k)^\top (A(x_k) - A(x_{k+1})) \| \nonumber\\ & \le \b_k \lambda'_A \lambda'_B \|x_{k}-x_{k+1}\| \qquad \text{(see \eqref{eq:local-lipschitz-admm})} \nonumber\\ & = \b_k \g_k \lambda'_A \lambda'_B \| G_k\| \qquad \text{(see \eqref{eq:grad map admm})} \nonumber\\ & \le \b_k \g_k \lambda'_A \lambda'_B \| Q_k\|. 
\qquad \text{(see \eqref{eq:defn of w})} \end{align} From this point, the rest of the proof steps of Lemma~\ref{lem:bnd bnd Ak} and Theorem~\ref{thm:main} can be applied directly to complete the proof of Theorem~\ref{thm:main admm}. %For $\rho,\rho',\rho''>0$, suppose that %\begin{align} %\max_{k\in K} \|A(x_k)+B(z_k) \| \le \rho, %\quad %\max_{k\in K} \|x_k\| \le \rho', %\quad %\max_{k\in K} \|x_{k+1}-x_k\| \le \rho'', %\label{eq:feas to be checked later} %\end{align} %\begin{align} %\nu_A \ge 2\lambda_A \rho'', %\end{align} %where $\nu_A$ was defined in \eqref{eq:new slater defn}. %Then, following the same line of argument, a similar version of Lemma~\ref{lem:bnd bnd Ak} holds and %\begin{align} %\|A(x_k)+B(z_k) \| \le \frac{2}{\nu_A\b_k} (\|G_k\|+ \lambda'_h+ \lambda'_A\|y_{k-1}\|). %\end{align} %If we assume that (\ref{eq:to be used later on},\ref{eq:suff cnd on rho}) hold, then the first bound in \eqref{eq:feas to be checked later} is in force and %\begin{align} %& \sum_{k=k_0}^{k_1} \kappa_k \|Q_k\|^2 + \|A(x_k)+B(z_k)\|^2 \nonumber\\ %& \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{24c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 5\mu. %\label{eq:raw bnd on sum admm} %\end{align} %To better interpret the above bound, we next estimate the primal step sizes $\{\g_k,\i_k\}$, which are determined by line search. Let $\lambda_{\b,u}$ denote the Lipschitz constant of $\L_{\b_k}(u,v)$ with respect to $u$. Similar to the proof of Theorem \ref{thm:main}, we find that %\begin{align*} %\gamma_k %& \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\ %& =: \overline{\g} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}}, %\end{align*} %\begin{align} %\iota_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_B \rho + d \lambda'^2_B \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\ %& =: \overline{\iota} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}}, %\label{eq:lower_band_iotas_admm} %\end{align} %for sufficiently large $k_0$. Substituting the above bounds in \eqref{eq:raw bnd on sum admm} and following the steps in the proof of Theorem \ref{thm:main} completes the proof of Theorem \ref{thm:main admm}. diff --git a/Journal/sections/LinAL.tex b/Journal/sections/LinAL.tex index 558d987..50288b4 100644 --- a/Journal/sections/LinAL.tex +++ b/Journal/sections/LinAL.tex @@ -1,321 +1,323 @@ \section{Homotopy Linearized AL Algorithm \label{sec:holal}} To solve the equivalent formulation of problem \eqref{prob:01} presented in \eqref{eq:minmax}, we propose a Homotopy Linearized Augmented Lagrangian algorithm (HoLAL), summarized in Algorithm~\ref{Algo:2}. At every iteration, Algorithm~\ref{Algo:2} takes a primal descent step followed by a dual ascent step. The increasing sequence of penalty weights $\{\b_k\}_k$ in Step 1 and the dual updates in Steps 5 and 6 are responsible for continuously enforcing the constraints in~\eqref{prob:01}. As we will see in the convergence analysis, the particular choice of $\b_k$ in Algorithm~\ref{Algo:2} strikes a balance between reducing the objective of \eqref{prob:01} and enforcing its constraints. Moreover, the choice of dual step size $\s_k$ in Algorithm~\ref{Algo:2} ensures that the dual variable $y_k$ remains bounded; see~\cite{bertsekas1976penalty} for a precedent in the augmented Lagrangian literature with a similar choice of dual step size. 
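As a quick sanity check on this boundedness claim (an informal calculation only; the formal analysis follows below), combining the dual ascent in Step 6 with the cap on the dual step size in Step 5 of Algorithm~\ref{Algo:2} gives
\begin{align*}
\|y_{k+1}\| \le \|y_1\| + \sum_{j=1}^{k} \s_{j+1} \|A(x_{j+1})\|
\le \|y_1\| + \s_1 \|A(x_1)\| \log^2 2 \sum_{j=1}^{k} \frac{1}{(j+1)\log^2(j+2)},
\end{align*}
and the last sum remains bounded as $k\rightarrow\infty$.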
%\notea{this next paragraph is not good.} %Step 4 of Algorithm~\ref{Algo:2} removes pathological cases with divergent iterates. The first function of Step 4 is to clip the iterates of Algorithm~\ref{Algo:2} to a ball of radius $\rho'$. %As an example, suppose that $g=1_C$ in \eqref{prob:01} is the indicator function for a bounded convex set $C\subset \RR^d$ and take $\rho' > \max_{x\in C} \|x\|$. Then, for sufficiently large $k$, it is not difficult to verify, by Theorem~\ref{thm:main} below, all the iterates of Algorithm~\ref{Algo:2} automatically satisfy $\|x_k\|\le \rho'$. The second function of Step 4 is to control the primal step sizes. \notea{that's bad.} %\notea{we should at least say something about the adaptive version, even if we don't include the proof. maybe we should include the algorithm, claim we have proved its convergence (which we did) and then use it in simulations alongside the nonadaptive version. } \begin{algorithm} \label{Algo:2} Input: Parameters $\beta_1,\s_1,\rho,\tau> 0$, primal initialization $x_{1}\in \RR^d$ with $\|A(x_1)\|\le \rho$, dual initialization $y_1\in \RR^m$. \\ For $k=1,2,\ldots$, execute\\ \begin{enumerate} \item \textbf{(Update penalty weight)} $ \beta_k \leftarrow \beta_{1}\sqrt{k}\log(k+1)/\log 2 . $\\ \item \textbf{(Line search)} Use the line search in \eqref{eq:defn of gamma line search} with $x=x_k,y=y_k,\b=\b_k$ and let $\g_k \leftarrow \g$.\\ \item \textbf{(Primal descent step)} $ x_{k+1} \leftarrow P_{g }(x_{k} - \gamma_{k} \nabla \mathcal{L}_{\beta_{k}}(x_{k},y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, see (\ref{prob:01},\ref{eq:Lagrangian},\ref{eq:defn of prox}).\\ %\item \textbf{(Control)} If necessary, modify $x_{k+1}$ to ensure that $\|x_{k+1}\|\le \rho'$ and $\|x_{k+1}-x_k\|\le \rho''$. \notea{will this cause any problem?}\\ \item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,y_k)\|^2 + \s_k \|A(x_k)\|^2 \le \tau$, then quit and return $x_{k+1}$. See \eqref{eq:grad map} for the definition of gradient mapping $G_{\b_k,\g_k}$. \\ \item \textbf{(Update dual step size)} $ \s_{k+1} \leftarrow \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}},\frac{\|A(x_1)\|}{\|A(x_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right). $\\ \item \textbf{(Dual ascent step)} $y_{k+1} \leftarrow y_{k} + \sigma_{k+1}A(x_{k+1})$. \end{enumerate} \caption{HoLAL algorithm for solving problem \eqref{prob:01}} \end{algorithm} %Slater's condition plays a key role in convex optimization and we require a similar condition for the success of our algorithm. %\begin{definition}\label{defn:nonconvex slater} \textbf{(Nonconvex Slater's condition)} %Consider an interval $K=[k_0:k_1]$ and a sequence $\{x_k\}_{k=k_0}^{k_1}\subset \RR^d$. Let us set %\begin{align} %\cone(x_{k+1}) := \text{cone}(\partial g(x_{k+1}) )^* \subset \RR^d, %\qquad \forall k\in K, %\end{align} %for short, and let $S_K\subset \RR^d$ be a subspace such that %\begin{align} %S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), %\label{eq:defn of S defn} %\end{align} %and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. 
%For $\rho,\rho'>0$, we set %\begin{align} %\nu_A := %\begin{cases} %\underset{v,x}{\min} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\ %\|v\|\le \rho\\ %\|x\|\le \rho', %\end{cases} %%\label{eq:new slater defn} %\end{align} %and say that Program (\ref{prob:01}) satisfies the nonconvex Slater's condition if $\nu_A >0$. %\end{definition} %To gain some insight about Definition \ref{defn:nonconvex slater}, suppose that $f:\RR^d\rightarrow\RR$ is convex. Consider a nonempty and bounded convex set $C\subseteq\RR^d$ and let us restrict ourselves to the choice of $g = 1_C:\RR^d\rightarrow\RR$, which takes $x\in \RR^d$ to %\begin{align} %g(x) = 1_C(x) = %\begin{cases} %0 & x \in C\\ %\infty & x\notin C. %\end{cases} %\label{eq:indicator} %\end{align} %We also let $T_C(x)$ denote the tangent cone to $C$ at $x$, and $P_{T_C(x)}:\RR^d\rightarrow\RR^d$ denotes the orthogonal projection onto this tangent cone. %We also set $S_K = \RR^d$ to simplify the discussion. Then, it is not difficult to verify that \eqref{eq:new slater defn} is equivalent to %\begin{align} %\nu_A := %\begin{cases} %\underset{v,x}{\min} \, \, \frac{\left\| P_{T_C(x)} A^\top v \right\|}{\|v\|} \\ %\|v\|\le \rho\\ %\|x\|\le \rho' %\end{cases} %= \begin{cases} %\underset{x}{\min} \,\, \eta_{m}\left( P_{T_C(x)} A^\top \right) \\ %\|x\|\le \rho', %\end{cases} %%\label{eq:nu for cvx} %\end{align} %where $\eta_{m}$ returns the $m$th largest singular value of its input matrix. The nonconvex Slater condition here, namely, $\nu_A>0$, is closely related to the standard Slater's condition in convex optimization. %\begin{proposition}%\label{prop:convex} %In problem (\ref{prob:01}), suppose that $f$ is a convex function, $g$ is as in (\ref{eq:indicator}), and $A:\RR^d\rightarrow\RR^m$ a linear operator, represented with the matrix $A\in \RR^{m\times d}$. Assume also that Program (\ref{prob:01}) is feasible, namely, there exists $x\in C$ such that $Ax=0$. In (\ref{eq:nu for cvx}), let us take $S_K =\RR^d$. Then the standard Slater's condition holds if the nonconvex Slater's condition ($\nu_A>0$) holds. %\end{proposition} %\begin{proof} %Suppose that the standard Slater's condition does not hold, namely, that %\begin{equation} %\relint(\Null(A) \cap C) = \Null(A)\cap \relint(C) = \emptyset, %\label{eq:no feas} %\end{equation} %where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively. %By assumption, there exists $x_0\in C$ such that $Ax_0=0$. It follows from \eqref{eq:no feas} that $x_0\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x_0$, namely, %$A x_0\ge 0$, for every $x_0\in C$. %Consequently, $\Null(A) \cap T_C(x) \ne \{0\}$, where $T_C(x_0)$ is the tangent cone of the set $C$ at $x_0$. %Equivalently, it holds that %$\row(A)\cap N_C(u) \ne \{0\}$, where $\row(A)$ is the row space of the matrix $A$. That is, there exists a unit-norm vector $v$ such that %$P_{T_C(x_0)}A^\top v=0$, %and, consequently, %$ %\eta_{m}(P_{T_C(x_0)}A^\top) = 0 %$. %Let us take $\rho'=\|x_0\|$ in \eqref{eq:nu for cvx}. We then conclude that $\nu_A=0$, namely, the nonconvex Slater's condition also does not hold, thereby completing the proof of Proposition \ref{prop:convex}.\hfill \qed %\end{proof} % %The nonconvex Slater's condition is necessary for the success of Algorithm~\ref{Algo:2}. 
% % %The following result, proved in Appendix \ref{sec:proof of bnd Ak}, uses the nonconvex Slater's condition to find a converse, thus bounding the feasibility gap with the gradient mapping. %\begin{lemma}\label{lem:bnd bnd Ak} \textbf{\emph{(Nonconvex Slater's condition)}} %For an integer interval $K=[k_0:k_1]$, let us set %\begin{align} %\cone(x_{k+1}) := \text{cone}(\partial g(x_{k+1}) )^* \subset \RR^d, %\qquad \forall k\in K, %\end{align} %for short, and %consider a subspace $S_K\subseteq \mathbb{R}^d$ such that %\begin{align} %S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), %\label{eq:defn of S lemma} %\end{align} %and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. %For $\rho,\rho',\rho''>0$, assume that %\begin{align} %\max_{k\in K}\|A(x_k)\| \le \rho, %\qquad %\max_{k\in K}\|x_k\| \le \rho', %\qquad %\max_{k\in K} \|x_{k+1}-x_k\|\le \rho''. %\label{eq:good neighb} %\end{align} %Let us also set %\begin{align} %\nu_A := %\begin{cases} %\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\ %\|v\|\le \rho\\ %\|x\|\le \rho'. %\end{cases} %\label{eq:new slater lemma} %\end{align} %and assume that $\nu_A\ge 2\lambda_A\rho''$. %Then it holds that %\begin{align} %\|A(x_k)\| %& \le \frac{2}{\nu_A\b_k} \left( \|G_k\| + \lambda'_f + \lambda'_A \| y_k\| \right), %\qquad \forall k\in K, %\label{eq:bnd on Ak final} %\end{align} %where %\begin{align} %\lambda'_f := \max_{\|x\| \le \rho'} \|\nabla f(x)\|, %\qquad %\lambda'_A := \max_{\|x\|\le \rho'} \| DA(x)\|. %\end{align} %\end{lemma} \noindent In the $k^{\text{th}}$ iteration, if the primal step size $\g_k$ is sufficiently small, it is natural to expect Step 3 of Algorithm~\ref{Algo:2} to reduce the objective of \eqref{eq:minmax}. Perhaps less obviously, the geometric regularity in Definition~\ref{defn:nonconvex slater} ensures that this primal update also reduces the feasibility gap of \eqref{prob:01}. This intuition is the key to the analysis of Algorithm~\ref{Algo:2}, as formalized below and proved in Appendix \ref{sec:proof of bnd Ak}. +\notea{Yura made a comment below about the boundedness of the sequence here. It is true that we don't show boundedness in this lemma and so I removed the footnote that one can take $k_1=\infty$. Later in the theorem, $k_1$ can indeed be infinity, since we control the boundedness of the sequence explicitly. } \begin{lemma}\label{lem:bnd bnd Ak} %\textbf{\emph{(Nonconvex Slater's condition)}} -For integers $k_0< k_1$, consider the integer interval $K=[k_0:k_1]$.\footnote{If desired, one can set $k_1=\infty$ in Lemma \ref{lem:bnd bnd Ak}.} +For integers $k_0< k_1$, consider the integer interval $K=[k_0:k_1]$. +%\footnote{If desired, one can set $k_1=\infty$ in Lemma \ref{lem:bnd bnd Ak}.} % and, for $\rho'>0$, suppose that %\begin{align} %\rho' \ge \max_{k\in K} \|x_{k}\|. 
%\label{eq:good neighb} %\end{align} Suppose that problem~(\ref{prob:01}) satisfies the geometric regularity in Definition~\ref{defn:nonconvex slater} with \begin{align} \nu(g,A,S,\rho,\rho')\ge 2\lambda_A \max_{k\in K} \g_k \|G_{\b_k,\g_k}(x_k,y_k)\|, \label{eq:low-bnd-on-nu-lemma} \end{align} where $\lambda_A$ was defined in (\ref{eq:smoothness basic}) and \begin{itemize} \item $\rho \ge \max_{k\in K}\|A(x_k)\| $, \item $\rho' \ge \max_{k\in K}\|x_k\| $, \item $S \supseteq \bigcup_{k\in K} P_{\cone(\partial g (x_{k+1}) )^*}\left( \D A(x_{k+1})^\top A(x_{k+1}) \right).$ \end{itemize} % let us set %\begin{align} %\cone(x_{k+1}) := \text{cone}(\partial g(x_{k+1}) )^* \subset \RR^d, %\qquad \forall k\in K, %\end{align} %for short, and %consider a subspace $S_K\subseteq \mathbb{R}^d$ such that %\begin{align} %S _{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right), %\label{eq:defn of S lemma} %\end{align} %and, to simplify the notation, let $S_{K}$ also denote an orthonormal basis for this subspace. %For $\rho,\rho',\rho''>0$, assume that %\begin{align} %\max_{k\in K}\|A(x_k)\| \le \rho, %\qquad %\max_{k\in K}\|x_k\| \le \rho', %\qquad %\max_{k\in K} \|x_{k+1}-x_k\|\le \rho''. %\label{eq:good neighb} %\end{align} %Let us also set %\begin{align} %\nu_A := %\begin{cases} %\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x)} ( DA(x)^\top v) \right\|}{\|v\|} \\ %\|v\|\le \rho\\ %\|x\|\le \rho'. %\end{cases} %\label{eq:new slater lemma} %\end{align} %and assume that $\nu_A\ge 2\lambda_A\rho''$. Then, for every $k\in K$, it holds that \begin{align} \|A(x_k)\| & \le \frac{2}{\nu(g,A,S,\rho,\rho')\b_k} \left( \|G_{\b_k,\g_k}(x_k,y_k)\| + \lambda'_f + \lambda'_A \| y_k\| \right), \label{eq:bnd on Ak final} \end{align} where \begin{align} \lambda'_f := \max_{\|x\| \le \rho'} \|\nabla f(x)\|, \qquad \lambda'_A := \max_{\|x\|\le \rho'} \| \D A(x)\|. \end{align} \end{lemma} \noindent In words, Lemma~\ref{lem:bnd bnd Ak} relates the feasibility gap to the gradient mapping in problem~\eqref{prob:01}. Not surprisingly, as the penalty weight $\b_k$ increases in Algorithm~\ref{Algo:2}, the feasibility gap in~\eqref{prob:01} shrinks, as indicated in~\eqref{eq:bnd on Ak final}. Note also that the larger $\nu$, the more regular problem~\eqref{prob:01} is, and the smaller the feasibility gap becomes, as again corroborated by \eqref{eq:bnd on Ak final}. With the aid of Lemma~\ref{lem:bnd bnd Ak}, we can derive the convergence rate of Algorithm~\ref{Algo:2} to first-order stationarity, with the proof deferred to Appendix~\ref{sec:proof-of-main}. For the convergence metric, we will use a linear combination of the gradient mapping and the feasibility gap of problem~(\ref{prob:01}), as motivated earlier by the discussion after Definition~\ref{def:grad map}. \begin{theorem}[Convergence rate of HoLAL] \label{thm:main} For sufficiently large integers $k_0< k_1$, consider the interval $K=[k_0:k_1]$,\footnote{The requirement for $k_0$ to be sufficiently large is merely to simplify the analysis. If desired, one can set $k_1=\infty$ in Theorem~\ref{thm:main}.} and consider the output sequence $\{x_k,y_k\}_{k\in K}$ of Algorithm~\ref{Algo:2}. Suppose that \begin{align*} \mu := - \min(0, \inf_{k} f(x_k) + g(x_k) + \langle A(x_k) ,y_{k_0}\rangle ) < \infty. 
\end{align*} For $\rho'>0$, in addition to the strong smoothness of $f$ and $A$ quantified in (\ref{eq:smoothness basic}), let us define \begin{align} \lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|, \qquad \lambda'_A = \max_{\|x\| \le \rho'} \|\D A(x)\|, \label{eq:local-lips} \end{align} to be the (restricted) Lipschitz constants of $f$ and $A$, respectively.\footnote{The restricted Lipschitz continuity assumption for $f,A$ in \eqref{eq:local-lips} is mild. Indeed, we have already assumed in \eqref{eq:smoothness basic} that $f,A$ are both continuously differentiable and, by compactness of the domain in \eqref{eq:local-lips}, $\lambda'_f,\lambda'_A <\infty$. } Suppose also that problem~(\ref{prob:01}) satisfies the geometric regularity in Definition~\ref{defn:nonconvex slater} with \begin{align} \nu(g,A,S,\rho,\rho') \gtrsim \max\left(\lambda_A \max_{k\in K} \sqrt{\g_k\mu} , \frac{{\lambda'_f+\lambda'_A}}{\sqrt{\mu}} \right), \label{eq:low-bnd-on-regularity} \end{align} where \begin{itemize} \item $\rho \gtrsim \sqrt{\mu}$, %\item $\rho \gtrsim -\sqrt{\mu}$, \item $\rho'\ge \max_{k\in K} \|x_k\|$, \item $S \supseteq \bigcup_{k\in K} P_{\cone(\partial g (x_{k+1}) )^*}\left( \D A(x_{k+1})^\top A(x_{k+1}) \right).$ \end{itemize} Then the output sequence of Algorithm~\ref{Algo:2} satisfies \begin{align} \label{eq:rates-thm} & \min_{k\in K} \frac{1}{\lambda_f+ \sqrt{m} \lambda_A \rho + d \lambda'^2_A} \cdot \frac{\|G_{\b_K,\g_k}(x_k,y_k)\|^2}{\sqrt{k_1}\log(k_1+1)} + \|A(x_k)\|^2 \nonumber\\ & \lesssim \frac{1}{k_1-k_0}\left( \frac{\lambda_f'^2+\lambda_A'^2}{\nu(g,A,S,\rho,\rho')^2} + \mu\right), \end{align} where $\lesssim,\gtrsim$ above suppress the dependence on constants and less important parameters, for the sake of clarity. The exact expressions %in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0},\rho, \rho', \nu(g,A,S,\rho,\rho'),y_0$ are found in (\ref{eq:exact dependences},\ref{eq:suff cnd on nu-1},\ref{eq:cnd-on-nu-proof}). \end{theorem} \noindent A few remarks about Theorem~\ref{thm:main} are in order. \paragraph{\textbf{\emph{Convergence rates.}}} Loosely speaking, Theorem~\ref{thm:main} states that Algorithm~\ref{Algo:2} achieves first-order stationarity for \eqref{prob:01} by reducing the gradient map and the feasibility gap at the rates \begin{align} \|G_{\b_k,\g_k}(x_k,y_k) \|^2 = \frac{1}{\widetilde{O}(\sqrt{k})}, \qquad \|A(x_k) \| = \frac{1}{\widetilde{O}(\sqrt{k})}, \label{eq:rates-summary} \end{align} where $\widetilde{O}$ suppresses logarithmic terms above. -For comparison, if $f$ was convex and $A$ was affine in \eqref{prob:01}, both rates in \eqref{eq:rates-summary} would have improved to $1/k$, see \cite[Theorem 2.5]{xu2017accelerated} for the details. +For comparison, if $f$ were convex and $A$ were affine in \eqref{prob:01}, both rates in \eqref{eq:rates-summary} would improve to $1/k$; see \cite[Theorem 2.5]{xu2017accelerated} for the details. In contrast, for the convenience of the reader, let us emphasize again that the only linearized primal-dual nonconvex convergence rate we are aware of is the result in \cite[Theorem 19]{bot2018proximal}, which comes to a standstill for the correct choice of $\theta=1/2$, even though problem~(1) therein is less general than our problem~\eqref{prob:01}. 
Moreover, compared with the first-order inexact methods that solve \eqref{e:exac} to approximate stationarity (instead of linearizing it, as we do), Theorem~\ref{thm:main} improves over the state-of-the-art total convergence rate of $1/k^{\frac{1}{3}}$, reported in~\cite{sahin2019inexact,cartis2018optimality}. \paragraph{\emph{\textbf{Geometric regularity.}}} Geometric regularity in Definition~\ref{defn:nonconvex slater} plays a key role in Theorem~\ref{thm:main} by, broadly speaking, ensuring that the primal updates of Algorithm~\ref{Algo:2} reduce the feasibility gap as the penalty weight $\b_k$ grows. We will verify this condition for several examples in Section~\ref{sec:experiments}. As confirmed by \eqref{eq:rates-thm}, the larger $\nu(g,A,S,\rho,\rho')$, the more regular \eqref{prob:01} is, and the faster Algorithm~\ref{Algo:2} converges to stationarity. In fact, for Algorithm~\ref{Algo:2} to succeed, Theorem~\ref{thm:main} requires $\nu$ to be sufficiently large; see \eqref{eq:low-bnd-on-regularity}. We do not know if \eqref{eq:low-bnd-on-regularity} is necessary or rather an artifact of the proof technique, but it is natural to expect the convergence rate to at least slow down when $\nu$ decreases, as corroborated by \eqref{eq:rates-thm}. The right-hand side of \eqref{eq:low-bnd-on-regularity} also depends on the largest primal step size $\max_{k\in K}\g_k$. Since $\g_k$ is found by line search in Algorithm~\ref{Algo:2}, we are unable to upper bound this quantity unless we make further assumptions on problem~(\ref{prob:01}), or slightly modify the algorithm to cap primal step sizes. However, recall that the augmented Lagrangian $\L_{\b_k}(\cdot,y_k)$ has a $\lambda_{\b_k}$-Lipschitz gradient in its first argument and thus typically $\g_k \propto 1/\lambda_{\b_k}$, namely, $\g_k \propto 1/\sqrt{k}$ by (\ref{eq:smoothness at k},\ref{eq:low bnd on gammas}). \paragraph{\emph{\textbf{Smoothness.}}} The smoother $f,A$ are in the sense of (\ref{eq:smoothness basic},\ref{eq:local-lips}), the faster the convergence; see~(\ref{eq:low-bnd-on-regularity},\ref{eq:rates-thm}). Indeed, as $f,A$ become smoother, problem~(\ref{prob:01}) more and more resembles a convex program, at least locally. Note, however, that it is often not straightforward to compute the local Lipschitz constants $\lambda'_f,\lambda'_A$ of \eqref{eq:local-lips} in practice, but it is in general possible to loosely upper bound them. For us, it is necessary to work with $\lambda'_f,\lambda'_A$ to translate any descent in the augmented Lagrangian (see Lemma~\ref{lem:11}) to reducing the feasibility gap (see Lemma~\ref{lem:bnd bnd Ak}). \paragraph{\emph{\textbf{Subspace $S$.}}} The freedom over the choice of subspace $S$ in Theorem~\ref{thm:main} is meant to further strengthen the result in the spirit of Proposition~\ref{prop:convex}. One might simply set $S=\mathbb{R}^d$ on a first reading. \paragraph{\textbf{\emph{Faster rates.}}} Assuming restricted strong convexity and smoothness for $f$ in~\eqref{prob:01} and near-isometry for $A$, (approximate) linear convergence of Algorithm~\ref{Algo:2} to a global minimizer of problem~\eqref{prob:01} can be established~\cite{gomez2019fast}. 
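To make the preceding discussion concrete, the following schematic NumPy sketch mirrors the steps of Algorithm~\ref{Algo:2}; it is an illustration under simplifying assumptions rather than a reference implementation, and the oracles \texttt{f}, \texttt{grad\_f}, \texttt{A}, \texttt{DA} and \texttt{prox\_g} are hypothetical placeholders for the objective, its gradient, the constraint map, its Jacobian, and the proximal operator of $g$.
\begin{verbatim}
import numpy as np

def holal(f, grad_f, A, DA, prox_g, x1, y1, beta1=1.0, sigma1=1.0,
          tau=1e-4, gamma0=1.0, theta=0.5, max_iter=1000):
    # Schematic sketch of HoLAL (Algorithm 2); all oracle names are placeholders.
    x, y, sigma = x1.copy(), y1.copy(), sigma1
    normA1 = np.linalg.norm(A(x1))
    for k in range(1, max_iter + 1):
        beta = beta1 * np.sqrt(k) * np.log(k + 1) / np.log(2)       # step 1
        # augmented Lagrangian L_beta(., y) and its gradient at the current x
        L = lambda u: f(u) + A(u) @ y + 0.5 * beta * A(u) @ A(u)
        grad = grad_f(x) + DA(x).T @ (y + beta * A(x))
        gamma = gamma0                                               # step 2: line search
        while True:
            xp = prox_g(x - gamma * grad, gamma)
            if L(xp) <= L(x) + grad @ (xp - x) + (xp - x) @ (xp - x) / (2 * gamma):
                break
            gamma *= theta
        G = (x - xp) / gamma                                         # gradient mapping
        if gamma * (G @ G) + sigma * (A(x) @ A(x)) <= tau:           # step 4
            return xp, y
        x = xp                                                       # step 3
        sigma = sigma1 * min(                                        # step 5
            1.0 / np.sqrt(k + 1),
            normA1 / max(np.linalg.norm(A(x)), 1e-12)                # guard against division by zero
            * np.log(2) ** 2 / ((k + 1) * np.log(k + 2) ** 2))
        y = y + sigma * A(x)                                         # step 6
    return x, y
\end{verbatim}
The backtracking loop plays the role of the line search in \eqref{eq:defn of gamma line search}, so no Lipschitz constant has to be supplied; in line with the discussion above, the accepted step size typically scales like $1/\lambda_{\b_k}$.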
diff --git a/Journal/sections/appendix.tex b/Journal/sections/appendix.tex index e4ac7fc..51df08a 100644 --- a/Journal/sections/appendix.tex +++ b/Journal/sections/appendix.tex @@ -1,355 +1,360 @@ \section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}} If $A_i$ denotes the $i^{\text{th}}$ component of the map $A:\mathbb{R}^d\rightarrow\mathbb{R}^m$, note that \begin{align} \mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2, \end{align} which implies that \begin{align} \nabla_x \mathcal{L}_\beta(x,y) & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \b \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\ & = \nabla f(x) + \D A(x)^\top y + \b \D A(x)^\top A(x), \end{align} where $\D A(x)$ is the Jacobian of $A$ at $x$. By taking another derivative with respect to $x$, we reach \begin{align} \nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) + \b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top. \end{align} It follows that \begin{align} \|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \| \nabla^2 f(x) \|+ \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) + \beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\ & \le \lambda_f+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|\D A(x)\|^2_F. \end{align} For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that \begin{align} \|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b \max_{\|x\|\le \rho'}\|\D A(x)\|_F^2, \end{align} which completes the proof of Lemma \ref{lem:smoothness}. \section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}} Throughout, let \begin{align} G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g}, \end{align} for short. -Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that +Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho'$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that \begin{align} \L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle + \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\ & = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle + \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+) \label{eq:descent pr 1} \end{align} Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that \begin{align} G - \nabla_x \L_\b(x,y) = \xi \in \partial g(x^+). \label{eq:opt of prox} \end{align} By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that \begin{align} \L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ & = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\ & \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2, \end{align} where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}. \section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}} By optimality of $x^+_{\gamma}$ in \eqref{eq:defn of x+gamma}, we note that \begin{equation} x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = - \g\xi \in -\g \partial g (x^+_{\gamma}). 
\label{eq:optimality of uplus} \end{equation} By definition in \eqref{eq:defn of gamma line search}, $\gamma$ also satisfies \begin{align} & \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\ & \le \mathcal{L}_\beta(x,y) + \left\langle x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y) \right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+) \nonumber\\ & = \mathcal{L}_\beta(x,y) + \left\langle x - x^+_{\gamma} ,\xi \right\rangle - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\ & \le \mathcal{L}_\beta(x,y) -- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g) +- \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) \qquad (\text{convexity of } g) \nonumber\\ -& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g), +& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x), \qquad \text{(see Definition \ref{def:grad map})} \end{align} -which completes the proof of Lemma \ref{lem:eval Lipsc cte} since \eqref{eq:moreover} follows directly from \eqref{eq:defn of gamma line search}. +which completes the proof of Lemma \ref{lem:eval Lipsc cte} since \eqref{eq:moreover} follows directly from \eqref{eq:defn of gamma line search}. \notea{corrected the typo Yura pointed out. } \section{Proof of Lemma \ref{lem:bnd bnd Ak} \label{sec:proof of bnd Ak}} By assumption, we have that \begin{align} \max_{k\in K} \|A(x_k)\| \le \rho, \qquad \max_{k\in K} \| x_k\| \le \rho'. %\qquad \max_{k\in K} \| x_{k+1}-x_k \| \le \rho''. \label{eq:bndness in slater proof} \end{align} From the primal update in \eqref{eq:update uk recall} and the definition of the proximal operator $P_g$~\cite{parikh2014proximal}, it follows that \begin{align} x_{k+1}-x_k + \g_k \nabla f(x_k) + \g_k \D A(x_k)^\top (y_k + \b_k A(x_k) ) \in - \partial g(x_{k+1}), \end{align} where $\partial g(x_{k+1})$ is the subdifferential of $g$ at $x_{k+1}$. After recalling the definition of gradient mapping in \eqref{eq:grad map recall}, the above inclusion can be written as \begin{align} -\frac{ G_k}{\b_k} + \frac{ \nabla f(x_k)}{\b_k} + \frac{ \D A(x_k)^\top y_k }{\b_k} + \D A(x_k)^\top A(x_k) \in - \frac{\partial g(x_{k+1})}{\b_k \g_k}. \label{eq:to be projected} \end{align} Let $\text{cone}(\partial g(x) )^*$ denote the polar of \begin{equation} \cone(\partial g(x)) = \bigcup_{\alpha \ge 0} \alpha \cdot \partial g(x) \subseteq \RR^d. \label{eq:defn of dual cone} \end{equation} By projecting both sides of \eqref{eq:to be projected} onto $\cone(\partial g(x_{k+1}))^*$, we find that \begin{align} & P_{\cone(\partial g(x_{k+1}))^* } \left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k} + \D A(x_k)^\top A(x_k) \right) \nonumber\\ & \in P_{\cone(\partial g(x_{k+1}))^* } \left( - \frac{\partial g(x_{k+1}) }{\b_k \g_k} \right) = \{ 0\}, \label{eq:before Sk} \end{align} where the equality above follows from the duality of $\cone(\partial g(x_{k+1}))^*$ and $\cone(\partial g(x_{k+1}))$. Recall also that the subspace $S\subseteq \RR^d$ by assumption satisfies \begin{align} S \supseteq \bigcup_{k\in K} P_{ \cone( \partial g( x_{k+1}) )^* }\left( \D A(x_{k+1})^\top A(x_{k+1}) \right), \label{eq:defn of S} \end{align} and project both sides of \eqref{eq:before Sk} onto $S$ to reach \begin{align} & P_{S}P_{\cone(\partial g(x_{k+1}))^*}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k} + \D A(x_k)^\top A(x_k) \right) = 0. 
\label{eq:pre norm} \end{align} -By taking the norm and then applying the triangle inequality above, we argue that +Recall that the projection onto a closed convex set is non-expansive, namely, +\begin{align} +\|P_K(a)-P_K(b)\| \le \|a-b\|, +\end{align} +for a closed convex set $K$ and vectors $a,b$, and that the same holds for the orthogonal projection $P_S$. +Applying this property to $P_{\cone(\partial g(x_{k+1}))^*}$ and to $P_S$, and invoking \eqref{eq:pre norm}, we argue that \begin{align} & \left\| P_{S} P_{\cone(\partial g(x_{k+1}))^*}( \D A(x_k)^\top A(x_k) ) \right\| \nonumber\\ & \le \left\| - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{\D A(x_k)^\top y_k}{\b_k} \right\| \qquad \text{(see \eqref{eq:pre norm})}. \label{eq:post norm} \end{align} Combining \eqref{eq:post norm} with the triangle inequality, we find that \begin{align} & \left\| P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_k)^\top A(x_k) ) \right\| \nonumber\\ & \le \frac{1}{\b_k} \left( \|G_k\| + \|\nabla f(x_k) \| + \|\D A(x_k)^\top y_k\| \right) \qquad \text{(triangle inequality)} \nonumber\\ & \le \frac{1}{\b_k} \left( \|G_k\| +\lambda'_f+ \lambda'_A \|y_k\| \right) , \label{eq:slater proof 0} \end{align} where \begin{align} \lambda'_f := \max_{\|x\| \le \rho'} \| \nabla f(x)\|, \qquad \lambda'_A := \max_{\|x\| \le \rho'} \| \D A(x)\|, \end{align} are the local Lipschitz constants of $f$ and $A$. To lower bound the first line of \eqref{eq:slater proof 0}, we invoke the restricted injectivity in Section~\ref{sec:slater}. % % % %must impose a restricted injectivity as follows. %We let $S_{K}$ with orthonormal columns denote a basis for the subspace $S_K$ in \eqref{eq:defn of S}, to simplify the notation. We then assume that $ \nu_A>0$, where %\begin{align} %\nu_A := %\begin{cases} %\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x_{k+1})} ( DA(x)^\top v) \right\|}{\|v\|} \\ %\|v\|\le \rho\\ %\|x\|\le \rho'. %\end{cases} %\label{eq:new slater proof} %\end{align} Indeed, recalling \eqref{eq:new slater defn} and the first bound in \eqref{eq:bndness in slater proof}, for every $k\in K $, we write that \begin{align} & \left\| P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_{k})^\top A(x_k)) \right\| \nonumber\\ & \ge \left\| P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_{k+1})^\top A(x_{k})) \right\| - \left\| (\D A(x_{k+1}) - \D A(x_{k})) ^\top A(x_{k}) \right\| \nonumber\\ & \ge \nu(g,A,S,\rho,\rho') \|A(x_k)\| - \| \D A(x_{k+1})-\D A(x_k) \| \|A(x_k) \|, \qquad \text{(see \eqref{eq:new slater defn})} \label{eq:slater proof 1} \end{align} where the second line above again uses the non-expansiveness of $P_{S}$ and $P_{\cone(\partial g(x_{k+1}))^*}$. The remaining term in \eqref{eq:slater proof 1} is bounded as \begin{align} \| \D A(x_{k+1})-\D A(x_k) \| & \le \lambda_A \|x_{k+1}-x_k\| = \lambda_A \g_k \|G_k\| . 
\qquad \text{(see (\ref{eq:smoothness basic},\ref{eq:bndness in slater proof}))} \end{align} Assuming that \begin{align} \nu(g,A,S,\rho,\rho') \ge 2\lambda_A \max_{k\in K} \g_k \|G_k\|, \end{align} allows us to simplify the last line of \eqref{eq:slater proof 1} as \begin{align} \left\| P_{S} P_{\cone(\partial g(x_{k+1}))^*} ( \D A(x_{k})^\top A(x_k)) \right\| \ge \frac{\nu(g,A,S,\rho,\rho') }{2} \|A(x_k)\|, \end{align} which, after substituting in \eqref{eq:slater proof 0}, yields that \begin{align} \|A(x_k)\| & \le \frac{2}{\b_k\nu(g,A,S,\rho,\rho')} \left( \|G_k\| + \lambda'_f + \lambda'_A \|y_k\| \right), \end{align} and completes the proof of Lemma \ref{lem:bnd bnd Ak}. %\section{Clustering: Verifying the Geometric Regularity} % %Note that %\begin{align} %A(x) = VV^\top \mathbf{1}- \mathbf{1}, %\end{align} %\begin{align} %DA(x) & = %\left[ %\begin{array}{cccc} %w_{1,1} x_1^\top & \cdots & w_{1,n} x_{1}^\top\\ %%x_2^\top p& \cdots & x_{2}^\top\\ %\vdots\\ %w_{n,1}x_{n}^\top & \cdots & w_{n,n}1 x_{n}^\top %\end{array} %\right] \nonumber\\ %& = \left[ %\begin{array}{ccc} %V & \cdots & V %\end{array} %\right] %+ %\left[ %\begin{array}{ccc} %x_1^\top & \\ %& \ddots & \\ %& & x_n^\top %\end{array} %\right], %\label{eq:Jacobian clustering} %\end{align} %%where $I_n\in\RR^{n\times n}$ is the identity matrix, %where $w_{i.i}=2$ and $w_{i,j}=1$ for $i\ne j$. In the last line above, $n$ copies of $V$ appear and the last matrix above is block-diagonal. For $x_k$, define $V_k$ accordingly and let $x_{k,i}$ be the $i$th row of $V_k$. %Consequently, %\begin{align} %DA(x_k)^\top A(x_k) & = %\left[ %\begin{array}{c} %(V_k^\top V_k - I_n) V_k^\top \mathbf{1}\\ %\vdots\\ %(V_k^\top V_k - I_n) V_k^\top \mathbf{1} %\end{array} %\right] \nonumber\\ %& \qquad + %\left[ %\begin{array}{c} %x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ %\vdots \\ %x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n %\end{array} %\right], %\end{align} %where $I_n\in \RR^{n\times n}$ is the identity matrix. %Let us make a number of simplifying assumptions. First, we assume that $\|x_k\|< \sqrt{s}$ (which can be enforced in the iterates by replacing $C$ with $(1-\epsilon)C$ for a small positive $\epsilon$ in the subproblems). Under this assumption, it follows that %\begin{align} %(\partial g(x_k))_i = %\begin{cases} %0 & (x_k)_i > 0\\ %\{a: a\le 0\} & (x_k)_i = 0, %\end{cases} %\qquad i\le d. %\label{eq:exp-subgrad-cluster} %\end{align} %Second, we assume that $V_k$ has nearly orthonormal columns, namely, $V_k^\top V_k \approx I_n$. This can also be enforced in each iterate of Algorithm~\ref{Algo:2} and naturally corresponds to well-separated clusters. While a more fine-tuned argument can remove these assumptions, they will help us simplify the presentation here. 
Under these assumptions, the (squared) right-hand side of \eqref{eq:regularity} becomes %\begin{align} %& \dist\left( -DA(x_k)^\top A(x_k) , \frac{\partial g(x_k)}{ \b_{k-1}} \right)^2 \nonumber\\ %& = \left\| \left( -DA(x_k)^\top A(x_k) \right)_+\right\|^2 %\qquad (a_+ = \max(a,0)) % \nonumber\\ %& = %\left\| %\left[ %\begin{array}{c} %x_{k,1} (V_k V_k^\top \mathbf{1}- \mathbf{1})_1 \\ %\vdots \\ %x_{k,n} (V_k V_k^\top \mathbf{1}- \mathbf{1})_n %\end{array} %\right] %\right\|^2 %\qquad (x_k\in C \Rightarrow x_k\ge 0) %\nonumber\\ %& = \sum_{i=1}^n \| x_{k,i}\|^2 (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ %& \ge \min_i \| x_{k,i}\|^2 %\cdot \sum_{i=1}^n (V_kV_k^\top \mathbf{1}-\mathbf{1})_i^2 \nonumber\\ %& = \min_i \| x_{k,i}\|^2 %\cdot \| V_kV_k^\top \mathbf{1}-\mathbf{1} \|^2. %\label{eq:final-cnd-cluster} %\end{align} %Therefore, given a prescribed $\nu$, ensuring $\min_i \|x_{k,i}\| \ge \nu$ guarantees \eqref{eq:regularity}. When the algorithm is initialized close enough to the constraint set, there is indeed no need to separately enforce \eqref{eq:final-cnd-cluster}. In practice, often $n$ exceeds the number of true clusters and a more intricate analysis is required to establish \eqref{eq:regularity} by restricting the argument to a particular subspace of $\RR^n$. \section{Proof of Proposition \ref{prop:convex} \label{sec:proof of prop}} To prove the first claim of the proposition, suppose that the Slater's condition does not hold, namely, suppose that \begin{equation} \Null(A)\cap \relint(C) = \emptyset, \label{eq:no feas} \end{equation} where $\Null(A)$ and $\relint(C)$ denote the null space of the matrix $A$ and the relative interior of $C$, respectively. We have assumed that \eqref{prob:01} is feasible, namely, there exists $x\in C$ such that $Ax=0$. It follows from \eqref{eq:no feas} that $x\in \text{boundary}(C)$ and that $\Null(A)$ supports $C$ at $x$, namely, $A x'\ge 0$, for every $x'\in C$. (The inequality $Ax'\ge 0$ applies to each entry of the vector $Ax'$.) Consequently, $\Null(A) \cap T_C(x) \ne \{0\}$, where $T_C(x)$ is the tangent cone of the set $C$ at $x$. Equivalently, it holds that $\row(A)\cap N_C(x) \ne \{0\}$, where $\row(A)$ is the row space of the matrix $A$ and $N_C(x)$ is the normal cone to $C$ at $x$. That is, there exists a unit-norm vector $v$ such that $P_{T_C(x)}A^\top v=0$ and, consequently, $P_S P_{T_C(x)}A^\top v=0$. Let us take $\rho'=\|x\|$ in~\eqref{eq:nu for cvx}. We then conclude that $$ \widetilde{\nu}(g,A,S,1,\|x\|)=\widetilde{\nu}(g,A,S,\infty,\|x\|)=0, $$ namely, the geometric regularity also does not hold for any $\rho\ge 0$ and $\rho'=\|x\|$. The first identity above follows from the homogeneity of the right-hand side of \eqref{eq:nu for cvx}. Because increasing $\rho'$ cannot increase the right-hand side of \eqref{eq:nu for cvx}, we find that $\widetilde{\nu}(g,A,S,\infty,\max_{x\in C} \|x\|)=0$, which proves the first claim in Proposition~\ref{prop:convex}. For the converse, we can verify that it suffices to take $\text{row}(A) \subseteq S$. Next, recall \eqref{eq:nu for cvx} and suppose that there exists $x\in \RR^d$ such that \begin{align} \eta_{\min} ( P_S P_{T_C(x)} A^\top ) =0. \label{eq:counter-assump} \end{align} Throughout, we assume without loss of generality that $x\in C$. Indeed, otherwise $x$ would not be feasible for problem \eqref{prob:01} with $g=1_C$ and could not be used to study the Slater's condition in \eqref{prob:01}. 
Note that, by assumption in Proposition~\ref{prop:convex}, \eqref{eq:counter-assump} can be rewritten as \begin{align} \eta_{\min}(P_S P_{T_C(x)} A^\top) & = \eta_{\min}( P_{T_C(x)} A^\top) =0, \qquad (S = \text{aff}(C)) \label{eq:counter-assump-rewritten} \end{align} where $\text{aff}(C)$ is the affine hull of $C$. Then, since $\|x\| \le \rho'<\infty$ in \eqref{eq:nu for cvx}, we can assume throughout that $\text{boundary}(C) \cap B_{\rho'}\ne \emptyset$ and, moreover, $x\in \text{boundary}(C)$. (Here, $B_{\rho'}=\{z : \|z\|\le \rho' \} $ is the ball of radius $\rho'$ centered at the origin.) Indeed, if instead $x\in \text{relint}(C)$, then we have that $T_C(x)=S$ and thus \begin{align*} \eta_{\min}(P_{S}P_{T_C(x)} A^\top) & = \eta_{\min}( P_{T_C(x)} A^\top) \qquad (S = \text{aff}(C) ) \nonumber\\ & = \eta_{\min}( A^\top) \qquad ( \text{row}(A) \subseteq S) \nonumber\\ & >0, \end{align*} which contradicts \eqref{eq:counter-assump}. The last line above holds because, by assumption in Proposition~\ref{prop:convex}, $A$ is full-rank. To reiterate, we assume that $x\in \text{boundary}(C)$. Therefore, by \eqref{eq:counter-assump-rewritten}, there exists a unit-norm $u\in \row(A)$ such that $u\in N_C(x)$. In turn, this implies that $\text{null}(A) \cap \text{int}(C)=\emptyset$. Indeed, otherwise, any vector $v\in \text{null}(A) \cap \text{int}(C)$ satisfies $ \langle u, v \rangle <0$, which is impossible because $u\in \text{row}(A)$ and $v\in \text{null}(A)$ are orthogonal vectors. That is, the Slater's condition does not hold, which proves the second claim in Proposition~\ref{prop:convex}. diff --git a/Journal/sections/slater.tex b/Journal/sections/slater.tex index 7e9b77a..fd60d38 100644 --- a/Journal/sections/slater.tex +++ b/Journal/sections/slater.tex @@ -1,121 +1,121 @@ \section{Geometric Regularity \label{sec:slater}} The Slater's condition plays a key role in convex optimization as a sufficient condition for strong duality~\cite{boyd2004convex}. The Slater's condition is vital to the success of a variety of primal-dual algorithms for constrained convex programming. As a visual example, when $f\equiv 0$, $g=1_C$ is the indicator function of a bounded convex set $C\subset\RR^d$, and $A$ is a linear operator, then \eqref{prob:01} reduces to the feasibility problem of finding a point in $C\cap \text{null}(A)$, where the subspace $\text{null}(A)$ is the null space of $A$. In this case, the Slater's condition removes pathological cases, such as the one in Figure~\ref{fig:convex_slater}, by ensuring that $\text{null}(A)$ is not tangent to $C$. \begin{figure}[H] \begin{center} \includegraphics[scale=.5]{figures/Slater.pdf} -\caption{Solving (1) can be difficult even when it is a convex program. As an example, this figure shows a pathological case described in Section~\ref{sec:slater}, where the Slater’s condition does is not met. Here, for example, the alternating projection algorithm to solve \eqref{prob:01} converges at the rate of $O(1/\sqrt{k})$, in contrast to the (much faster) eventual linear convergence if the Slater's condition was met, namely, if $\text{null}(A)$ intersected the interior of $C$.} +\caption{Solving (1) can be difficult even when it is a convex program. As an example, this figure shows a pathological case described in Section~\ref{sec:slater}, where the Slater’s condition is not met.
Here, for example, the alternating projection algorithm to solve \eqref{prob:01} converges at the rate of $O(1/\sqrt{k})$, in contrast to the (much faster) eventual linear convergence if the Slater's condition were met, namely, if $\text{null}(A)$ intersected the interior of $C$.} \label{fig:convex_slater} \end{center} \end{figure} \noindent Likewise, to successfully solve problem~\eqref{prob:01} in the presence of nonlinear constraints, we require the following condition which, loosely speaking, extends the Slater's condition to the nonconvex setting. %, as clarified shortly afterwards. This geometric condition is also closely related to conditions in the existing literature of nonconvex programming, as we discuss later in this section.\footnote{An algorithm-dependent variant of Definition~\ref{defn:nonconvex slater} was studied with a limited scope in our earlier work~\cite{sahin2019inexact} in the context of inexact primal-dual algorithms. \notea{should we say more?} } %, in a sense, also extends the uniform regularity, introduced in %\cite[Definition 2.3]{bolte2018nonconvex} %, \begin{definition}\label{defn:nonconvex slater} \textbf{{(Geometric regularity)}} In problem~(\ref{prob:01}) with $m\le d$, for $\rho,\rho'>0$ and subspace $S\subset \RR^{d}$, let \begin{align} \nu(g,A,S,\rho,\rho') := \begin{cases} \underset{x\in \mathbb{R}^d}{\min} \, \frac{\left\| P_S P_{\cone(\partial g(x))^*} ( \D A(x)^\top \cdot A(x)) \right\|}{\|A(x)\|} \\ \text{subject to } 0< \|A(x)\|\le \rho, \, \|x\|\le \rho', \end{cases} \label{eq:new slater defn} \end{align} where $\cone(\partial g(x))$ is the conic hull of the subdifferential $\partial g(x)$ and $P_{\cone(\partial g(x))^*}$ projects onto the polar of this cone, see (\ref{eq:polar},\ref{eq:conic}). Moreover, $\D A(x)$ is the Jacobian of $A$. We say that (\ref{prob:01}) satisfies the geometric regularity if $\nu(g,A,S,\rho,\rho') >0$. \end{definition} Throughout, we will occasionally suppress the dependence of $\nu$ on some of its parameters to lighten the notation. A few remarks about geometric regularity are in order. \paragraph{\textbf{\emph{Jacobian $\D A$.}}} Let $\D A(x)^\top \overset{\operatorname{QR}}{=} Q(x) R(x)$ be the QR decomposition of $\D A(x)^\top$. As we will see shortly, $\D A(x)^\top $ in \eqref{eq:new slater defn} may be replaced with its orthonormal basis $Q(x)$ to broaden the applicability of the geometric regularity to the cases where $\D A(x)$ might be rank-deficient. Definition~\ref{defn:nonconvex slater} avoids this additional layer of complexity; instead, whenever needed, we will assume that $\D A(x)$ is nonsingular for every $x$. Alternatively, a simple QR decomposition can also remove any redundancies from $A(x)=0$ in the constraints of~\eqref{prob:01}. -\paragraph{\textbf{\emph{Subspace $S$.}}} Role of the subspace $S$ in \eqref{eq:new slater defn} is also to broadens the applicability of the geometric regularity. In particular, when $S = \RR^{d}$, the Moreau decomposition~\cite{moreau1962decomposition} allows us to simplify \eqref{eq:new slater defn} as +\paragraph{\textbf{\emph{Subspace $S$.}}} The role of the subspace $S$ in \eqref{eq:new slater defn} is also to broaden the applicability of the geometric regularity.
In particular, when $S = \RR^{d}$, the Moreau decomposition~\cite{moreau1962decomposition} allows us to simplify \eqref{eq:new slater defn} as \begin{align} \nu(g,A,S,\rho,\rho') := \begin{cases} \underset{x}{\min} \, \frac{ \dist\left( \D A(x)^\top \cdot A(x) , \cone(\partial g(x))\right) }{\|A(x)\|} \\ \text{subject to } 0< \|A(x)\|\le \rho, \|x\|\le \rho', \end{cases} \label{eq:defn_nu_A} \end{align} where $\dist(\cdot,\cone(\partial g(x)))$ returns the Euclidean distance to $\cone(\partial g(x))$. \paragraph{\emph{\textbf{Convex case.}}} To better parse Definition \ref{defn:nonconvex slater}, let us consider the specific example where $f:\RR^d\rightarrow\RR$ is convex, $g = 1_C$ is the indicator function for a bounded convex set $C\subset \RR^d$, and $A$ is a nonsingular linear operator represented with the full-rank matrix $A\in \RR^{m\times d}$, allowing some abuse of notation. We also let $T_C(x)$ denote the tangent cone to $C$ at $x$ \cite{rockafellar2015convex}, and reserve $P_{T_C(x)}:\RR^d\rightarrow\RR^d$ for the orthogonal projection onto this cone. %Assume also that $S=\RR^d$ for clarity. %We also set $S_K = \RR^d$ to simplify the discussion. We can now study the interpretation of geometric regularity in this setting. Using the Moreau decomposition, it is not difficult to verify that \begin{align} \nu(1_C,A,S,\rho,\rho') & \ge \begin{cases} \underset{v,x}{\min} \, \, \frac{\left\| P_S P_{T_C(x)} A^\top v \right\|}{\|v\|} \\ \text{subject to } 0< \|v\|\le \rho, \|x\|\le \rho' \end{cases} \nonumber\\ & = \begin{cases} \underset{x}{\min} \,\, \eta_{\min}\left( P_S P_{T_C(x)} A^\top \right) \\ \text{subject to } \|x\|\le \rho' \end{cases} \nonumber\\ & =: \widetilde{\nu}(1_C,A,S,\rho,\rho'), \label{eq:nu for cvx} \end{align} where $\eta_{\min}(\cdot)$ returns the smallest singular value of its input matrix. Then, loosely speaking, geometric regularity ensures that the row span of $A$ is not tangent to $C$, similar to the Slater's condition; see Figure~\ref{fig:convex_slater}. This close relationship between geometric regularity and the Slater's condition is formalized next and proved in Appendix~\ref{sec:proof of prop}. \notea{double check the proof...} \begin{proposition}\label{prop:convex} In (\ref{prob:01}), suppose that \begin{itemize} \item $f:\RR^d\rightarrow\RR$ is convex, \item $g=1_C$ is the indicator on a convex set $C\subset\RR^d$, \item $A:\RR^d\rightarrow\RR^m$ is a nonsingular linear operator, represented with the full-rank matrix $A\in \RR^{m\times d}$,\footnote{As mentioned earlier, it is easy to remove the full-rank assumption by replacing $\D A(x)$ in \eqref{eq:new slater defn} with its orthonormal basis. We assume $A$ to be full-rank for the sake of clarity at the cost of a simple QR decomposition to remove any ``redundant measurements'' from problem~\eqref{prob:01}.} \item and the problem is feasible, namely, there exists $x\in C$ such that $Ax=0$. \end{itemize} Then, \begin{itemize} \item problem (\ref{prob:01}) satisfies the Slater's condition if there exists a subspace $S\subseteq \RR^d$ such that $\widetilde{\nu}(1_C,A,S,\infty,\max_{x\in C}\|x\|)>0$, see also (\ref{eq:nu for cvx}). \item Moreover, suppose that $S$ is the affine hull of $C$. Then, (\ref{prob:01}) satisfies the Slater's condition if and only if $\widetilde{\nu}(1_C,A,S,\infty,\max_{x\in C}\|x\|)>0$.
\end{itemize} \end{proposition} \paragraph{\emph{\textbf{Beyond the Slater's condition.}}} One might also expect $\nu$ in Definition~\ref{defn:nonconvex slater} to carry some information about the convergence speed of an algorithm. For example, suppose in problem \eqref{prob:01} that $f\equiv 0$, $g=1_C$ with convex set $C$ specified below, and $A$ is a linear operator. We also take $m=1$, so that $A$ can be represented with a $1\times d$ row vector. For a small perturbation vector $\varepsilon\in \RR^{1\times d}$, let $$C=\{x\in \RR^d: (A+\varepsilon) x\ge 0\}$$ be a half-space. Then the Slater's condition holds regardless of the perturbation level $\|\varepsilon\|$. However, while positive, $\nu(1_C,A,\RR^d,\infty,\rho')$ can be made arbitrarily small by taking $\|\varepsilon\|$ sufficiently small, which in turn hints at the arbitrarily slow convergence of the alternating projection algorithm to solve~\eqref{prob:01}; a concrete two-dimensional instance of this phenomenon is worked out below. %Put differently, unlike the Slater's condition, $\nu(g,A,\RR^d)$ contains information about the convergence rate, as we will see shortly. -\paragraph{\emph{\textbf{Relation to prior work.}}} +\paragraph{\emph{\textbf{Relation to prior work.}}} \notea{this part has to be updated somewhat.} Geometric regularity is closely related to conditions in the existing literature. In the special case where $g=0$ in~\eqref{prob:01}, it is easy to verify that geometric regularity in Definition~\ref{defn:nonconvex slater} with $S=\mathbb{R}^d$ and $\rho'=\infty$ implies the Polyak-Lojasiewicz (PL) condition~\cite{karimi2016linear} for minimizing $\|A(x)\|^2$. The PL condition is itself a special case of the Kurdyka-Lojasiewicz condition; see \cite[Definition 1.1]{xu2017globally}. When $g=0$, it is also easy to see that the geometric regularity is weaker than the Mangasarian-Fromovitz (MF) condition in nonlinear optimization; see \cite[Assumption A]{bolte2018nonconvex} and~\cite{bertsekas2014constrained}. {Moreover, {when $g$ is the indicator on a convex set,} the geometric regularity is a consequence of the \textit{basic constraint qualification} in \cite{rockafellar1993lagrange}. %, which itself generalizes the MF condition to the case when $g$ is an indicator function of a convex set. } %\notea{I'm not sure if the claim about basic constraint qualification is true because our condition should locally hold rather globally. Could you add the exact equation number in [55] and double check this? Also consider double checking other claims in this paragraph.} We may think of the geometric regularity in Definition~\ref{defn:nonconvex slater} as a local condition, which might hold within a neighborhood of the constraint set $\{x:A(x)=0\}$ rather than everywhere in $\mathbb{R}^d$. Figure~\ref{fig:counter} gives an example of this local nature of the geometric regularity. There is a constant-complexity algorithm in \cite{bolte2018nonconvex} to reach this so-called ``information zone'', which supplements Theorem~\ref{thm:main}. %Lastly, in contrast to most conditions in the nonconvex optimization literature, such as~\cite{flores2012complete}, the condition in~\eqref{eq:regularity} appears to be easier to verify, as we see in the sequel. To close, let us point out that, unlike the bulk of the nonconvex programming literature, we can verify the geometric regularity in Definition~\ref{defn:nonconvex slater} for a number of important examples. %Moreover, this condition is closely related to the existing ones in the literature of nonconvex programming, as detailed in Section~\ref{sec:holal}.
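Returning to the half-space example above, the following back-of-the-envelope computation makes the claim concrete; the particular dimension, perturbation, and test point are our own illustrative choices and are not part of the formal development.
% Illustrative sketch only: d=2, A=[1 0], eps=[0 delta]; any boundary point of C with A x nonzero exhibits the same scaling in delta.
Take $d=2$, $A=[1\;\;0]$, and $\varepsilon=[0\;\;\delta]$ for a small $\delta>0$, so that $C=\{x\in\RR^2: x_1+\delta x_2\ge 0\}$. At the boundary point $\bar{x}=(-\delta,1)$, we have $A\bar{x}=-\delta\ne 0$, $\cone(\partial 1_C(\bar{x}))=N_C(\bar{x})=\{-t(1,\delta)^\top : t\ge 0\}$, and $\D A(\bar{x})^\top A\bar{x}=A^\top A\bar{x}=(-\delta,0)^\top$, so that
\begin{align*}
\frac{\dist\left( A^\top A\bar{x}, \cone(\partial 1_C(\bar{x})) \right)}{\|A\bar{x}\|}
= \frac{\delta^2/\sqrt{1+\delta^2}}{\delta}
= \frac{\delta}{\sqrt{1+\delta^2}} \le \delta.
\end{align*}
By \eqref{eq:defn_nu_A}, it follows that $\nu(1_C,A,\RR^2,\infty,\rho')\le \delta$ whenever $\rho'\ge \|\bar{x}\|=\sqrt{1+\delta^2}$, namely, $\nu$ vanishes with the perturbation, even though the Slater's condition holds for every $\delta>0$ because, for instance, $(0,1)\in \text{null}(A)\cap\text{int}(C)$.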
In this work, we will focus on instances of problem \eqref{prob:01} that satisfy the geometric regularity. To solve such problems, the next section introduces and studies the HoLAL algorithm. \begin{figure} %\includegraphics[scale=1]{local} \caption{This figure shows that geometric regularity in Definition~\ref{defn:nonconvex slater} is often a local property. More specifically, in problem~\eqref{prob:01}, suppose that $f\equiv 0$, $g=1_C$ is the indicator function on the disk $C$, and $A$ is a linear operator with the null space depicted in the figure. Then the objective in~\eqref{eq:new slater defn}, evaluated at the red point, is zero. To ensure that the geometric regularity holds ($\nu(1_C,A,\mathbb{R}^d,\rho,\rho')>0$), we must choose a small enough $\rho$. That is, the geometric regularity holds when the feasibility gap $\|A(x)\|$ is sufficiently small. \notea{add the figure!}\label{fig:counter}} \end{figure}
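For a concrete instance of the configuration sketched in Figure~\ref{fig:counter}, consider the following; the particular disk and linear operator are our own illustrative choices, made to match the caption above.
% Illustrative sketch only: the disk below and A=[0 1] play the roles of C and A in Figure fig:counter, and the point (0,2) plays the role of the red point.
Let $C=\{x\in\RR^2 : x_1^2+(x_2-1)^2\le 1\}$ be the unit disk tangent to the horizontal axis at the origin, and let $A=[0\;\;1]$, so that $A(x)=x_2$, $\text{null}(A)$ is the horizontal axis, and the origin is the only feasible point. At the antipodal point $\bar{x}=(0,2)$, which plays the role of the red point, we have $\|A(\bar{x})\|=2>0$ and $A^\top A\bar{x}=(0,2)^\top\in N_C(\bar{x})$, so the distance in \eqref{eq:defn_nu_A} vanishes and, consequently, $\nu(1_C,A,\RR^2,\rho,\rho')=0$ whenever $\rho,\rho'\ge 2$. In contrast, restricting the feasibility gap excludes a neighborhood of $\bar{x}$: parametrizing the boundary of $C$ as $x(\theta)=(\sin\theta,1-\cos\theta)$ and minimizing over the points with $0<\|A(x)\|\le\rho<2$ and $\|x\|\le \rho'$, one can check that
\begin{align*}
\nu(1_C,A,\RR^2,\rho,\rho')\ge \min\left\{1,\sqrt{\rho(2-\rho)}\right\}>0,
\end{align*}
in agreement with the caption of Figure~\ref{fig:counter}: the geometric regularity holds once $\rho$ is taken sufficiently small.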