diff --git a/tex/Paper/sections/ADMM.tex b/tex/Paper/sections/ADMM.tex
index f33be09..ac55ec3 100644
--- a/tex/Paper/sections/ADMM.tex
+++ b/tex/Paper/sections/ADMM.tex
@@ -1,161 +1,161 @@
\section{Linearized ADMM}
Consider the program
\begin{align}
\begin{cases}
\underset{x,z}{\min} \,\, f(x)+g(x)+h(z)+l(z)\\
A(x)+B(z) = 0,
\end{cases}
\label{eq:admm 2}
\end{align}
where $f,h:\RR^d\rightarrow\RR$ and $A,B:\RR^d\rightarrow\RR^m$ are smooth in the sense that
\begin{align*}
\| \nabla f(x) - \nabla f(x') \| \le \lambda_f \|x-x'\|,
\,\,
\| DA(x) - DA(x') \| \le \lambda_A \|x-x'\|,
\end{align*}
\begin{align}
\| \nabla h(z) - \nabla h(z') \| \le \lambda_h \|z-z'\|,
\,\,
\| DB(z) - DB(z') \| \le \lambda_B \|z-z'\|,
\label{eq:smoothness basics admm}
\end{align}
for every $x,x',z,z'\in \RR^d$. Above, $g,l:\RR^d\rightarrow\RR$ are convex functions. For example, when $C,D\subset \RR^d$ and $g=1_C, l=1_D$ are the indicator functions of $C,D$, respectively, Program~\eqref{eq:admm 2} becomes
\begin{align}
\begin{cases}
\underset{x,z}{\min} \,\, f(x)+h(z)\\
A(x)+B(z) = 0\\
x\in C\\
z\in D.
\end{cases}
\label{eq:admm 2 indicator}
\end{align}
The augmented Lagrangian corresponding to Program~\eqref{eq:admm 2} is
\begin{align}
\L_{\b}(x,z,y) & = f(x)+g(x)+h(z)+l(z) \nonumber\\
& \qquad + \langle A(x)+B(z),y \rangle + \frac{\b}{2}\| A(x)+B(z) \|^2,
\label{eq:lagrangian admm}
\end{align}
and Program \eqref{eq:admm 2} is therefore equivalent to the minimax program
\begin{align}
\underset{x,z}{\min} \underset{y}{\max}\,\, \L_\b(x,z,y).
\label{eq:admm main}
\end{align}
To solve the equivalent formulation in Program \eqref{eq:admm main}, we propose the linearized ADMM, detailed in Algorithm~\ref{Algo:3}. To parse this algorithm, we need to slightly adjust the gradient map in Definition~\ref{def:grad map} and the line search procedure in Lemma~\ref{lem:eval Lipsc cte} to match $\L_\b$ in \eqref{eq:lagrangian admm}. More specifically, the corresponding gradient maps are defined as
\begin{equation}
G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma},
\qquad
H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota},
\label{eq:grad map admm}
\end{equation}
where
\begin{align}
x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y)),
\qquad
z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y)),
\end{align}
and $\g,\iota$ are the primal step sizes. The line search procedure is similar to Lemma~\ref{lem:eval Lipsc cte}: we set
\begin{align*}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)),
\end{align*}
\begin{align}
z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)),
\end{align}
\begin{align}
\gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\
& \qquad \qquad \le \mathcal{L}_\beta (x,z,y)
+\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\},
\label{eq:line search x admm}
\end{align}
\begin{align}
\iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\
& \qquad \qquad \le \mathcal{L}_\beta (x,z,y)
+\left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}.
\label{eq:line search z admm}
\end{align}

\begin{algorithm}
\label{Algo:3}
Input: Parameters $\beta_1,\s_1,\rho,\rho',\rho'',\tau> 0$, initialization $x_{1},z_1\in \RR^d$ with $\|A(x_1)+B(z_1)\|\le \rho$ and $\|x_1\|,\|z_1\|\le \rho'$, dual initialization $y_0\in \RR^m$. \\
For $k=1,2,\ldots$, execute\\
\begin{enumerate}
\item \textbf{(Update penalty weight)} Set
\begin{align}
\beta_k = \frac{\beta_{1}}{\log 2}\sqrt{k \log^2(k+1)}.
\end{align}
\item \textbf{(Line search in $x$)} Use the line search procedure in \eqref{eq:line search x admm} by replacing $x=x_k,z=z_k,y=y_k,\b=\b_k$ and let $\g_k = \g$ be the output.\\
\item \textbf{(Descent step in $x$)} Set $ x_{k+1} = P_{g }(x_{k} - \gamma_{k} \nabla_x \mathcal{L}_{\beta_{k}}(x_{k},z_k,y_{k}))$, where $\L_{\b_k}$ is the augmented Lagrangian and $P_g$ denotes the proximal operator, defined in (\ref{eq:lagrangian admm},\ref{eq:defn of prox}), respectively.\\
\item \textbf{(Line search in $z$)} Use the line search procedure in \eqref{eq:line search z admm} by replacing $x=x_{k+1},z=z_k,y=y_k,\b=\b_k$ and let $\iota_k = \iota$ be the output.\\
\item \textbf{(Descent step in $z$)} Set $ z_{k+1} = P_{l }(z_{k} - \iota_{k} \nabla_z \mathcal{L}_{\beta_{k}}(x_{k+1},z_k,y_{k}))$, where $P_l$ denotes the proximal operator for $l$.\\
\item \textbf{(Control)} Properly modify $x_{k+1},z_{k+1}$ to ensure that $\|x_{k+1}\|,\|z_{k+1}\|\le \rho'$ and $\|x_{k+1}-x_k\|\le \rho''$.\\
\item \textbf{(Stopping criterion)} If $\g_k \|G_{\b_k,\g_k}(x_k,z_k,y_k)\|^2+ \iota_k \|H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)\|^2 + \s_k \|A(x_k)+B(z_k)\|^2 \le \tau$, then quit and return $x_{k+1},z_{k+1}$ as approximate minimizers of Program \eqref{eq:admm 2}. See \eqref{eq:grad map admm} for the definition of the gradient mappings. \\
\item \textbf{(Update dual step size)} Set
\begin{align}
\s_{k+1} = \s_{1} \min\left( {\frac{1}{\sqrt{k+1}}}, \frac{\|A(x_1)+B(z_1)\|}{\|A(x_{k+1})+B(z_{k+1})\|}\cdot \frac{ \log^2 2 }{(k+1)\log^2(k+2)} \right).
\end{align}\\
\item \textbf{(Dual ascent step)} Set $y_{k+1} = y_{k} + \sigma_{k+1}(A(x_{k+1})+B(z_{k+1}))$.
\end{enumerate}
\caption{Linearized ADMM for solving Program \eqref{eq:admm main}}
\end{algorithm}

\begin{theorem}[Linearized ADMM]
\label{thm:main admm}
For a sufficiently large $k_0$, consider an integer interval $K=[k_0:k_1]$, and suppose that
\begin{align}
\inf_{k\ge k_0} f(x_k) + g(x_k)+ h(z_k) + l(z_k) + \langle A(x_k)+B(z_k) ,y_{k_0}\rangle >- \infty,
\end{align}
and that $\rho$ is large enough such that (\ref{eq:suff cnd on rho final}) holds. For $\rho'>0$, in addition to the smoothness of $f,h,A,B$ quantified in \eqref{eq:smoothness basics admm}, let us define
\begin{align}
\lambda'_f = \max_{\|x\|\le \rho'} \|\nabla f(x)\|,
\qquad
\lambda'_A = \max_{\|x\| \le \rho'} \|DA(x)\|,
\qquad
\lambda'_B = \max_{\|z\|\le \rho'} \|DB(z)\|,
\end{align}
to be the (restricted) Lipschitz constants of $f,A,B$, respectively. Moreover, suppose that the nonconvex Slater's condition, namely, Lemma \ref{lem:bnd bnd Ak}, is in force, and
\begin{align}
\nu_A \ge 2\lambda_A \rho''.
\end{align}
Then the output of linearized ADMM in Algorithm~\ref{Algo:3} satisfies
\textbf{we need a better expression below to better show the dependence on various parameters.}
\begin{align}
& \min_{k\in K}\,\, \Bigg( \min(\overline{\g},\overline{\iota}) \sqrt{\frac{k_0 \log^2(k_0+1)}{k_1\log^2(k_1+1)}} \left( \|G_k\|^2 + \|H_k\|^2 \right) \nonumber\\
& \qquad \qquad + \|A(x_k)+B(z_k)\|^2 \Bigg) \lesssim \frac{1}{k_1-k_0},
\end{align}
where $\overline{\gamma},\overline{\iota}$ are independent of $k_1$ and $\lesssim$ suppresses the dependence on all parameters except $k_1$. The exact expressions in terms of $\lambda_f,\lambda'_f,\lambda_A,\lambda'_A, \b_{k_0},\s_{k_0}, \nu_A,y_0$ are found in (\ref{eq:raw bnd on sum admm},\ref{eq:lower_band_iotas_admm}).
\end{theorem}
diff --git a/tex/Paper/sections/ADMM_theory.tex b/tex/Paper/sections/ADMM_theory.tex
index c93e0e4..5203e5f 100644
--- a/tex/Paper/sections/ADMM_theory.tex
+++ b/tex/Paper/sections/ADMM_theory.tex
@@ -1,195 +1,195 @@
\section{Proof of Theorem \ref{thm:main admm} \label{sec:ADMM theory}}
Let us restate the technical lemmas and definitions of Section \ref{sec:preliminaries}, slightly adjusted for the augmented Lagrangian \eqref{eq:lagrangian admm} of Program~\eqref{eq:admm main}. These standard results are stated without proof.
\begin{lemma}[Smoothness]\label{lem:smoothness admm}
For $\rho,\rho'\ge 0$, it holds that
\begin{align*}
\| \nabla_x \mathcal{L}_{\beta}(x, z,y)- \nabla_x \mathcal{L}_{\beta}(x', z, y) \| \le \lambda_{\b,z} \|x-x'\|,
\end{align*}
\begin{align}
\| \nabla_z \L_\b(x,z,y) - \nabla_z \L_\b(x,z',y) \| \le \lambda_{\b,x} \|z-z'\|,
\end{align}
for every $y\in\RR^m$ and every $x,x',z,z'$ such that $\|A(x)+B(z)\| \le \rho$ and $\|x\|,\|x'\|,\|z\|,\|z'\|\le \rho'$, where
\begin{align*}
\lambda_{\beta,z} & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A,
\end{align*}
\begin{align}
\lambda_{\beta,x} & \le \lambda_h + \sqrt{m}\lambda_B \left(\|y\| + \b\rho \right) + \b d \lambda'^2_B.
\label{eq:smoothness of Lagrangian admm}
\end{align}
Above, $\lambda_f,\lambda_A,\lambda_h,\lambda_B$ were defined in \eqref{eq:smoothness basics admm} and
\begin{align}
\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|,
\qquad
\lambda'_B := \max_{\|z\|\le \rho'}\|DB(z)\|.
\end{align}
\end{lemma}
\begin{definition}\label{def:grad map admm}\textbf{{(Gradient Mapping)}}
Given $x,z\in \RR^d$ and $\gamma,\iota >0$, the gradient mappings $G_{\beta,\gamma}(\cdot,z, y), H_{\b,\iota}(x,\cdot,y): \RR^d\rightarrow\RR^d$ take, respectively, $x,z\in \RR^d$ to
\begin{equation}
G_{\b,\gamma}(x,z,y) = \frac{x-x^+}{\gamma},
\qquad
H_{\b,\iota}(x,z,y) = \frac{z-z^+}{\iota},
\end{equation}
where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,z,y))$ and $z^+=P_{l}(z-\iota \nabla_z \mathcal{L}_ {\beta}(x,z,y))$.
\end{definition}
\begin{lemma}[Descent lemma]\label{lem:11 admm}
For $x,z\in \RR^d$ and $y\in \RR^m$, let $x^+,z^+$ be as in Definition \ref{def:grad map admm} with $\g<1/\lambda_{\b,z}$ and $\iota<1/\lambda_{\b,x}$. For $\rho,\rho'\ge 0$, suppose that
\begin{align}
\|A(x)+B(z)\|, \|A(x^+)+B(z^+)\| \le \rho,
\qquad
\|x\|,\|x^+\|,\|z\|,\|z^+\| \le \rho'.
\end{align}
Then it holds that
\begin{equation*}
\| G_{\beta,\gamma}(x,z,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,z,y) + g (x)- \mathcal{L}_\beta(x^+,z,y)- g(x^+) ),
\end{equation*}
\begin{align}
\| H_{\beta,\iota}(x,z,y)\|^{2}\leq \frac{2}{\iota} (\mathcal{L}_\beta(x,z,y) + l (z)- \mathcal{L}_\beta(x,z^+,y)- l(z^+) ).
\label{eq:deslem admm}
\end{align}
\end{lemma}
\begin{lemma}[Line search] \label{lem:eval Lipsc cte admm}
Fix $\theta \in (0,1)$ and ${\gamma}_0,\iota_0>0$. For $\gamma',\iota'>0$, let
\begin{align}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,z,y)),
\qquad
z^+_{\iota'} = P_l(z - \iota' \nabla_z \mathcal{L}_\beta(x,z,y)),
\end{align}
and define
\begin{align*}
\gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},z,y) \nonumber\\
& \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y)
+\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\},
\end{align*}
\begin{align}
\iota & := \max \Big\{ \iota' ={\iota}_0 \theta^i : \mathcal{L}_\beta (x,z^+_{\iota'},y) \nonumber\\
& \qquad \qquad \quad \le \mathcal{L}_\beta (x,z,y)
+\left\langle z^+_{\iota'} - z , \nabla_z \mathcal{L}_{\beta}(x,z,y) \right\rangle
+ \frac{1}{2\iota'} \| z^+_{\iota'} - z\|^2 \Big\}.
\label{eq:defn of gamma line search}
\end{align}
Then, (\ref{eq:deslem admm}) holds and, moreover, we have that
\begin{align}
\gamma \ge \frac{\theta}{\lambda_{\beta,z}},
\qquad
\iota \ge \frac{\theta}{\lambda_{\beta,x}}.
\label{eq:moreover}
\end{align}
\end{lemma}
For the reader's convenience, let us also recall the updates of the linearized ADMM algorithm at iteration $k$:
\begin{align}
x_{k+1}& =P_g(x_k -\g_k \nabla_x \L_{\b_k} (x_k,z_k,y_k)),\nonumber\\
z_{k+1}& =P_l(z_k - \i_k \nabla_z \L_{\b_k} (x_{k+1},z_k,y_k)),\nonumber\\
y_{k+1}& = y_k+\s_{k+1} (A(x_{k+1})+B(z_{k+1})).
\label{eq:ADMM updates recall}
\end{align}
For every $k\in K$, recall that the primal step sizes $\g_k,\i_k$ are determined by the line search in Lemma \ref{lem:eval Lipsc cte admm}, and the penalty weights and dual step sizes are set as
\begin{align}
\b_k & = \b_{k_0} \sqrt{\frac{k\log^2(k+1)}{k_0 \log^2(k_0+1)}}, \nonumber\\
\s_k & = \s_{k_0}\min\left( \sqrt{\frac{k_0}{k}} , \frac{\|A(x_{k_0})+B(z_{k_0})\|}{\|A(x_k)+B(z_k)\|} \cdot \frac{k_0 \log^2 (k_0+1)}{k \log^2 (k+1)} \right).
\end{align}
For every $k\in K$, let us set
\begin{align*}
G_k = G_{\b_k,\g_k}(x_k,z_k,y_k) = \frac{x_k-x_{k+1}}{\g_k},
\end{align*}
\begin{align}
H_k = H_{\b_k,\iota_k}(x_{k+1},z_k,y_k)= \frac{z_k-z_{k+1}}{\i_k},
\end{align}
for short. The convergence analysis differs only slightly from the proof of Theorem \ref{thm:main}, and we therefore focus on the differences. As in the proof of Theorem~\ref{thm:main}, two applications of Lemma~\ref{lem:11 admm} yield that
\begin{align}
\frac{\g_k \|G_k\|^2}{2} \le \L_{\b_k}(x_k,z_k,y_k)- \L_{\b_k}(x_{k+1},z_k,y_k), \nonumber\\
\frac{\i_k \|H_k\|^2}{2} \le \L_{\b_k}(x_{k+1},z_k,y_k)- \L_{\b_k}(x_{k+1},z_{k+1},y_k),
\label{eq:decrease twice}
\end{align}
for every $k$. By setting
\begin{align*}
u_k = [ \begin{array}{cc} x_k^\top & z_k^\top \end{array} ]^\top \in \RR^{2d},
\qquad
Q_k = [ \begin{array}{cc} G_k^\top & H_k^\top \end{array} ]^\top \in \RR^{2d},
\end{align*}
\begin{align}
q(u) = f(x)+g(x)+h(z)+l(z),
\qquad
\kappa_k = \min(\g_k,\iota_k),
\label{eq:defn of w}
\end{align}
for every $k\in K$, and summing up the two inequalities in \eqref{eq:decrease twice}, we reach
\begin{align}
\frac{\kappa_k \|Q_k\|^2}{2} & \le \L_{\b_k}(x_k,z_k,y_k) - \L_{\b_k}(x_{k+1},z_{k+1},y_k), \qquad \forall k\in K.
\end{align}
By following the same steps as in the proof of Theorem \ref{thm:main}, we find that
\begin{align}
\sum_{k=k_0}^{k_1} \frac{\kappa_k \|Q_k\|^2}{2} & \le \mu+ 2\sum_{k=k_0}^{k_1} \| A(x_k) + B(z_k) \|^2 ,
\end{align}
where
\begin{align}
\mu & := \sup_{k\ge k_0} \Big( q(u_{k_0}) - q(u_{k})+ \langle A(x_{k_0})+B(z_{k_0})-A(x_k)-B(z_k) ,y_{k_0}\rangle \nonumber\\
& \qquad \qquad + \frac{\beta_{k_0}}{2} \|A(x_{k_0})+B(z_{k_0})\|^2 \Big) < \infty.
\end{align}
On the other hand, the $x$ update in \eqref{eq:ADMM updates recall} implies that
\begin{align}
& G_k - \nabla f(x_k) - DA(x_k)^\top y_k \nonumber\\
& \qquad - \b_k DA(x_k)^\top (A(x_k) + B(z_k) ) \in \partial g(x_{k+1}).
\end{align}
For $\rho,\rho',\rho''>0$, suppose that
\begin{align}
\max_{k\in K} \|A(x_k)+B(z_k) \| \le \rho,
\quad
\max_{k\in K} \|x_k\| \le \rho',
\quad
\max_{k\in K} \|x_{k+1}-x_k\| \le \rho'',
\label{eq:feas to be checked later}
\end{align}
\begin{align}
\nu_A \ge 2\lambda_A \rho'',
\end{align}
where $\nu_A$ was defined in \eqref{eq:new slater defn}. Then, following the same line of argument, a similar version of Lemma~\ref{lem:bnd bnd Ak} holds and
\begin{align}
\|A(x_k)+B(z_k) \| \le \frac{2}{\nu_A\b_k} (\|G_k\|+ \lambda'_f+ \lambda'_A\|y_{k}\|).
\end{align}
If we assume that (\ref{eq:to be used later on},\ref{eq:suff cnd on rho}) hold, then the first bound in \eqref{eq:feas to be checked later} is in force and
\begin{align}
& \sum_{k=k_0}^{k_1} \left( \kappa_k \|Q_k\|^2 + \|A(x_k)+B(z_k)\|^2 \right) \nonumber\\
& \le \frac{24c \lambda'^2_f k_0 \log^2(k_0+1)}{\nu_A^2 \b^2_{k_0}} + \frac{24c\lambda'^2_A y_{\max}^2 k_0 \log^2(k_0+1)}{\nu_A^2 \b_{k_0}^2} + 5\mu.
\label{eq:raw bnd on sum admm}
\end{align}
To better interpret the above bound, we next estimate the primal step sizes $\{\g_k,\i_k\}$, which are determined by line search. By Lemma~\ref{lem:eval Lipsc cte admm}, $\g_k \ge \theta/\lambda_{\b_k,z}$ and $\i_k \ge \theta/\lambda_{\b_k,x}$, where the smoothness constants are bounded in Lemma~\ref{lem:smoothness admm}. Similar to the proof of Theorem \ref{thm:main}, we find that
\begin{align*}
\gamma_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_A \rho + d \lambda'^2_A \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\
& =: \overline{\g} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}},
\end{align*}
\begin{align}
\iota_k & \ge \frac{\theta }{ 2 \b_{k_0} \left( \sqrt{m}\lambda_B \rho + d \lambda'^2_B \right)}\sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}} \nonumber\\
& =: \overline{\iota} \sqrt{\frac{k_0 \log^2(k_0+1)}{k \log^2(k+1)}},
\label{eq:lower_band_iotas_admm}
\end{align}
for sufficiently large $k_0$. Substituting the above bounds in \eqref{eq:raw bnd on sum admm} and following the steps in the proof of Theorem \ref{thm:main} completes the proof of Theorem \ref{thm:main admm}.
diff --git a/tex/Paper/sections/appendix.tex b/tex/Paper/sections/appendix.tex
index bca5034..23f87e4 100644
--- a/tex/Paper/sections/appendix.tex
+++ b/tex/Paper/sections/appendix.tex
@@ -1,217 +1,217 @@
\section{Proof of Lemma \ref{lem:smoothness}\label{sec:proof of smoothness lemma}}
\textbf{We assume Hessian exists. We shouldn't assume that for a strictly correct proof!}
Note that
\begin{align}
\mathcal{L}_{\beta}(x,y) = f(x) + \sum_{i=1}^m y_i A_i (x) + \frac{\b}{2} \sum_{i=1}^m (A_i(x))^2,
\end{align}
which implies that
\begin{align}
\nabla_x \mathcal{L}_\beta(x,y) & = \nabla f(x) + \sum_{i=1}^m y_i \nabla A_i(x) + \b \sum_{i=1}^m A_i(x) \nabla A_i(x) \nonumber\\
& = \nabla f(x) + DA(x)^\top y + \b DA(x)^\top A(x),
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$.
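For illustration only (and not as part of the analysis), the following Python snippet checks the above expression for $\nabla_x \mathcal{L}_\beta$ against a central finite-difference approximation on a toy instance with an affine map $A$; every name in the snippet is a hypothetical placeholder introduced here, not something defined in the paper.
\begin{verbatim}
# Illustrative sketch only: a central finite-difference check of
#   grad_x L_beta(x, y) = grad f(x) + DA(x)^T y + beta * DA(x)^T A(x)
# on a toy instance with f(x) = 0.5*||x||^2 and affine A(x) = C x - b,
# so that DA(x) = C. All names here are hypothetical, not from the paper.
import numpy as np

d, m, beta = 5, 3, 10.0
rng = np.random.default_rng(0)
C, b = rng.standard_normal((m, d)), rng.standard_normal(m)
x, y = rng.standard_normal(d), rng.standard_normal(m)

f, grad_f = (lambda x: 0.5 * x @ x), (lambda x: x)
A, DA     = (lambda x: C @ x - b), (lambda x: C)

def aug_lagrangian(x, y):
    # L_beta(x, y) = f(x) + <A(x), y> + (beta/2) * ||A(x)||^2  (no g term)
    return f(x) + A(x) @ y + 0.5 * beta * A(x) @ A(x)

def grad_aug_lagrangian(x, y):
    return grad_f(x) + DA(x).T @ y + beta * DA(x).T @ A(x)

eps, g_fd = 1e-6, np.zeros(d)
for i in range(d):
    e = np.zeros(d); e[i] = eps
    g_fd[i] = (aug_lagrangian(x + e, y) - aug_lagrangian(x - e, y)) / (2 * eps)

assert np.allclose(g_fd, grad_aug_lagrangian(x, y), atol=1e-5)
\end{verbatim}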
By taking another derivative with respect to $x$, we reach
\begin{align}
\nabla^2_x \mathcal{L}_\beta(x,y) & = \nabla^2 f(x) + \sum_{i=1}^m \left( y_i + \b A_i(x) \right) \nabla^2 A_i(x) + \b \sum_{i=1}^m \nabla A_i(x) \nabla A_i(x)^\top.
\end{align}
It follows that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \| \nabla^2 f(x) \|+ \max_i \| \nabla^2 A_i(x)\| \left (\|y\|_1+\b \|A(x)\|_1 \right) + \beta\sum_{i=1}^m \|\nabla A_i(x)\|^2 \nonumber\\
& \le \lambda_f+ \sqrt{m} \lambda_A \left (\|y\|+\b \|A(x)\| \right) + \b \|DA(x)\|^2_F.
\end{align}
For every $x$ such that $\|A(x)\|\le \rho$ and $\|x\|\le \rho'$, we conclude that
\begin{align}
\|\nabla_x^2 \mathcal{L}_\beta(x,y)\| & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b \max_{\|x\|\le \rho'}\|DA(x)\|_F^2,
\end{align}
which completes the proof of Lemma \ref{lem:smoothness}.

\section{Proof of Lemma \ref{lem:11}\label{sec:proof of descent lemma}}
Throughout, let
\begin{align}
G = G_{\b,\g}(x,y) = \frac{x-x^+}{\g},
\end{align}
for short. Suppose that $\|A(x)\|\le \rho$, $\|x\|\le \rho'$, and similarly $\|A(x^+)\|\le \rho$, $\|x^+\|\le \rho'$. An application of Lemma \ref{lem:smoothness} yields that
\begin{align}
\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y)+ \langle x^+-x,\nabla_x \L_\b(x,y) \rangle + \frac{\lambda_\b}{2} \|x^+ - x\|^2 + g(x^+) \nonumber\\
& = \L_\b(x,y)-\g \langle G ,\nabla_x \L_\b(x,y) \rangle + \frac{\g^2 \lambda_\b }{2} \|G\|^2 + g(x^+).
\label{eq:descent pr 1}
\end{align}
Since $x^+ = P_g(x - \g \nabla_x \L_\b(x,y))$, we also have that
\begin{align}
G - \nabla_x \L_\b(x,y) = \xi \in \partial g(x^+).
\label{eq:opt of prox}
\end{align}
By combining (\ref{eq:descent pr 1},\ref{eq:opt of prox}), we find that
\begin{align}
\L_\b(x^+,y)+g(x^+) & \le \L_\b(x,y) -\g \|G\|^2 + \g \langle G, \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
& = \L_\b(x,y) -\g \|G\|^2 + \langle x- x^+ , \xi \rangle + \frac{\g^2 \lambda_\b}{2}\|G\|^2 + g(x^+) \nonumber\\
& \le \L_\b(x,y) + g(x) - \g\left( 1-\frac{\g\lambda_\b}{2}\right) \|G\|^2,
\end{align}
where the last line above uses the convexity of $g$. Recalling that $\g\le 1/\lambda_\b$ completes the proof of Lemma \ref{lem:11}.

\section{Proof of Lemma \ref{lem:eval Lipsc cte}\label{sec:proof of linesearch}}
Recalling $x^+_{\gamma}$ in \eqref{eq:defn of gamma line search}, we note that
\begin{equation}
x^+_{\gamma} - x +\gamma \nabla_x \mathcal{L}_\beta(x,y) = - \g\xi, \qquad \text{where } \xi \in \partial g (x^+_{\gamma}).
\label{eq:optimality of uplus}
\end{equation}
Lastly, by its definition in \eqref{eq:defn of gamma line search}, $\gamma$ satisfies
\begin{align}
& \mathcal{L}_{\beta}(x^+_{\gamma},y) + g(x_\g^+) \nonumber\\
& \le \mathcal{L}_\beta(x,y) + \left\langle x^+_{\gamma} - x , \nabla_x \mathcal{L}_\beta (x,y) \right\rangle + \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+) \nonumber\\
& = \mathcal{L}_\beta(x,y) + \left\langle x - x^+_{\gamma} ,\xi \right\rangle - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x_\g^+)\nonumber\\
& \le \mathcal{L}_\beta(x,y) - \frac{1}{2\gamma}\|x^+_{\gamma} - x\|^2 + g(x) - g(x^+_\g) \qquad (\text{convexity of } g) \nonumber\\
& = \mathcal{L}_\beta(x,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(x,y)\|^2 +g(x) - g(x^+_\g), \qquad \text{(see Definition \ref{def:grad map})}
\end{align}
which completes the proof of Lemma \ref{lem:eval Lipsc cte}.

\section{Proof of Lemma \ref{lem:bnd bnd Ak} \label{sec:proof of bnd Ak}}
Throughout, we assume that
\begin{align}
\max_{k\in K} \|A(x_k)\| \le \rho,
\qquad
\max_{k\in K} \| x_k\| \le \rho',
\qquad
\max_{k\in K} \| x_{k+1}-x_k \| \le \rho''.
\label{eq:bndness in slater proof}
\end{align}
From the $x$ update in \eqref{eq:update uk recall}, it follows that
\begin{align}
x_{k+1}-x_k + \g_k \nabla f(x_k) + \g_k DA(x_k)^\top (y_k + \b_k A(x_k) ) =- \xi \in - \partial g(x_{k+1}),
\end{align}
which, after recalling \eqref{eq:grad map recall}, can be written as
\begin{align}
-\frac{ G_k}{\b_k} + \frac{ \nabla f(x_k)}{\b_k} + \frac{ DA(x_k)^\top y_k }{\b_k} + DA(x_k)^\top A(x_k) \in - \frac{\partial g(x_{k+1})}{\b_k \g_k}.
\label{eq:to be projected}
\end{align}
Let
\begin{align}
\cone(x_{k+1}) = \text{cone}(\partial g(x_{k+1}) )^*
\end{align}
be the shorthand for the dual cone associated with
\begin{equation}
\cone(\partial g(x_{k+1})) = \bigcup_{\alpha \ge 0} \alpha \cdot \partial g(x_{k+1}) \subseteq \RR^d.
\label{eq:defn of dual cone}
\end{equation}
By projecting both sides of \eqref{eq:to be projected} onto $\cone(x_{k+1})$, we find that
\begin{align}
& P_{\cone(x_{k+1})}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} + DA(x_k)^\top A(x_k) \right) \nonumber\\
& \in P_{\cone(x_{k+1})}\left( - \frac{\partial g(x_{k+1}) }{\b_k \g_k} \right) = \{ 0\},
\label{eq:before Sk}
\end{align}
where the equality follows from the duality of $\cone(x_{k+1})$ and $\cone(\partial g(x_{k+1}))$, see \eqref{eq:defn of dual cone}. Consider a subspace $S_K\subseteq \RR^d$ such that
\begin{align}
S_{K} \supseteq \bigcup_{k\in K} P_{\cone(x_{k+1})}\left( DA(x_{k+1})^\top A(x_{k+1}) \right),
\label{eq:defn of S}
\end{align}
and project both sides of \eqref{eq:before Sk} onto $S_K$ to reach
\begin{align}
& P_{S_K}P_{\cone(x_{k+1})}\left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} + DA(x_k)^\top A(x_k) \right) = 0.
\label{eq:pre norm}
\end{align}
By taking the norm and then applying the triangle inequality, we argue that
\begin{align}
& \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_k)^\top A(x_k) ) \right\| \nonumber\\
& \le \left\| P_{S_K} P_{\cone(x_{k+1})} \left( - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} \right) \right\| \qquad \text{(see \eqref{eq:pre norm})}.
\label{eq:post norm}
\end{align}
Because the projections onto the cone $\cone(x_{k+1})$ and onto the subspace $S_K$ are non-expansive and $P_{S_K}P_{\cone(x_{k+1})}(0) = 0$, we may upper bound the last line above as
\begin{align}
& \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_k)^\top A(x_k) ) \right\| \nonumber\\
& \le \left\| - \frac{G_k}{\b_k} + \frac{\nabla f(x_k)}{\b_k} + \frac{DA(x_k)^\top y_k}{\b_k} \right\| \nonumber\\
& \le \frac{1}{\b_k} \left( \|G_k\| + \|\nabla f(x_k) \| + \|DA(x_k)^\top y_k\| \right) \qquad \text{(triangle inequality)} \nonumber\\
& \le \frac{1}{\b_k} \left( \|G_k\| +\lambda'_f+ \lambda'_A \|y_k\| \right) ,
\label{eq:slater proof 0}
\end{align}
where
\begin{align}
\lambda'_f := \max_{\|x\| \le \rho'} \| \nabla f(x)\|,
\qquad
\lambda'_A := \max_{\|x\| \le \rho'} \| DA(x)\|.
\end{align}
To lower bound the first line of \eqref{eq:slater proof 0}, we must impose a restricted injectivity condition, as follows. To simplify the notation, we also let $S_{K}$ denote a matrix whose orthonormal columns form a basis for the subspace $S_K$ in \eqref{eq:defn of S}. We then assume that $ \nu_A>0$, where
\begin{align}
\nu_A :=
\begin{cases}
\min_{v,x} \, \frac{\left\| S_{K}^\top P_{\cone(x_{k+1})} ( DA(x)^\top v) \right\|}{\|v\|} \\
\|v\|\le \rho\\
\|x\|\le \rho'.
\end{cases}
\label{eq:new slater proof}
\end{align}
Recalling the first bound in \eqref{eq:bndness in slater proof}, for every $k\in K $, we may therefore write that
\begin{align}
& \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k})^\top A(x_k)) \right\| \nonumber\\
& \ge \left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k+1})^\top A(x_{k})) \right\| - \left\| (DA(x_{k+1}) - DA(x_{k})) ^\top A(x_{k}) \right\| \nonumber\\
& \ge \nu_A \|A(x_k)\| - \| DA(x_{k+1})-DA(x_k) \| \|A(x_k) \|, \qquad \text{(see \eqref{eq:new slater proof})}
\label{eq:slater proof 1}
\end{align}
where the second line above again uses the non-expansiveness of $P_{S_K}$ and $P_{\cone(x_{k+1})}$. The remaining term in \eqref{eq:slater proof 1} is bounded as
\begin{align}
\| DA(x_{k+1})-DA(x_k) \| & \le \lambda_A \|x_{k+1}-x_k\| \le \lambda_A \rho''. \qquad \text{(see (\ref{eq:smoothness basic},\ref{eq:bndness in slater proof}))}
\end{align}
Assuming that
\begin{align}
\lambda_A \rho'' \le \nu_A/2
\end{align}
allows us to simplify the last line of \eqref{eq:slater proof 1} as
\begin{align}
\left\| P_{S_K} P_{\cone(x_{k+1})} ( DA(x_{k})^\top A(x_k)) \right\| \ge \frac{\nu_A }{2} \|A(x_k)\|,
\end{align}
which, after substituting in \eqref{eq:slater proof 0}, yields that
\begin{align}
\|A(x_k)\| & \le \frac{2}{\b_k\nu_A} \left( \|G_k\| + \lambda'_f + \lambda'_A \|y_k\| \right),
\end{align}
and completes the proof of Lemma \ref{lem:bnd bnd Ak}.
diff --git a/tex/Paper/sections/preliminaries.tex b/tex/Paper/sections/preliminaries.tex
index 7faa17c..983063d 100644
--- a/tex/Paper/sections/preliminaries.tex
+++ b/tex/Paper/sections/preliminaries.tex
@@ -1,101 +1,101 @@
\section{Preliminaries \label{sec:preliminaries}}
\paragraph{\textbf{\emph{Notation.}}}
We use the notations $\langle\cdot ,\cdot \rangle $ and $\|\cdot\|$ for the standard inner product and norm on $\RR^d$, respectively. For a convex function $g:\RR^d\rightarrow\RR$, the subdifferential at $x\in \RR^d$ is denoted by $\partial g(x)$ and the proximal operator $P_g:\RR^d\rightarrow\RR^d$ takes $x$ to
\begin{align}
P_g(x) = \underset{y}{\argmin} \, g(y) + \frac{1}{2}\|x-y\|^2.
\label{eq:defn of prox}
\end{align}
In addition, if $g=1_C$ is the indicator function of a convex set or cone, we use the simpler notation $P_C$ instead of $P_{1_C}$ to denote the orthogonal projection onto $C$.
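For illustration only, the following Python snippet evaluates the proximal operator \eqref{eq:defn of prox} in closed form for two common choices of $g$: the indicator of a Euclidean ball, for which $P_g$ reduces to the projection $P_C$, and a scaled $\ell_1$ norm, for which $P_g$ is entrywise soft-thresholding. The function names are hypothetical placeholders and are not defined in the paper.
\begin{verbatim}
# Illustrative sketch only: the proximal operator of eq:defn of prox,
#   P_g(x) = argmin_y  g(y) + 0.5 * ||x - y||^2,
# in closed form for two common choices of g. The function names below
# are hypothetical and are not defined in the paper.
import numpy as np

def prox_indicator_ball(x, radius=1.0):
    # g = 1_C with C the Euclidean ball of the given radius:
    # the prox reduces to the orthogonal projection P_C.
    nrm = np.linalg.norm(x)
    return x if nrm <= radius else (radius / nrm) * x

def prox_l1(x, lam=1.0):
    # g = lam * ||.||_1: the prox is entrywise soft-thresholding at lam.
    return np.sign(x) * np.maximum(np.abs(x) - lam, 0.0)

x = np.array([1.5, -0.2, 0.7])
print(prox_indicator_ball(x, radius=1.0))   # projection onto the unit ball
print(prox_l1(x, lam=0.5))                  # [1.0, 0.0, 0.2]
\end{verbatim}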
Throughout, $g^*$ and $\partial g^*$ will denote the Fenchel conjugate of $g$ and its subdifferential, respectively. For a cone $C$, we let $C^*$ denote its polar, namely,
\begin{align}
C^* = \{x: \langle x,x'\rangle \le 0,\,\, \forall x'\in C \}.
\end{align}
An integer interval is denoted by $[k_0:k_1]=\{k_0,\cdots,k_1\}$ for integers $k_0 \le k_1$. For matrices, $\|\cdot\|$ and $\|\cdot\|_F$ denote the spectral and Frobenius norms, respectively.
\paragraph{\textbf{\emph{{Necessary optimality conditions.}}} \label{sec:opt cnds}}
First-order necessary optimality conditions for Program \eqref{prob:01} are well studied in the literature \cite{rockafellar2009variational}. Indeed, $x$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y\in \RR^m$ for which
\begin{align}
\begin{cases}
-\nabla h(x) - DA(x)^\top y \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu1}
\end{align}
where $DA(x)$ is the Jacobian of $A$ at $x$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to
\begin{align}
\begin{cases}
-\nabla_x \mathcal{L}_\beta(x,y) \in \partial g(x)\\
A(x) = 0,
\end{cases}
\label{e:inclu2}
\end{align}
which is in turn the necessary optimality condition for Program \eqref{eq:minmax}.
\paragraph{\emph{\textbf{Technical lemmas.}}}
The proof of the following result is elementary but included in Appendix~\ref{sec:proof of smoothness lemma} for completeness.
\begin{lemma}[Smoothness]\label{lem:smoothness}
For fixed $y\in \RR^m$ and $\rho,\rho'\ge 0$, it holds that
\begin{align}
\| \nabla_x \mathcal{L}_{\beta}(x, y)- \nabla_x \mathcal{L}_{\beta}(x', y) \| \le \lambda_\b \|x-x'\|,
\end{align}
for every $x,x' \in \{ x'': \|A(x'')\|\le \rho, \|x''\|\le \rho'\}$, where
\begin{align}
\lambda_\beta & \le \lambda_f + \sqrt{m}\lambda_A \left(\|y\| + \b\rho \right) + \b d \lambda'^2_A.
\label{eq:smoothness of Lagrangian}
\end{align}
Above, $\lambda_f,\lambda_A$ were defined in (\ref{eq:smoothness basic}) and
\begin{align}
\lambda'_A := \max_{\|x\|\le \rho'}\|DA(x)\|.
\end{align}
\end{lemma}
The gradient mapping, defined below, plays an important role in our convergence analysis.
\begin{definition}\label{def:grad map}\textbf{{(Gradient Mapping)}}
Given $x\in \RR^d$ and $\gamma >0$, the gradient mapping $G_{\beta,\gamma}(\cdot; y): \RR^d\rightarrow\RR^d$ takes $x\in \RR^d$ to
\begin{equation}
G_{\b,\gamma}(x,y) = \frac{x-x^+}{\gamma},
\label{eq:grad map}
\end{equation}
where $x^+=P_{g}(x-\gamma \nabla_x \mathcal{L}_ {\beta}(x,y))$.
\end{definition}
In particular, if we remove $g$ from Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(x,y)=\nabla_x \mathcal{L}_\beta(x,y)$. For a sufficiently small step size $\g$, the gradient mapping determines the corresponding descent in the objective function of Program \eqref{eq:minmax}. The following result is standard \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}, but the proof is given in Appendix \ref{sec:proof of descent lemma} for completeness.
\begin{lemma}[Descent lemma]\label{lem:11}
For $x\in \RR^d$ and $y\in \RR^m$, let $x^+ = P_g(x - \g \nabla_x \L_\b(x,y) )$, where $\g< 1/\lambda_\b$. For $\rho,\rho'\ge 0$, suppose that
\begin{align}
x,x^+ \in \{ x': \|A(x')\| \le \rho, \|x'\| \le \rho' \}.
\end{align}
Then it holds that
\begin{equation}
\label{e:deslem}
\| G_{\beta,\gamma}(x,y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(x,y) + g (x)- \mathcal{L}_\beta(x^+,y)- g(x^+) ).
\end{equation}
\end{lemma}
In practice, determining the step size $\gamma$ by computing the right-hand side of \eqref{eq:smoothness of Lagrangian} might be intractable. Instead, we often resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of linesearch} for completeness.
\begin{lemma}[Line search] \label{lem:eval Lipsc cte}
Fix $\theta \in (0,1)$ and ${\gamma}_0>0$. For $\gamma'>0$, let
\begin{align}
x^+_{\gamma'} = P_g(x - \gamma' \nabla_x \mathcal{L}_\beta(x,y)),
\label{eq:defn of x+gamma}
\end{align}
and define
\begin{align}
\gamma & := \max \Big\{ \gamma' ={\gamma}_0 \theta^i : \mathcal{L}_\beta (x^+_{\gamma'},y) \nonumber\\
& \qquad \qquad \quad \le \mathcal{L}_\beta (x,y)
+\left\langle x^+_{\gamma'} - x , \nabla_x \mathcal{L}_{\beta}(x,y) \right\rangle
+ \frac{1}{2\gamma'} \| x^+_{\gamma'} - x\|^2 \Big\}.
\label{eq:defn of gamma line search}
\end{align}
Then, (\ref{e:deslem}) holds and, moreover, we have that
\begin{align}
\gamma \ge \frac{\theta}{\lambda_\beta}.
\label{eq:moreover}
\end{align}
\end{lemma}
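For illustration only, the following Python sketch implements the backtracking line search of Lemma~\ref{lem:eval Lipsc cte} in the special case $g=1_C$, so that $P_g$ reduces to the projection onto $C$. The callables passed as arguments are hypothetical placeholders to be supplied by the user and are not defined in the paper; the sketch is not part of the paper's analysis.
\begin{verbatim}
# Illustrative sketch only (not the paper's implementation): backtracking
# line search of the Line search lemma for g = 1_C, so that P_g is the
# projection onto C. The callables lagrangian(x, y), grad_lagrangian(x, y)
# and project_C(v) are hypothetical placeholders supplied by the user.
import numpy as np

def line_search_step(x, y, lagrangian, grad_lagrangian, project_C,
                     gamma0=1.0, theta=0.5, max_backtracks=50):
    """Return (x_plus, gamma), where gamma = gamma0 * theta**i is the first
    tested step size that satisfies the sufficient-decrease condition."""
    grad_x = grad_lagrangian(x, y)
    L_x = lagrangian(x, y)
    gamma = gamma0
    for _ in range(max_backtracks):
        x_plus = project_C(x - gamma * grad_x)   # x^+_{gamma'} = P_C(x - gamma' grad)
        diff = x_plus - x
        # L(x^+, y) <= L(x, y) + <x^+ - x, grad> + ||x^+ - x||^2 / (2 gamma)
        if lagrangian(x_plus, y) <= L_x + diff @ grad_x + diff @ diff / (2.0 * gamma):
            return x_plus, gamma
        gamma *= theta                           # backtrack
    return x_plus, gamma

# Toy usage with f(x) = 0.5||x||^2, A(x) = x - 1, beta = 10, C = unit ball:
beta = 10.0
lag  = lambda x, y: 0.5 * x @ x + (x - 1) @ y + 0.5 * beta * (x - 1) @ (x - 1)
grad = lambda x, y: x + y + beta * (x - 1)
proj = lambda v: v if np.linalg.norm(v) <= 1.0 else v / np.linalg.norm(v)
x_plus, gamma = line_search_step(np.zeros(3), np.ones(3), lag, grad, proj)
\end{verbatim}
The returned step size $\g$ then defines the gradient mapping $G_{\b,\g}(x,y)=(x-x^+)/\g$ of Definition~\ref{def:grad map}.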