diff --git a/Escaping saddle points.tex b/Escaping saddle points.tex index f2168c4..211bbcf 100644 --- a/Escaping saddle points.tex +++ b/Escaping saddle points.tex @@ -1,291 +1,285 @@ \documentclass[english]{article} \usepackage[utf8]{inputenc} \usepackage{amsthm} \usepackage{amsmath} \usepackage{amssymb} \usepackage{hyperref} \hypersetup{ colorlinks=true,% citecolor=black,% filecolor=black,% linkcolor=black,% urlcolor=blue } \makeatletter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass uspecific LaTeX commands. \theoremstyle{plain} \newtheorem{thm}{\protect\theoremname} \theoremstyle{plain} \newtheorem{lem}[thm]{\protect\lemmaname} \theoremstyle{plain} \newtheorem{corr}[thm]{\protect\corrolaryname} %\theoremstyle{definition} \theoremstyle{plain} \newtheorem{defn}[thm]{\protect\definitionname} \theoremstyle{plain} \newtheorem{prop}[thm]{\protect\propositionname} \theoremstyle{definition} \newtheorem{example}[thm]{\protect\examplename} \newtheorem*{remark}{Remark} \theoremstyle{plain} \newtheorem{conditions}[thm]{\protect\conditionsname} \makeatother \usepackage{babel} \providecommand{\definitionname}{Definition} \providecommand{\examplename}{Example} \providecommand{\lemmaname}{Lemma} \providecommand{\corrolaryname}{Corollary} \providecommand{\propositionname}{Proposition} \providecommand{\conditionsname}{Conditions} \providecommand{\theoremname}{Theorem} \usepackage{geometry} \geometry{ a4paper, left=20mm, %lmargin = 0mm, %inner = 0mm, right = 20mm, %rmargin = 0mm, %outer = 0mm, top=15mm, %tmargin = 0mm, bottom = 15mm, %bmargin = 0mm, } \def \l{\left} \def \r{\right} \def \R{\mathbb{R}} \def \E{\mathbb{E}} \def \wt{\widetilde} \def \D{\mathcal{D}} \def \wh{\widehat} \def \ol{\overline} \def \m{(m)} \def \t {\theta} \def \b {\beta} \def \a {\alpha} \def \g{\gamma} \def \L{\mathcal{L}} \def \d{\delta} \def \S{\mathcal{S}} \title{Escaping From Saddle Points} \begin{document} \maketitle Consider the optimization program \begin{align} \begin{cases} \min h(u)\\ L(u) = 0\\ u\in C, \end{cases} \label{eq:main} \end{align} where $h$ is a non-convex but differentiable function, $L$ is a differentiable map, and $C$ is a convex set. Program \eqref{eq:main} is equivalent to \begin{align} & \min_{u\in C} \max_y h(u)+\langle L(u),y \rangle+\frac{\|L(u)\|_2^2}{2\b^*} \nonumber\\ & =: \min_{u\in C} \max_y \L(u,y,\b^*), \end{align} where $1_C$ is the convex indicator function on $C$ and $\b^*>0$. Above, $\L$ is the augmented Lagrangian of Program \eqref{eq:main}. Therefore, $(u^*,y^*)$ is a stationary point of Program \eqref{eq:main} if \begin{align} \begin{cases} 0\in \nabla_u \L(u^*,y^*,\b^*) + N_C(u^*) \\ L(u^*) = 0, \end{cases} +\label{eq:opt cnd} \end{align} where $DL(u^*)$ is the Jacobian of $L$ and $N_C(u^*)$ is the normal cone of $C$, both at $u^*$. Note that $u^*$ is a local minimum if \begin{equation} \delta_u^T \nabla_{uu} \L(u^*,y^*,\b^*) \d_u \ge 0, \qquad \forall \delta_u \in T_C(u^*), \end{equation} -where $T_C(u^*)$ is the tangent cone of $C$ at $u^*$. Alternatively, if the above second form takes both positive and negative on $T_C(u^*)$, then $u^*$ is a strict saddle point. +where $T_C(u^*)$ is the tangent cone of $C$ at $u^*$. Alternatively, if the above second form takes both positive and negative values on $T_C(u^*)$, then $u^*$ is a strict saddle point. 
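For a quick illustration (a toy instance, included only as a sanity check), take $h(u) = u_1^2 - u_2^2$, $L(u) = u_1$, and $C = \R^2$. At $u^* = 0$ and $y^* = 0$, both conditions in \eqref{eq:opt cnd} hold, and
\begin{equation*}
\nabla_{uu} \L(u^*,y^*,\b^*) =
\l[
\begin{array}{cc}
2 + \frac{1}{\b^*} & 0 \\
0 & -2
\end{array}
\r],
\end{equation*}
whose quadratic form is positive along $\delta_u = (1,0)$ and negative along $\delta_u = (0,1)$. That is, $u^*$ is a strict saddle point of Program \eqref{eq:main} rather than a local minimum.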
%Note that $u$ is a stationary point of Program \eqref{eq:main} if %\begin{align} %\begin{cases} %\delta\in T_C(u) \\ %\nabla h(u)^T \delta = 0\\ %L(u)=0 %\end{cases} %\Longrightarrow %DL(u)\delta = 0, %\end{align} %where $T_C(u)$ is the tangent cone to $C$ at $u$. Equivalently, $u$ is a stationary point of Program \eqref{eq:main} if To solve this program, Bang et al. suggested the first-order algorithm \begin{align*} u_{k+1} = P_C(u_k - \g_k \nabla_u \L(u_k,y_k,\b_k) ), \end{align*} \begin{align*} y_{k+1} = y_k + \frac{L(u_{k+1})}{2\b_k}, \end{align*} \begin{equation} \beta_{k+1} = b(u_k,y_k,\b_k,\g_k), \qquad \g_{k+1} = c(u_k,y_k,\b_k,\g_k), \label{eq:main alg} \end{equation} for positive-valued $b,c$ specified in their write-up. {\color{blue} Is the last claim correct?} -A limit point $z^* = (u^*,y^*,\b^*,\g^*)$ of Algorithm \eqref{eq:main alg} satisfies -\begin{align} -\begin{cases} -0\in \nabla \L(u^*,y^*,\b^*) + N_C(u^*),\\ -L(u^*) = 0. -\end{cases} -\label{eq:opt cnd} -\end{align} +It is easy to verify that a limit point $z^* = (u^*,y^*,\b^*,\g^*)$ of Algorithm \eqref{eq:main alg} satisfies \eqref{eq:opt cnd}. Consequently, any limit point of Algorithm \eqref{eq:main alg} is a stationary point of Program \eqref{eq:main} and vice versa. However, without limiting the choice of $h$ (to convex functions, for example), we cannot ensure that Algorithm \eqref{eq:main alg} finds a local minimum of Program \eqref{eq:main}. Our goal is to show that, almost surely, Algorithm \eqref{eq:main alg} avoids (strict) saddle points of Program \eqref{eq:main}. For example, suppose that Program \eqref{eq:main} does not have any non-strict saddle points. Under this assumption, if Algorithm \eqref{eq:main alg} converges, then the limit point is a local minimum, almost surely. %Then we may write the above algorithm as a dynamical system that satisfies %\begin{align} %z_{k+1}:= %\l[ %\begin{array}{c} %u_{k+1}\\ %y_{k+1}\\ %\b_{k+1}\\ %\g_{k+1} %\end{array} %\r] %& = %\l[ %\begin{array}{c} %P_C (u_k- \g_k\nabla_u \L_k(u_k,y_k)\\ %y_k+ \frac{L(u_{k+1})}{2\b_k}\\ %b(u_k,y_k,\b_k,\g_k)\\ %c(u_k,y_k,\b_k,\g_k) %\end{array} %\r]. %\end{align} \section{Analysis} Let us use the shorthand $z=(u,y,\b,\g)$ and consider the dynamical system specified by \begin{align} z_{k+1} & := \l[ \begin{array}{c} P_C (u_k- \g_k\nabla_u \L(z_k))\\ y_k+ \frac{L(u_{k})}{2\b_k}\\ b(z_k)\\ c(z_k) \end{array} \r] =: O \l(z_k \r) . \label{eq:sys} \end{align} It is easy to verify that any fixed point of System \eqref{eq:sys} is a limit point of Algorithm \eqref{eq:main alg}. Moreover, when close enough to a stationary point (equivalently, fixed point) $z^*$, the future iterates of Algorithm \eqref{eq:main alg} are attracted to (repelled from) $z^*$ if and only if the future iterates of System \eqref{eq:sys} are attracted to (repelled from) $z^*$. {\color{blue} Haven't verified the last claim.} %with the same starting point, the sequence produced by System \eqref{eq:sys} converges to a fixed point $z^*$ if and only if the sequence produced by Algorithm \eqref{eq:main alg} converges to $z^*$. It therefore suffices to study System \eqref{eq:sys} in what follows. \begin{lem} $O$ is an isomorphism in a neighborhood of $z^*$. That is, $O$ is continuous and has a continuous inverse. {\color{blue} I don't know if this claim is correct... For it to be correct, it might require $C$ to have bounded \emph{reach}.
} \end{lem} Consider a fixed point $z^*$ of System \eqref{eq:sys} and a point $z=z^*+\d_z$ in its vicinity, where $\d_z=(\d_u,\d_y,\d_\b,\d_\g)$ and $\d_u\in T_C(u^*)$. We may then write that \begin{align} O(z^*+\d_z) - O(z^*) & = O(z^*+\d_z) - z^* \nonumber\\ & = \l[ \begin{array}{c} P_C\l(u^*+\d_u- (\g^*+\d_\g) \nabla_u \L(z^*+\d_z ) \r)- u^*\\ \frac{D L(u^*) \d_u }{2\b^*} + \d_y - \frac{L(u^*) \d_\b}{2(\b^*)^2 } \\ \langle \nabla b(z^*), \d_z \rangle\\ \langle \nabla c(z^*) ,\d_z \rangle \end{array} \r]. \label{eq:what happens} \end{align} To simplify the first entry of the vector above, let $P_{T_C(u^*)}$ be the orthogonal projection onto the tangent cone of $C$ at $u^*$, and note that {\color{blue} Is the first identity below correct?} \begin{align} & P_C\l(u^*+\d_u- (\g^*+\d_\g) \nabla_u \L(z^*+\d_z )\r)- u^*\nonumber\\ & = P_{T_C(u^*)} (\d_u- (\g^*+\d_\g) \nabla_u \L(z^*+\d_z) ) \nonumber\\ & = \delta_u - (\g^*+\d_\g) P_{T_C(u^*)} ( \nabla_u \L(z^*+\d_z) ) \qquad \l( \delta_u \in T_C(u^*) \r) \nonumber\\ & = \delta_u - \g^* P_{T_C(u^*)} \nabla_{u} \L(z^*) - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) \d_u - \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) \d_y - \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) \d_\b \nonumber\\ & \qquad - \delta_\g P_{T_C(u^*)} \nabla_u \L(z^*) \nonumber\\ & = \delta_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) \d_u - \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) \d_y - \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) \d_\b \qquad \text{(see \eqref{eq:opt cnd})} \nonumber\\ & = \l[ \begin{array}{ccc} I_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) & - \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) & - \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) \end{array} \r] \cdot \l[ \begin{array}{c} \d_u \\ \d_y \\ \d_\b\\ \end{array} \r], \end{align} where the fourth identity is a first-order expansion around $z^*$. Substituting the above identity back into \eqref{eq:what happens}, we find that \begin{align} O(z^*+\d_z) - O(z^*) & = \l[ \begin{array}{cccc} I_u - \g^* P_{T_C(u^*)} \nabla_{u,u} \L(z^*) & - \g^* P_{T_C(u^*)} \nabla_{u,y} \L(z^*) & - \g^* P_{T_C(u^*)} \nabla_{u,\b} \L(z^*) & 0 \\ \frac{DL(u^*)}{2\b^*} & I_y& -\frac{L(u^*)}{2(\b^*)^2} & 0\\ \nabla_u b(z^*)^T & \nabla_y b(z^*)^T & \nabla_\b b(z^*)^T & \nabla_\g b(z^*)^T\\ \nabla_u c(z^*)^T & \nabla_y c(z^*)^T & \nabla_\b c(z^*)^T & \nabla_\g c(z^*)^T\\ \end{array} \r] \cdot \d_z \nonumber\\ & = DO(z^*) \cdot \d_z. \end{align} If $u^*$ is a strict saddle point of Program \eqref{eq:main}, then there exist $\delta_u,\delta_u'\in T_C(u^*)$ such that \begin{equation} \delta_u^T \nabla_{uu} \L(z^*) \delta_u >0, \qquad \delta'^T_u\nabla_{uu} \L(z^*) \delta_u' <0. \end{equation} Therefore, $DO(z^*)$ has an eigenvalue outside of the unit circle as long as \begin{equation} \g^* \le \frac{1}{\|P_{T_C(u^*)} \nabla_{uu}\L(z^*) P_{T_C(u^*)}\|}. \end{equation} Let us now recall the stable manifold theorem, e.g., Theorem 3 in ``Behavior of accelerated gradient methods near critical points of non-convex problems''. \begin{thm} \textbf{\emph{(Stable manifold theorem)}} Suppose that $z^*$ is a fixed point of $O$. Suppose also that $O$ is itself a diffeomorphism in a neighborhood of $z^*$. Suppose lastly that $DO(z^*)$ has an eigenvalue outside of the unit circle. Then the local stable manifold $\S_{z^*} \ni z^*$ has measure zero. In particular, there exists a neighborhood $B\ni z^*$ such that, if $O^k(z)\in B$ for all $k$, then $z\in \S_{z^*}$. {\color{blue} I think we need a version of this theorem that holds for an isomorphism, namely, just continuous with a continuous inverse.
Can we find one in the book ``Global stability of dynamical systems''? I have the book, if needed. } \end{thm} Consider the sequence $\{z_k\}$ generated by System \eqref{eq:sys}. Suppose that $\{z_k\}$ converges to $z^*$. Then, for sufficiently large $k$, $z_k$ belongs to the stable manifold at $z^*$. That is, $z_0\in O^{-k}(\S_{z^*})$. Since $\S_{z^*}$ has measure zero, so does $O^{-k}(\S_{z_k})$. That is, almost surely, System \eqref{eq:sys} (or equivalently Algorithm \eqref{eq:main alg}) does not converge to the strict saddle point $z^*$. \end{document} \ No newline at end of file diff --git a/bibliography.bib b/bibliography.bib index 2763c71..9eab014 100644 --- a/bibliography.bib +++ b/bibliography.bib @@ -1,491 +1,508 @@ %% This BibTeX bibliography file was created using BibDesk. %% http://bibdesk.sourceforge.net/ %% Created for Alp Yurtsever at 2018-02-08 22:09:35 +0100 %% Saved with string encoding Unicode (UTF-8) @article{Jester1996, Author = {Hamilton-Jester, C. L. and Li, C.-K.}, Date-Added = {2018-02-08 21:07:43 +0000}, Date-Modified = {2018-02-08 21:09:35 +0000}, Journal = {Rocky Mountain J. Math.}, Number = {4}, Pages = {1371--1383}, Title = {Extreme vectors of doubly nonnegative matrices}, Volume = {26}, Year = {1996}} @article{Liu2013, Author = {Liu, J. and Musialski, P. and Wonka, P. and Ye, J.}, Date-Added = {2018-02-08 18:27:25 +0000}, Date-Modified = {2018-02-08 18:28:52 +0000}, Journal = {{IEEE} Trans. Pattern Anal. Mach. Intell}, Number = {1}, Pages = {208--230}, Title = {Tensor completion for estimating missing values in visual data}, Volume = {35}, Year = {2013}} @article{Zeng2018, Author = {Zeng, W.-J. and So, H. C.}, Date-Added = {2018-02-08 18:24:12 +0000}, Date-Modified = {2018-02-08 18:26:12 +0000}, Journal = {{IEEE} Trans. on Sig. Process}, Number = {5}, Pages = {1125--1140}, Title = {Outlier--Robust Matrix Completion via $\ell_p$-Minimization}, Volume = {66}, Year = {2018}} @unpublished{Xu2017, Author = {Xu, H.-K.}, Date-Added = {2018-02-07 09:45:36 +0000}, Date-Modified = {2018-02-07 09:46:20 +0000}, Note = {arXiv:1710.07367v1}, Title = {Convergence Analysis of the {F}rank--{W}olfe Algorithm and Its Generalization in {B}anach Spaces}, Year = {2017}} @conference{Dunner2016, Author = {D\"unner, C. and Forte, S. and Tak\'ac, M. and Jaggi, M.}, Booktitle = {Proc. $33$rd Int. Conf. Machine Learning}, Date-Added = {2018-02-07 01:03:32 +0000}, Date-Modified = {2018-02-07 01:04:42 +0000}, Title = {Primal--Dual Rates and Certificates}, Year = {2016}} @unpublished{Lan2014, Author = {G. Lan}, Date-Added = {2018-02-06 17:11:13 +0000}, Date-Modified = {2018-02-06 17:11:13 +0000}, Note = {arXiv:1309.5550v2}, Title = {The Complexity of Large--Scale Convex Programming under a Linear Optimization Oracle}, Year = {2014}} @article{Yang2015, Author = {Yang, L. and Sun, D. and Toh, K-.C.}, Date-Added = {2018-01-16 21:00:53 +0000}, Date-Modified = {2018-01-16 21:04:16 +0000}, Journal = {Mathematical Programming Computation}, Number = {3}, Pages = {331-366}, Title = {{SDPNAL}+: A majorized semismooth {N}ewton--{CG} augmented {L}agrangian method for semidefinite programming with nonnegative constraints}, Volume = {7}, Year = {2015}} @webpage{MNIST, Author = {LeCun, Y. and Cortes, C.}, Date-Added = {2018-01-16 20:36:46 +0000}, Date-Modified = {2018-01-16 20:44:36 +0000}, Month = {Accessed: Jan. 2016}, Title = {{MNIST} handwritten digit database}, Url = {http://yann.lecun.com/exdb/mnist/}, Bdsk-Url-1 = {http://yann.lecun.com/exdb/mnist/}} @article{Mixon2017, Author = {Mixon, D. G. 
and Villar, S. and Ward, R.}, Date-Added = {2018-01-16 20:34:35 +0000}, Date-Modified = {2018-01-16 20:35:59 +0000}, Journal = {Information and Inference: A Journal of the IMA}, Number = {4}, Pages = {389--415}, Title = {Clustering subgaussian mixtures by semidefinite programming}, Volume = {6}, Year = {2017}} @book{VonNeumann1944, Author = {Von Neumann, J. and Morgenstern, O.}, Date-Added = {2018-01-13 18:19:54 +0000}, Date-Modified = {2018-01-13 18:21:16 +0000}, Publisher = {Princeton press}, Title = {Theory of games and economic behavior}, Year = {1944}} @article{Taskar2006, Author = {Taskar, B. and Lacoste-Julien, S. and Jordan, M.I.}, Date-Added = {2018-01-13 18:17:27 +0000}, Date-Modified = {2018-01-13 18:18:54 +0000}, Journal = {Journal of Machine Learning Research}, Title = {Structured prediction, dual extragradient and {B}regman projections}, Year = {2006}} @conference{richard2012estimation, Author = {Richard, E. and Savalle, P.-A. and Vayatis, N.}, Booktitle = {Proceedings of the $29$th International Conference on Machine Learning}, Date-Modified = {2018-01-13 08:25:26 +0000}, Title = {Estimation of simultaneously sparse and low rank matrices}, Year = {2012}} @conference{yen2016convex, Author = {Yen, I. E.-H. and Lin, X. and Zhang, J. and Ravikumar, P. and Dhillon, I. S.}, Booktitle = {Proceedings of the $33$rd International Conference on Machine Learning}, Date-Modified = {2018-01-16 20:33:23 +0000}, Title = {A convex atomic--norm approach to multiple sequence alignment and motif discovery}, Year = {2016}} @conference{JulienLacoste2015, Author = {Lacoste-Julien, S. and Jaggi, M.}, Booktitle = {Advances in Neural Information Processing Systems 28}, Date-Added = {2018-01-01 05:03:16 +0000}, Date-Modified = {2018-01-01 05:13:35 +0000}, Title = {On the Global Linear Convergence of {F}rank-{W}olfe Optimization Variants}, Year = {2015}} @article{Guelat1986, Author = {Gu\'elat, J. and Marcotte, P.}, Date-Added = {2018-01-01 05:00:46 +0000}, Date-Modified = {2018-01-01 05:01:56 +0000}, Journal = {Math. Program.}, Number = {1}, Pages = {110--119}, Title = {Some Comments on {W}olfe's `Away Step'}, Volume = {35}, Year = {1986}} @article{Beck2004, Author = {Beck, A.}, Date-Added = {2018-01-01 04:58:40 +0000}, Date-Modified = {2018-01-01 04:59:53 +0000}, Journal = {Mathematical Methods of Operations Research}, Number = {2}, Pages = {235--247}, Title = {A Conditional Gradient Method with Linear Rate of Convergence for Solving Convex Linear Systems}, Volume = {59}, Year = {2004}} @conference{Garber2015, Author = {Garber, D. and Hazan, E.}, Booktitle = {Proceedings of the $32$nd International Conference on Machine Learning}, Date-Added = {2018-01-01 04:56:22 +0000}, Date-Modified = {2018-01-01 04:57:29 +0000}, Title = {Faster Rates for the {F}rank-{W}olfe Method over Strongly-Convex Sets}, Year = {2015}} @conference{Alacaoglu2017, Author = {Alacaoglu, A. and Tran--Dinh, Q. and Fercoq, O. and Cevher, V.}, Booktitle = {Advances in Neural Information Processing Systems 30}, Date-Added = {2017-12-30 16:03:02 +0000}, Date-Modified = {2017-12-30 16:04:19 +0000}, Title = {Smooth Primal-Dual Coordinate Descent Algorithms for Nonsmooth Convex Optimization}, Year = {2017}} @article{Nesterov2005, Author = {Nesterov, Y.}, Date-Added = {2017-12-30 12:02:55 +0000}, Date-Modified = {2017-12-30 12:04:01 +0000}, Journal = {Math. Program.}, Pages = {127--152}, Title = {Smooth Minimization of Non-smooth Functions}, Volume = {103}, Year = {2005}} @conference{Odor2016, Author = {Odor, G. and Li, Y.-H. 
and Yurtsever, A. and Hsieh, Y.-P. and Tran--Dinh, Q. and El~Halabi, M. and Cevher, V.}, Booktitle = {$41$st IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, Date-Added = {2017-12-30 02:13:12 +0000}, Date-Modified = {2017-12-30 02:16:37 +0000}, Title = {{F}rank-{W}olfe Works for Non-Lipschitz Continuous Gradient Objectives: Scalable Poisson Phase Retrieval}, Year = {2016}} @article{Nesterov2017, Author = {Nesterov, Y.}, Date-Added = {2017-12-30 02:09:57 +0000}, Date-Modified = {2017-12-30 02:11:15 +0000}, Journal = {Math. Program.}, Title = {Complexity Bounds for Primal-Dual Methods Minimizing the Model of Objective Function}, Year = {2017}} @conference{Gidel2017, Author = {Gidel, G. and Jebara, T. and Lacoste-Julien, S.}, Booktitle = {Proceedings of the $20$th International Conference on Artificial Intelligence and Statistics (AISTATS)}, Date-Added = {2017-12-30 02:00:44 +0000}, Date-Modified = {2017-12-30 02:02:14 +0000}, Title = {{F}rank-{Wolfe} Algorithms for Saddle Point Problems}, Year = {2017}} @conference{Hazan2008, Author = {Hazan, E.}, Booktitle = {LATIN'08 Proceedings of the 8th Latin American conference on Theoretical informatics}, Date-Added = {2017-12-30 00:44:16 +0000}, Date-Modified = {2017-12-30 00:45:27 +0000}, Pages = {306--316}, Title = {Sparse Approximate Solutions to Semidefinite Programs}, Year = {2008}} @article{Clarkson2010, Author = {Clarkson, K. L.}, Date-Added = {2017-12-30 00:41:25 +0000}, Date-Modified = {2017-12-30 00:42:16 +0000}, Journal = {ACM Transactions on Algorithms (TALG)}, Number = {4}, Title = {Coresets, Sparse Greedy Approximation, and the {F}rank-{W}olfe Algorithm}, Volume = {6}, Year = {2010}} @article{Nesterov1983, Author = {Nesterov, Y.}, Date-Added = {2017-12-30 00:34:10 +0000}, Date-Modified = {2017-12-30 00:35:19 +0000}, Journal = {Soviet Mathematics Doklady}, Number = {2}, Pages = {372--376}, Title = {A Method of Solving a Convex Programming Problem with Convergence Rate {$O( 1 / k^2 )$}}, Volume = {27}, Year = {1987}} @conference{Hazan2016, Author = {Hazan, E. and Luo, H.}, Booktitle = {Proceedings of the $33$rd International Conference on Machine Learning}, Date-Added = {2017-12-30 00:28:22 +0000}, Date-Modified = {2017-12-30 00:29:50 +0000}, Title = {Variance-Reduced and Projection-Free Stochastic Optimization}, Year = {2016}} @article{Dunn1980, Author = {Dunn, J.}, Date-Added = {2017-12-29 15:08:44 +0000}, Date-Modified = {2017-12-29 15:09:31 +0000}, Journal = {SIAM J. Control Optim.}, Number = {5}, Pages = {473--487}, Title = {Convergence Rates for Conditional Gradient Sequences Generated by Implicit Step Length Rules}, Volume = {18}, Year = {1980}} @article{Dunn1979, Author = {Dunn, J.}, Date-Added = {2017-12-29 15:06:12 +0000}, Date-Modified = {2017-12-29 15:07:48 +0000}, Journal = {SIAM J. Control Optim.}, Number = {2}, Pages = {187--211}, Title = {Rates of Convergence for Conditional Gradient Algorithms Near Singular and Nonsinglular Extremals}, Volume = {17}, Year = {1979}} @article{Dunn1978, Author = {Dunn, J. and Harshbarger, S.}, Date-Added = {2017-12-29 15:03:16 +0000}, Date-Modified = {2017-12-29 15:06:28 +0000}, Journal = {Journal of Mathematical Analysis and Applications}, Number = {2}, Pages = {432--444}, Title = {Conditional Gradient Algorithms with Open Loop Step Size Rules}, Volume = {62}, Year = {1978}} @article{Levitin1966, Author = {Levitin, E. 
and Polyak, B.}, Date-Added = {2017-12-29 15:01:05 +0000}, Date-Modified = {2017-12-29 15:02:00 +0000}, Journal = {USSR Computational Mathematics and Mathematical Physics}, Number = {5}, Pages = {1--50}, Title = {Constrained Minimization Methods}, Volume = {6}, Year = {1966}} @article{FrankWolfe1956, Author = {Frank, M. and Wolfe, P.}, Date-Added = {2017-12-29 14:31:07 +0000}, Date-Modified = {2017-12-29 14:33:45 +0000}, Journal = {Naval Research Logistics Quarterly}, Pages = {95--110}, Title = {An Algorithm for Quadratic Programming}, Volume = {3}, Year = {1956}} @article{Zhao1998qap, Author = {Qing Zhao and Stefan E. Karisch and Franz Rendl and Henry Wolkowicz}, Date-Added = {2017-11-03 22:18:29 +0000}, Date-Modified = {2017-11-03 22:18:29 +0000}, Journal = {Journal of Combinatorial Optimization}, Number = {1}, Pages = {71--109}, Title = {Semidefinite programming relaxations for the quadratic assignment problem}, Volume = {2}, Year = {1998}} @article{Loiola2007, Author = {Loiola, E.M. and de Abreu, N.M.M. and Boaventura-Netto, P.O. and Hahn, P. and Querido, T.}, Date-Added = {2017-11-03 22:18:07 +0000}, Date-Modified = {2017-11-03 22:24:01 +0000}, Journal = {European Journal of Operational Research}, Number = {2}, Pages = {657--690}, Title = {A survey for the quadratic assignment problem}, Volume = {176}, Year = {2007}} @article{Garber2016, Author = {Garber, D. and Hazan, E.}, Date-Added = {2017-11-03 21:46:13 +0000}, Date-Modified = {2017-11-03 21:47:48 +0000}, Journal = {Mathematical Programming, Ser. A}, Number = {158}, Pages = {329--361}, Title = {Sublinear time algorithms for approximate semidefinite programming}, Year = {2016}} @conference{Yoshise2010, Address = {Yokohama}, Author = {Yoshise, A. and Matsukawa, Y.}, Booktitle = {{IEEE} {I}nternational Symposium on Computer-Aided Control System Design}, Date-Added = {2017-11-03 15:38:24 +0000}, Date-Modified = {2017-11-03 15:39:47 +0000}, Pages = {13--18}, Title = {On optimization over the doubly nonnegative cone}, Year = {2010}} @inproceedings{Yurtsever2016, Address = {Fort Lauderdale}, Author = {Yurtsever, A. and Udell, M. and Tropp, J.A. and Cevher, V.}, Booktitle = {Proc. 20th Int. Conf. Artificial Intelligence and Statistics (AISTATS)}, Date-Added = {2017-11-03 15:24:37 +0000}, Date-Modified = {2017-11-03 15:27:09 +0000}, Month = {May}, Title = {Sketchy Decisions: Convex Low-Rank Matrix Optimization with Optimal Storage}, Year = {2017}} @article{Lavaei2012, Author = {Lavaei, J. and Low, H.L.}, Date-Added = {2017-10-31 14:40:12 +0000}, Date-Modified = {2017-10-31 14:43:52 +0000}, Journal = {{IEEE} Trans. on Power Syst.}, Month = {February}, Number = {1}, Pages = {92--107}, Title = {Zero Duality Gap in Optimal Power Flow Problem}, Volume = {27}, Year = {2012}} @article{Bandeira2016, Author = {Bandeira, A.S. and Boumal, N. and Voroninski, V.}, Date-Added = {2017-10-26 15:53:31 +0000}, Date-Modified = {2017-10-26 15:54:35 +0000}, Journal = {{JMLR}: Workshop and Conference Proceedings}, Pages = {1--22}, Title = {On the low-rank approach for semidefinite programs arising in synchronization and community detection}, Volume = {49}, Year = {2016}} @article{jaggi2011convex, Author = {Jaggi, M.}, Date-Modified = {2017-10-31 15:01:25 +0000}, Journal = {arXiv preprint arXiv:1108.1170}, Title = {Convex optimization without projection steps}, Year = {2011}} @inproceedings{lacoste2012block, Address = {Atlanta, Georgia, USA}, Author = {Lacoste-Julien, S. and Jaggi, M. and Schmidt, M. and Pletscher, P.}, Booktitle = {Proc. $30$th Int. Conf. 
Machine Learning}, Date-Modified = {2017-11-06 23:36:35 +0000}, Journal = {arXiv preprint arXiv:1207.4747}, Title = {Block-coordinate Frank-Wolfe optimization for structural SVMs}, Year = {2013}} @article{Ahmed2014, Author = {Ahmed, A. and Recht, B. and Romberg, J.}, Date-Added = {2017-10-26 15:45:03 +0000}, Date-Modified = {2017-10-26 15:49:20 +0000}, Journal = {{IEEE} Trans. on Inf. Theory}, Number = {3}, Pages = {1711--1732}, Title = {Blind Deconvolution Using Convex Programming}, Volume = {60}, Year = {2014}} @article{Lanckriet2004, Author = {Lanckriet, G.R.G. and Cristianini, N. and Ghaoui, L.E. and Bartlett, P. and Jordan, M.I.}, Date-Added = {2017-10-26 15:38:58 +0000}, Date-Modified = {2017-10-26 15:41:29 +0000}, Journal = {Journal of Machine Learning Research}, Pages = {27--72}, Title = {Learning the kernel matrix with semidefinite programming}, Volume = {5}, Year = {2004}} @article{dAspremont2004, Author = {d'Aspremont, A. and Ghaoui, L.E. and Jordan, M.I. and Lanckriet, G.R.G.}, Date-Added = {2017-10-26 15:34:19 +0000}, Date-Modified = {2017-10-26 15:39:57 +0000}, Journal = {SIAM Review}, Number = {3}, Pages = {434--448}, Title = {A Direct Formulation for Sparse {PCA} Using Semidefinite Programming}, Volume = {49}, Year = {2007}} @article{Peng2007, Author = {Peng, J. and Wei, Y.}, Date-Added = {2017-10-26 15:22:37 +0000}, Date-Modified = {2017-10-26 15:30:53 +0000}, Journal = {SIAM J. Optim.}, Number = {1}, Pages = {186--205}, Title = {Approximating {K}--means--type clustering via semidefinite programming}, Volume = {18}, Year = {2007}} @inproceedings{Hazan2012, Author = {Hazan, E. and Kale, S.}, Booktitle = {Proceedings of the $29$th International Conference on Machine Learning}, Date-Added = {2017-10-26 14:40:57 +0000}, Date-Modified = {2017-10-26 14:42:00 +0000}, Title = {Projection--free Online Learning}, Year = {2012}} @unpublished{TranDinh2017, Author = {Tran--Dinh, Q. and Fercoq, O. and Cevher, V.}, Date-Added = {2017-10-26 14:38:40 +0000}, Date-Modified = {2017-12-30 16:06:44 +0000}, Note = {arXiv:1507.06243}, Title = {A Smooth Primal-Dual Optimization Framework for Nonsmooth Composite Convex Minimization}, Year = {2017}} @inproceedings{Yurtsever2015, Author = {Yurtsever, A. and Tran--Dinh, Q. and Cevher, V.}, Booktitle = {Advances in Neural Information Processing Systems 28}, Date-Added = {2017-10-26 14:30:02 +0000}, Date-Modified = {2017-10-26 14:42:25 +0000}, Title = {A Universal Primal-Dual Convex Optimization Framework}, Year = {2015}} @conference{locatello2017greedy, Author = {Locatello, F. and Tschannen, M. and R{\"a}tsch, G. and Jaggi, M.}, Booktitle = {Advances in Neural Information Processing Systems 30}, Date-Modified = {2018-01-08 08:55:53 +0000}, Title = {Greedy Algorithms for Cone Constrained Optimization with Convergence Guarantees}, Year = {2017}} @inproceedings{locatello2017unified, Author = {Locatello, F. and Khanna, R. and Tschannen, M. and Jaggi, M.}, Booktitle = {Proc. 20th Int. Conf. Artificial Intelligence and Statistics (AISTATS)}, Date-Modified = {2017-11-21 12:42:59 +0000}, Title = {A Unified Optimization View on Generalized Matching Pursuit and Frank-Wolfe}, Year = {2017}} @conference{Jaggi2013, Author = {Jaggi, M.}, Booktitle = {Proceedings of the $30$th International Conference on Machine Learning}, Date-Added = {2017-07-04 22:01:30 +0000}, Date-Modified = {2017-10-31 15:02:13 +0000}, Title = {Revisiting {F}rank--{W}olfe: {P}rojection--free sparse convex optimization}, Year = {2013}} @article{Freund2016, Author = {Freund, R. M. 
and Grigas, P.}, Date-Added = {2017-07-04 21:53:41 +0000}, Date-Modified = {2018-02-06 22:18:55 +0000}, Journal = {Mathematical Programming}, Month = {Jan}, Number = {1}, Pages = {199--230}, Title = {New analysis and results for the {F}rank--{W}olfe method}, Volume = {155}, Year = {2016}} + + +%%% AE added the following references in Sept 2018 + +@article{park2016provable, + title={Provable Burer-Monteiro factorization for a class of norm-constrained matrix problems}, + author={Park, Dohyung and Kyrillidis, Anastasios and Bhojanapalli, Srinadh and Caramanis, Constantine and Sanghavi, Sujay}, + journal={arXiv preprint arXiv:1606.01316}, + year={2016} +} + +@article{mixon2016clustering, + title={Clustering subgaussian mixtures by semidefinite programming}, + author={Mixon, Dustin G and Villar, Soledad and Ward, Rachel}, + journal={arXiv preprint arXiv:1602.06612}, + year={2016} +} \ No newline at end of file diff --git a/nonlinear.tex b/nonlinear.tex index 2b34507..732c988 100755 --- a/nonlinear.tex +++ b/nonlinear.tex @@ -1,782 +1,904 @@ %%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%% % % This is a general template file for the LaTeX package SVJour3 % for Springer journals. Springer Heidelberg 2010/09/16 % % Copy it to a new file with a new name and use it as the basis % for your article. Delete % signs as needed. % % This template includes a few options for different layouts and % content for various journals. Please consult a previous issue of % your journal as needed. % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % % First comes an example EPS file -- just ignore it and % proceed on the \documentclass line % your LaTeX will extract the file if required \begin{filecontents*}{example.eps} %!PS-Adobe-3.0 EPSF-3.0 %%BoundingBox: 19 19 221 221 %%CreationDate: Mon Sep 29 1997 %%Creator: programmed by hand (JK) %%EndComments gsave newpath 20 20 moveto 20 220 lineto 220 220 lineto 220 20 lineto closepath 2 setlinewidth gsave .4 setgray fill grestore stroke grestore \end{filecontents*} % \RequirePackage{fix-cm} % %\documentclass{svjour3} % onecolumn (standard format) %\documentclass[smallcondensed]{svjour3} % onecolumn (ditto) \documentclass[smallextended]{svjour3} % onecolumn (second format) %\documentclass[twocolumn]{svjour3} % twocolumn % \smartqed % flush right qed marks, e.g. at end of proof % \usepackage{amsmath} \usepackage{amssymb} \usepackage{graphicx} \usepackage{algorithm} \usepackage{color} - +\usepackage{hyperref} + +\hypersetup{ + colorlinks=true,% + citecolor=black,% + filecolor=black,% + linkcolor=black,% + urlcolor=blue +} % % \usepackage{mathptmx} % use Times fonts if available on your TeX system % insert here the call for the packages your document requires %\usepackage{latexsym} % etc. 
\PassOptionsToPackage{normalem}{ulem} \newcommand{\minimize}[2]{\ensuremath{\underset{\substack{{#1}}}% {\mathrm{minimize}}\;\;#2 }} \newcommand{\Argmind}[2]{\ensuremath{\underset{\substack{{#1}}}% {\mathrm{Argmin}}\;\;#2 }} \newcommand{\Frac}[2]{\displaystyle{\frac{#1}{#2}}} \newcommand{\menge}[2]{\big\{{#1} \;|\; {#2}\big\}} \newcommand{\Pair}[2]{{\big\langle{{#1},{#2}}\big\rangle}} \newcommand{\pair}[2]{{\langle{{#1},{#2}}\rangle}} \newcommand{\Scal}[2]{{\bigg\langle{{#1}\:\bigg |~{#2}}\bigg\rangle}} \newcommand{\Menge}[2]{\bigg\{{#1}~\bigg|~{#2}\bigg\}} \newcommand{\lev}[1]{\ensuremath{\mathrm{lev}_{\leq #1}\:}} \newcommand{\enve}[1]{\ensuremath{\operatorname{env}#1}} \newcommand{\emp}{\ensuremath{{\varnothing}}} \newcommand{\bdry}{\ensuremath{\operatorname{bdry}}} \newcommand{\yosi}[2]{\ensuremath{\sideset{^{#2}}{}{\operatorname{}}\!\!#1}} \newcommand{\infconv}{\ensuremath{\mbox{\small$\,\square\,$}}} \newcommand{\scal}[2]{\left\langle{#1}\mid {#2} \right\rangle} \newcommand{\pscal}[2]{\langle\langle{#1}\mid{#2}\rangle\rangle} \newcommand{\norm}[1]{\|#1\|} \newcommand{\vuo}{\ensuremath{\mbox{\footnotesize$\square$}}} \newcommand{\exi}{\ensuremath{\exists\,}} \newcommand{\zeroun}{\ensuremath{\left]0,1\right[}} \newcommand{\HH}{\ensuremath{\mathcal H}} \newcommand{\GG}{\ensuremath{\mathcal G}} \newcommand{\YY}{\ensuremath{\mathcal Y}} \newcommand{\XX}{\ensuremath{\mathcal X}} \newcommand{\bary}{\ensuremath{\widetilde{\boldsymbol{y}}}} \newcommand{\barp}{\ensuremath{\widetilde{\boldsymbol{p}}}} \newcommand{\barq}{\ensuremath{\widetilde{\boldsymbol{q}}}} \newcommand{\barx}{\ensuremath{\widetilde{\boldsymbol{x}}}} \newcommand{\BL}{\ensuremath{\EuScript B}\,} \newcommand{\BP}{\ensuremath{\EuScript P}} \newcommand{\HHH}{\ensuremath{\boldsymbol{\mathcal H}}} \newcommand{\WW}{\ensuremath{\boldsymbol{\mathcal W}}} \newcommand{\KKK}{\ensuremath{\boldsymbol{\mathcal K}}} \newcommand{\GGG}{\ensuremath{\boldsymbol{\mathcal G}}} \newcommand{\VV}{\ensuremath{\boldsymbol{V}}} \newcommand{\CCC}{\ensuremath{\boldsymbol{C}}} \newcommand{\MM}{\ensuremath{\boldsymbol{M}}} \newcommand{\SG}{\ensuremath{\boldsymbol{S}}} \newcommand{\EE}{\ensuremath{\boldsymbol{E}}} \newcommand{\XP}{\ensuremath{\mathcal X}^*} \newcommand{\sri}{\ensuremath{\operatorname{sri}}} \newcommand{\ri}{\ensuremath{\operatorname{ri}\,}} \newcommand{\RR}{\ensuremath{\mathbb R}} \newcommand{\KK}{\ensuremath{\mathcal K}} \newcommand{\RP}{\ensuremath{\left[0,+\infty\right[}} \newcommand{\RPP}{\ensuremath{\,\left]0,+\infty\right[}} \newcommand{\RX}{\ensuremath{\,\left]-\infty,+\infty\right]}} \newcommand{\NN}{\ensuremath{\mathbb N}} \newcommand{\SL}{\ensuremath{\EuScript S}\,} \newcommand{\dom}{\ensuremath{\operatorname{dom}}} \newcommand{\cont}{\ensuremath{\operatorname{cont}}} %\newcommand{\gr}{\ensuremath{\operatorname{gra}}} \newcommand{\prox}{\ensuremath{\operatorname{prox}}} \newcommand{\Prox}{\ensuremath{\operatorname{Prox}}} \newcommand{\intdom}{\ensuremath{\operatorname{int}\operatorname{dom}}\,} \newcommand{\inte}{\ensuremath{\operatorname{int}}} \newcommand{\cart}{\ensuremath{\mbox{\huge{$\times$}}}} \newcommand{\scart}{\ensuremath{\mbox{\LARGE{$\times$}}}} \newcommand{\WC}{\ensuremath{{\mathfrak W}}} \newcommand{\TT}{\ensuremath{{\mathbf T}}} \newcommand{\SC}{\ensuremath{{\mathfrak S}}} \newcommand{\rh}{\ensuremath{{\mathrm a}}} \newcommand{\og}{\ensuremath{{\mathrm b}}} \newcommand{\rk}{\ensuremath{{\mathsf {\mathbf a}}}} \newcommand{\ck}{\ensuremath{{\mathsf {\mathbf u}}}} \newcommand{\xk}{\ensuremath{{\mathsf{\mathbf x}}}} 
\newcommand{\yk}{\ensuremath{{\mathsf{\mathbf y}}}} \newcommand{\RPX}{\ensuremath{{[0,\pinf]}}} \newcommand{\card}{\ensuremath{\operatorname{card}}} \newcommand{\bd}{\ensuremath{\operatorname{bdry}}} \newcommand{\Argmin}{\ensuremath{\operatorname{Argmin}}} \newcommand{\argmin}{\ensuremath{\operatorname{argmin}}} \newcommand{\ran}{\ensuremath{\operatorname{ran}}} \newcommand{\zer}{\ensuremath{\operatorname{zer}}} \newcommand{\gra}{\ensuremath{\operatorname{gra}}} \newcommand{\conv}{\ensuremath{\operatorname{conv}}} \newcommand{\vv}{\ensuremath{\boldsymbol{v}}} \newcommand{\sss}{\ensuremath{\boldsymbol{s}}} \newcommand{\xx}{\ensuremath{\boldsymbol{x}}} \newcommand{\xs}{\ensuremath{\textsf{x}}} \newcommand{\xo}{\ensuremath{\overline{\boldsymbol{x}}}} \newcommand{\pp}{\ensuremath{\boldsymbol{p}}} \newcommand{\qq}{\ensuremath{\boldsymbol{q}}} \newcommand{\yy}{\ensuremath{\boldsymbol{y}}} \newcommand{\ff}{\ensuremath{\boldsymbol{f}}} \newcommand{\hh}{\ensuremath{\boldsymbol{h}}} \newcommand{\ttt}{\ensuremath{\boldsymbol{t}}} \newcommand{\ee}{\ensuremath{\boldsymbol{e}}} \newcommand{\rr}{\ensuremath{\boldsymbol{r}}} \newcommand{\gggg}{\ensuremath{\boldsymbol{g}}} \newcommand{\zz}{\ensuremath{\boldsymbol{z}}} \newcommand{\bb}{\ensuremath{\boldsymbol{b}}} \newcommand{\uu}{\ensuremath{\boldsymbol{u}}} \newcommand{\cc}{\ensuremath{\boldsymbol{c}}} \newcommand{\dd}{\ensuremath{\boldsymbol{d}}} \newcommand{\aaa}{\ensuremath{\boldsymbol{a}}} \newcommand{\ww}{\ensuremath{\boldsymbol{w}}} \newcommand{\BB}{\ensuremath{\boldsymbol{B}}} \newcommand{\LL}{\ensuremath{\boldsymbol{L}}} \newcommand{\PPP}{\ensuremath{\boldsymbol{\mathsf{P}}}} \newcommand{\UU}{\ensuremath{\boldsymbol{U}}} \newcommand{\EEE}{\ensuremath{\boldsymbol{E}}} \newcommand{\E}{\ensuremath{\mathbf{E}}} \newcommand{\D}{\ensuremath{\mathbf{D}}} \newcommand{\ep}{\ensuremath{\boldsymbol{\varepsilon}}} \newcommand{\RRR}{\ensuremath{\boldsymbol{R}}} %\newcommand{\RRR}{\ensuremath{\boldsymbol{R}}} \newcommand{\AAA}{\ensuremath{\boldsymbol{A}}} \newcommand{\BBB}{\ensuremath{\boldsymbol{B}}} \newcommand{\QQ}{\ensuremath{\boldsymbol{Q}}} \newcommand{\SSS}{\ensuremath{\boldsymbol{S}}} \newcommand{\DD}{\ensuremath{\boldsymbol{D}}} \newcommand{\PP}{\ensuremath{\boldsymbol{P}}} \newcommand{\FF}{\ensuremath{\boldsymbol{\mathcal{F}}}} \newcommand{\cone}{\ensuremath{\operatorname{cone}}} \newcommand{\Fix}{\ensuremath{\operatorname{Fix}}} \newcommand{\Id}{\ensuremath{\operatorname{Id}}} \newcommand{\diam}{\ensuremath{\operatorname{diam}}} \newcommand{\IId}{\ensuremath{\boldsymbol{\operatorname{Id}}}} \newcommand{\weakly}{\ensuremath{\rightharpoonup}} \newcommand{\minf}{\ensuremath{-\infty}} \newcommand{\pinf}{\ensuremath{+\infty}} \newcommand{\LLambda}{\ensuremath{\boldsymbol{\Lambda}}} \newcommand{\vva}{\ensuremath{\boldsymbol{\epsilon}}} \newcommand{\trace}{\operatorname{tr}} % AE's commands \newcommand{\edita}[1]{{\color{blue} #1}} -\newcommand{\notea}[1]{{\color{blue} \textbf{Note: #1}}} +\newcommand{\notea}[1]{{\color{magenta} \textbf{Note: #1}}} +\newcommand{\ol}{\overline} + % please place your own definitions here and don't use \def but % \newcommand{}{} % % Insert the name of "your journal" with % \journalname{myjournal} % \begin{document} \title{A relaxation of the augmented Lagrange method } %\subtitle{Do you have a subtitle?\\ If so, write it here} %\titlerunning{Short form of title} % if too long for running head \author{Bang C. Vu\and Alcaoglu Ahmet\and Sahin M. 
Fatih\and Alp Yurtsever\and Volkan Cevher \\[5mm] } %\authorrunning{Short form of author list} % if too long for running head \institute{Laboratory for Information and Inference Systems (LIONS), EPFL, Switzerland\\ \email{bang.vu@epfl.ch\and ahmet.alacaoglu@epfl.ch; mehmet.sahin@epfl.ch;alp.yurtsever@epfl.ch; mehmet.sahin@epfl.ch;volkan.cehver@epfl.ch} } \date{Received: date / Accepted: date} % The correct dates will be entered by the editor \maketitle \begin{abstract} We propose a splitting method for solving .... \keywords{Non-linear constraint \and Non-convex \and Smoothing\and Primal-dual} % \PACS{PACS code1 \and PACS code2 \and more} \subclass{47H05\and 49M29\and 49M27\and 90C25} \end{abstract} -\section{Introduction} +\section{Introduction \label{intro}} -\edita{Various problems in engineering and computational sciences can be cast as non-linear optimization programs, and the design of efficient numerical algorithms to provably solve such problems is therefore of fundamental importance. cite?} +\edita{Various problems in engineering and computational sciences can be cast as non-linear optimization programs, and the design of efficient numerical algorithms to provably solve such problems is therefore of fundamental importance. cite? %Non-linear programming is a broad discipline in applied mathematics. -In this paper, we \edita{are particularly interested in solving the optimization program -\label{intro} -\begin{equation}\label{prob:01} +In this paper, we are particularly interested in solving the optimization program +\begin{equation} +\label{prob:01} \begin{cases} \min_{u} h(u),\\ -L(u) = b,\\ +A(u) = b,\\ u\in C, \end{cases} -\end{equation}}where -$C\subset \RR^d$ is a non-empty closed convex set, -$L\colon \RR^d\to\RR^m$ is a non-linear but continuously differentiable, -$h\colon\RR^d\to \RR$ is a differentiable function with \edita{$\mathbb{L}_h$-Lipschitz continuous gradient.} \notea{Maybe we should change the operator $L$ to something else, specially since it's nonlinear and also because it gets confused with Lipschitz constant.} +\end{equation}where $h\in \mathbb{L}_{d}(\lambda_h)$ is a continuously-differentiable function from $\mathbb{R}^d$ to $\RR$ with $\lambda_h$ Lipschitz-continuous gradient, $A: \mathbb{R}^d\rightarrow\mathbb{R}^m$ is a non-linear function, with each component $A_i\in \mathbb{L}_d(\lambda_A) $ for $i\in \{1,\cdots,m\}$. Moreover, +$C\subset \RR^d$ is non-empty, closed, and convex. %\edita{Program \eqref{prob:01} is typically} non-convex and \edita{can be considered as a standard non-linear program, where the set $C$ %is modeled by inequalities constraints, namely, $C = \menge{u}{g(u) \leq 0}$ for some $g\colon\RR^d\to \RR^s$.} -\edita{Variants of Program \eqref{prob:01} naturally arise in ?? For the sake of brevity, we showcase only one instance of Program $\eqref{prob:01}$.} \notea{Please add some representative applications above alongside some references. } + + +Variants of Program~\eqref{prob:01} naturally arise in a broad range of applications in ?? \notea{Please add some representative applications above alongside some references. } For the sake of brevity, we showcase here one instance of Program $\eqref{prob:01}$.} \begin{example}\edita{{{\textbf{(Burer-Monteiro factorization)}}} -Let $\mathcal{X} = \mathbb{S}^{d\times d}$ be the space of $d\times d$ symmetric matrices, equipped with the standard inner product $\langle x|y\rangle = \text{tr}(x^*y)$. 
In particular, when $x\in \mathcal{X}$ is positive semi-definite, we write that $x\succeq 0$.} +Let $\mathbb{S}^{d'\times d'}$ be the space of $d'\times d'$ symmetric matrices, equipped with the standard inner product $\langle x|y\rangle = \text{tr}(x^*y)$. In particular, when $x\in \mathbb{S}^{d'\times d'}$ is positive semi-definite, we write that $x\succeq 0$.} % Consider $\mathcal{C}'\subseteq \mathcal{X}$, and let $h_0\colon\mathcal{X}\to \RR$ be a differentiable %convex function, with $\mathbb{L}_{0}$ Lipschitz-continuous gradient.} % \edita{Consider the program \begin{equation} \label{e:fac} \begin{cases} -\min_x h_0(x) \\ -M(x) = b\\ -x \succeq 0 \\ -x\in C', +\min_x h'(x) \\ +A'(x) = b'\\ +x\in C'\\ +x \succeq 0 , \end{cases} \end{equation} -where $C' \subseteq \mathcal{X}$ is a non-empty, closed, and convex subset of $\mathcal{X}$, and $h_0: \mathcal{X}\to \RR$ is a differentiable convex function, with $\mathbb{L}_0$ Lipschitz-continuous gradient. Also, $M\colon\mathcal{X}\to\RR^m$ and $b\in\RR^m$.} - -The problem \eqref{e:fac} includes many variety problems in semidefinite programming \cite{Burer2005,Burer2003,tu2014practical}. -This problem is a convex optimal problem. However, convex optimal methods are not good choice when $d$ is very large. To -overcome this issue, factorized technique \cite{Burer2005,Burer2003} $x = uu^\top$, that transmit the hight dimensional $x$ into low dimensional $u$, will be useful. -Moreover, the constraint $x\succeq 0$ will be disappeared in $u$. Suppose that one can find $C \subset \RR^{n\times r}$ such that -$x\in C'$ if $u\in C$. By defining $L\colon u\mapsto Muu^\top$ and $h\colon u\mapsto h_0(uu^\top)$, we can -write \eqref{e:fac} as a non-convex optimal problem: -\begin{equation} - \label{e:fac1} -\underset{u\in C, Lu=b} {\text{min}} \; h(u). -\end{equation} - Then it is not hard to check that \eqref{e:fac1} fits to our framework. \notea{Example hasn't been verified.} +where $h': \mathbb{S}^{d'\times d'} \to \RR$, $A'\colon\mathbb{S}^{d'\times d'}\to\RR^m$, $b'\in\RR^m$, and $C' \subseteq \mathbb{S}^{d'\times d'}$. + +Variants of Program \eqref{e:fac} are popular in matrix completion and sensing \cite{park2016provable}, with a broad range of applications to problems in collaborative filtering, geophysics, and imaging, among others~\cite{Burer2005,Burer2003,tu2014practical}. Two common choices for $C'$ in Program \eqref{e:fac} are +$C' =\{x: x \ge 0\}$ and $C' = \{x: \text{tr}(x) \le 1\}$ \cite{mixon2016clustering}.} + +\edita{Solving Program \eqref{e:fac} with semi-definite programming is not scalable, becoming increasingly cumbersome as the dimension $d'$ grows. +To overcome this {computational bottleneck}, the factorized technique sets $x = uu^\top$ for $u\in \mathbb{R}^{d'\times r}$ and a sufficiently large $r$. The resulting non-convex program is then solved with respect to the much lower-dimensional variable $u$. If we also replace the constraint $uu^\top \in C'$ with $u\in C$ for a properly chosen convex set, the new problem in $u$ matches Program \eqref{prob:01} with $h(u) = h'(uu^\top)$ and $A(u) = A'(uu^\top)$. For our examples of $C'$ above, we might choose $C=\{u: u\ge 0\}$ and $C=\{u: \|u\|_F^2 \le 1\}$, respectively. Here, $\|\cdot\|_F$ stands for the Frobenius norm.
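In both cases, the projection onto $C$ is explicit: $P_C$ simply zeroes out the negative entries of $u$ when $C=\{u: u\ge 0\}$, and $P_C(u) = u/\max\{1,\|u\|_F\}$ when $C=\{u: \|u\|_F^2 \le 1\}$. This is one reason why the projected-gradient updates proposed below remain inexpensive for this example.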
+} \end{example} \edita{The \emph{augmented Lagrangian method} \cite{luenberger1984linear} is a powerful approach to solve Program \eqref{prob:01}; see Section \ref{sec:related work} for a review of the related literature as well as other approaches to solve Program \eqref{prob:01}}. \edita{ Indeed, for positive $\beta$, it is easy to verify that Program \eqref{prob:01} is equivalent to \begin{align} \min_{u\in C} \max_y \, \mathcal{L}_\beta(u,y), \label{eq:minmax} \end{align} where \begin{align} -\mathcal{L}_\beta(u,y) := h(u) + \langle L(u)-b, y \rangle + \frac{1}{2\beta}\|L(u)-b\|_2^2, +\label{eq:Lagrangian} +\mathcal{L}_\beta(u,y) := h(u) + \langle A(u)-b, y \rangle + \frac{1}{2\beta}\|A(u)-b\|_2^2, \end{align} is the augmented Lagrangian corresponding to Program \eqref{prob:01}. The equivalent formulation in Program \eqref{eq:minmax} naturally suggests the following algorithm to solve Program \eqref{prob:01}:} \begin{equation}\label{e:exac} -u_{k+1} \in \argmin_{u} \mathcal{L}_{\beta}(u,y_k), +u_{k+1} \in \underset{u\in C}{\argmin} \, \mathcal{L}_{\beta}(u,y_k), \end{equation} \begin{equation} -y_{k+1} = y_k+\frac{1}{\beta}(Lu_{k+1} -b). +y_{k+1} = y_k+\frac{1}{\beta}(A(u_{k+1}) -b). \end{equation} In fact, when the penalty parameter $\beta$ is sufficiently small, the augmented Lagrangian has a local minimum near the true optimal point. However, it is not known in advance how small $\beta$ must be. -Hence, the choice of $\beta$ plays a centreral role in practices. Moreover, - $\mathcal{L}_{\beta}(u,y)$ is non-convex in $u$, finding -$u_{k+1}$ is a problematic. +Hence, the choice of $\beta$ plays a central role in practice. \notea{Is the last claim really true? Programs \eqref{prob:01} and \eqref{eq:minmax} seem to be equivalent. } +\edita{In our nonlinear framework, updating $u$ in the augmented Lagrangian method requires solving the non-convex Program \eqref{e:exac} to global optimality, which is often intractable. } \notea{We should discuss fixes to this issue, if any, and explain why they are not satisfactory.} \edita{The key contribution of this paper is to provably and efficiently address this challenge.} -This paper we overcome these changlenges. We propose a algorithmic framework where instead of -solving \eqref{e:exac}, we do only one projected gradient step. Then, update $\beta$ adatively. -%%%%%%%%%%%%%%%%%%%% -\section{ Preliminaries} -\subsection{Notations} -We use the notations $\scal{\cdot}{\cdot}$ and $\|\cdot\|$ for the scalar product and associated norm on $\RR^d$. -The conjugate of the linear operator $A$ is denoted by $A^*$. -Let $C$ be a nonempty closed subset of $\mathcal{X}$. The indicator function of $\mathcal{C}$ is denoted by $\iota_{\mathcal{C}}$. -The distance function is $d_{\mathcal{C}}\colon u\mapsto \inf_{a\in\mathcal{C}}\|u-a\|$. The projection of $x$ onto $C$ is denoted by $P_Cx$. -Let $\overline{u}\in C$, the tangent cone to $C$ at $\overline{u}$ is +\edita{ +\paragraph{\emph{\textbf{Contributions.}} } + +In order to solve Program \eqref{prob:01}, this paper proposes to replace the (intractable) Program \eqref{e:exac} with the update \begin{equation} -T_{C}(\overline{u}) = \menge{u\in \RR^d}{\exists t_k\downarrow 0, u_k \to u \;\text{with}\; (\forall k\in\NN)\; \overline{u} + t_ku_k\in C}. +u_{k+1} = P_C (u_k - \gamma_k \nabla \mathcal{L}_{\beta_k} (u_k,y_k)), +\label{eq:new update} \end{equation} -The regular normal cone $\hat{N}_C(\overline{u})$ is defined as the dual to the tangent cone, $\hat{N}_C(\overline{u}) = T_{C}(\overline{u})^*$.
-The (Mordukhovich) limiting normal cone to $C$ at $\overline{u}$ is defined by +for carefully selected sequences $\{\beta_k,\gamma_k\}_k$. Here, $P_C$ is the orthogonal projection onto the convex set $C$ which is often easy to compute in various applications and consequently the update in \eqref{eq:new update} is inexpensive and fast. + +Put differently, instead of fully solving Program \eqref{e:exac}, this paper proposes to apply one iteration of the projected gradient algorithm for every update. We provide the convergence guarantees for this fast and scalable new algorithm. +}\notea{We should summarize the guarantees.} + +%%%%%%%%%%%%%%%%%%%%% +\section{ Preliminaries} +\notea{I think the whole of this section should move down. The actual results are hidden deep in the paper!} + +\paragraph{\textbf{\emph{Notation.}}} +We use the notations $\scal{\cdot}{\cdot}$ and $\|\cdot\|$ for the \edita{standard inner} product and \edita{the} associated norm on $\RR^d$\edita{, respectively}. +The conjugate of \edita{a} linear operator $A$ is denoted by $A^\top$. +Let $C\subset \mathbb{R}^d$ be nonempty, closed, \edita{and convex}. The indicator function of $C$ is denoted by $\iota_{\mathcal{C}}$, and the projection onto $C$ is denoted by $P_C$. +%The distance function is $d_{\mathcal{C}}\colon u\mapsto \inf_{a\in\mathcal{C}}\|u-a\|$. The projection of $x$ onto $C$ is denoted by $P_Cx$. +\edita{For $u\in C$, the tangent cone to $C$ at $u$ is \begin{equation} -N_C(\overline{u}) = \menge{v\in \RR^d}{\exists u_k\to \overline{u}, v_k\to v \;\text{with}\; (\forall k\in\NN)\; v_k \in \hat{N}_{C}(u_k)}. +T_{C}(u) = \left\{v\in \RR^d : \exists t > 0 \text{ such that } u+t v \in C\right\}. \end{equation} -Let $f\colon \RR^d\to \left]-\infty, +\infty\right]$ be a proper, lsc, convex function. The sub-differential of $f$ at $p$ is +The corresponding normal cone $N_C(u)$ at $u$ is the polar of the tangent cone, namely, +\begin{align} +N_C(u) = \left\{ v': \langle v, v' \rangle \le 0,\, \forall v\in T_C(u) \right\}. +\end{align}} +%The regular normal cone $\hat{N}_C(\overline{u})$ is defined as the dual to the tangent cone, $\hat{N}_C(\overline{u}) = T_{C}(\overline{u})^*$. +%The (Mordukhovich) limiting normal cone to $C$ at $\overline{u}$ is defined by +%\begin{equation} +%N_C(\overline{u}) = \menge{v\in \RR^d}{\exists u_k\to \overline{u}, v_k\to v \;\text{with}\; (\forall k\in\NN)\; v_k \in \hat{N}_{C}(u_k)}. +%\end{equation} + \edita{ +The sub-differential of a convex function $f$ at $u$ is defined as +% +%Let $f\colon \RR^d\to (-\infty, +\infty$ be a proper, lower semi-continuous, convex function. The sub-differential of $f$ at $p$ is \begin{equation} -\partial f(p)=\menge{u\in\mathcal{X}}{(\forall y \in\mathcal{X})\; f(y) -f(p) \geq \scal{u}{y-p} }. -\end{equation} -If $f$ is differentiable, $\partial f(p)$ is a singleton and denoted by $\nabla f$. +\partial f(u)= +\left\{ g : +f(u') - f(u) \ge \langle g, u'-u\rangle, +\,\, \forall u' + \right\}. +\end{equation} } +\edita{In particular,} if $f$ is differentiable at $u$, $\partial f(u)$ is a singleton and denoted by $\nabla f(u)$. \label{s:nota} -\subsection{Fist order optimality conditions} -Fist order optimality conditions for the problem \eqref{prob:01} are well studied in the literature \cite[Corollary 6.15]{rockafellar2009variational}. -Let $\overline{u}$ be a locally optimal and suppose that there no vector $y\not=0$ such that -\begin{equation} --\nabla L(\overline{u})^* y \in N_C(\overline{u}). 
-\end{equation} -Then, the first order optimality condition for $\overline{u}$ is -\begin{equation}\label{e:inclu1} -(\exists y \in\RR^m)\; - \nabla L(\overline{u})^*y - \nabla h(\overline{u}) \in N_{C}(\overline{u}). -\end{equation} -Since $\partial \iota_{\{0\}} = \RR^m$, the condition \eqref{e:inclu1} is equivalent to -\begin{equation} -0 \in \partial \iota_C(\overline{u}) +\nabla h(\overline{u}) + \nabla L(\overline{u})^* \partial \iota_{\{0\}}(L\overline{u}-b). -\end{equation} -Observe that the condition \eqref{e:inclu1} is also equivalent to the following condition -\begin{equation}\label{e:inclu2} -(\exists y \in\RR^m)\; 0 \in \nabla \mathcal{L}(\overline{u},y), -\end{equation} -where $\mathcal{L}(u,y)$ is the Lagrangian function associated to the non-linear constraint $Lu=b$, -\begin{equation} - \mathcal{L}_{\beta}(u,y) = (h +\iota_C)(u) + \scal{Lu-b}{y}. -\end{equation} -The corresponding augmented Lagrangian function associated to the non-linear constraint $Lu=b$ is defined by -\begin{equation} -(\forall \beta \in \left]0,+\infty\right[)\quad \mathcal{L}_{\beta}(u,y) = (h+\iota_C)(u) + \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2. -\end{equation} -For convenience, we define -\begin{equation} -(\forall \beta \in \left]0,+\infty\right[)\quad g_{\beta}(u,y) = \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2. -\end{equation} -and -\begin{equation} -(\forall \beta \in \left]0,+\infty\right[)\quad F_{\beta}(u,y) = h(u)+ g_{\beta}(u,y). -\end{equation} +\paragraph{\textbf{\emph{\edita{Necessary Optimality Conditions.}}} \label{sec:opt cnds}} +\edita{Necessary optimality conditions} for \edita{Program} \eqref{prob:01} are well studied in the literature \cite[Corollary 6.15]{rockafellar2009variational}. \edita{Indeed, $u$ is a (first-order) stationary point of Program \eqref{prob:01} if there exists $y$ for which +\begin{align} +\begin{cases} +-\nabla h(u) - DA(u)^\top y \in N_C(u)\\ +A(u) = b. +\end{cases} +\label{e:inclu1} +\end{align} +Here, $DA(u)$ is the Jacobian of $A$ at $u$. Recalling \eqref{eq:Lagrangian}, we observe that \eqref{e:inclu1} is equivalent to +\begin{align} +\begin{cases} +-\nabla_u \mathcal{L}_\beta(u,y) \in N_C(u)\\ +A(u) = b, +\end{cases} +\label{e:inclu1} +\end{align} +which is in turn the necessary optimality condition for Program \eqref{eq:minmax}. +} +% +%Let $\overline{u}$ be a locally optimal and suppose that there no vector $y\not=0$ such that +%\begin{equation} +%-\nabla L(\overline{u})^* y \in N_C(\overline{u}). +%\end{equation} +%Then, the first order optimality condition for $\overline{u}$ is +%\begin{equation}\label{e:inclu1} +%(\exists y \in\RR^m)\; - \nabla L(\overline{u})^*y - \nabla h(\overline{u}) \in N_{C}(\overline{u}). +%\end{equation} +%Since $\partial \iota_{\{0\}} = \RR^m$, the condition \eqref{e:inclu1} is equivalent to +%\begin{equation} +%0 \in \partial \iota_C(\overline{u}) +\nabla h(\overline{u}) + \nabla L(\overline{u})^* \partial \iota_{\{0\}}(L\overline{u}-b). +%\end{equation} +%Observe that the condition \eqref{e:inclu1} is also equivalent to the following condition +%\begin{equation}\label{e:inclu2} +%(\exists y \in\RR^m)\; 0 \in \nabla \mathcal{L}(\overline{u},y), +%\end{equation} +%where $\mathcal{L}(u,y)$ is the Lagrangian function associated to the non-linear constraint $Lu=b$, +%\begin{equation} +% \mathcal{L}_{\beta}(u,y) = (h +\iota_C)(u) + \scal{Lu-b}{y}. 
+%\end{equation} +%The corresponding augmented Lagrangian function associated to the non-linear constraint $Lu=b$ is defined by +%\begin{equation} +%(\forall \beta \in \left]0,+\infty\right[)\quad \mathcal{L}_{\beta}(u,y) = (h+\iota_C)(u) + \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2. +%\end{equation} +%For convenience, we define +%\begin{equation} +%(\forall \beta \in \left]0,+\infty\right[)\quad g_{\beta}(u,y) = \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2. +%\end{equation} +%and +%\begin{equation} +%(\forall \beta \in \left]0,+\infty\right[)\quad F_{\beta}(u,y) = h(u)+ g_{\beta}(u,y). +%\end{equation} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Gradient mapping} -In nonconvex optimization, the relations between the gradient mapping and station point are well-understood \cite{Lan16,Hare2009,bolte2014proximal}. -We recall the relation between the gradient mapping and function value $F_{\beta}$. -\begin{definition} Given $u$ and $\gamma >0$, define - gradient mapping $G_{\beta,\gamma}(\cdot; y)\colon u\mapsto \gamma^{-1}(u-u^+)$, where $u^+=P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. +\paragraph{\textbf{\emph{Gradient Mapping.}}} +In nonconvex optimization, the relation between the gradient mapping and stationarity is well-understood \cite{Lan16,Hare2009,bolte2014proximal}, \edita{which we review here for completeness.} +\begin{definition} \label{def:grad map} Given $u$ and $\gamma >0$, define + the gradient mapping $G_{\beta,\gamma}(\cdot; y)\colon u\rightarrow \gamma^{-1}(u-u^+)$, where $u^+=P_{C}(u-\gamma \nabla \mathcal{L}_ {\beta}(u,y))$. \end{definition} %\begin{definition} Given $u$ and $\gamma >0$, let $r_{\beta}(\xi,y)$ be a stochastic estimimate of $\nabla F_{\beta}(u,y)$, define %stochastic gradient mapping $SG_{\beta,\gamma}(\cdot; y)\colon u\mapsto \gamma^{-1}(u-u_+)$, where $u_+=\prox_{\gamma f}(u-\gamma r_{\beta}(\xi;y))$. %\end{definition} - -The following lemma is an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. +\edita{In particular, if we remove the constraints of Program \eqref{prob:01}, the gradient mapping reduces to $G_{\beta,\gamma}(u,y)=\nabla h(u) $. The gradient mapping is closely related to $\mathcal{L}_\beta$}. The following result follows immediately from an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. \begin{lemma}\label{lem:11} - Suppose that $\nabla g_{\beta}(\cdot, y)$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$. Then, - for any $\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$, + \edita{For fixed $y\in \RR^m$, suppose that $\mathcal{L}_{\beta}(\cdot, y) \in \mathbb{L}_{d}(\lambda_\beta)$. For $u\in C$ and $\gamma \in (0, 1/\lambda_\beta)$, it holds that } \begin{equation} \label{e:deslem} - \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{2}{\gamma} (F_{\beta}(u;y) - F_{\beta}(u^+;y)). + \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{2}{\gamma} (\mathcal{L}_\beta(u;y) - \mathcal{L}_\beta(u^+;y)). \end{equation} -where $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. +%where $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. %\end{enumerate} \end{lemma} -\begin{proof} For a fixed $y$, the function $u\mapsto h(u)+F_{\beta}(u;y)$ is $(\mathbb{L}_h+ \mathbb{L}_{\beta})$-Lipschitz. Hence, the results follows from -\cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. -\end{proof} - -In application, the Lipschitz constant $L_{\beta}$ is somtimes hard to evaluate exactly. In this case we can exploit the classic line search technique. 
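Note that this backtracking line search terminates after finitely many trials and returns a step size that is not too small: assuming $\mathcal{L}_\beta(\cdot,y)$ has a $\lambda_\beta$-Lipschitz continuous gradient, the standard descent lemma gives
\begin{equation*}
\mathcal{L}_\beta(u^+_{\gamma'},y) \le \mathcal{L}_\beta(u,y) + \left\langle u^+_{\gamma'} - u | \nabla \mathcal{L}_{\beta}(u,y) \right\rangle + \frac{\lambda_\beta}{2} \| u^+_{\gamma'} - u\|^2,
\end{equation*}
so the sufficient-decrease test in the line search is met whenever $\gamma' \le 1/\lambda_\beta$, and consequently the accepted step size satisfies $\gamma \ge \min\{\gamma_0, \rho/\lambda_\beta\}$.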
-Let $\delta $ and $\theta$ be in $\left]0,1\right[$ and $\overline{\gamma} > 0$. Define -\begin{alignat}{2}\label{e:non-lips} -&\gamma = \max\{\mu > 0| (\exists i \in\NN)(\mu= \overline{\gamma}\theta^i)\notag\\ -&F_{\beta}(u^+(\mu,u),y) \leq F_{\beta}(u,y) + \scal{u^+(\mu,u)-u}{\nabla F_{\beta}(u,y)} +\frac{\delta}{\mu}\|u-u^+(\mu,u)\|^2\}. -\end{alignat} -Now, let $\gamma$ be define as in \eqref{e:non-lips}. Then, since $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$, -\begin{equation} -G_{\beta,\gamma}(u) - \nabla F_{\beta}(u;y)) \in N_{C}(u^+). -\end{equation} -Hence, $\scal{G_{\beta,\gamma}(u) - \nabla F_{\beta}(u;y)) }{u-u^+} \leq 0$. Using \eqref{e:non-lips}, we have -\begin{alignat}{2} -F_{\beta}(u^+,y) &\leq F_{\beta}(u,y) + \scal{u-u^+}{ -\nabla F_{\beta}(u,y)} +\frac{\delta}{\gamma}\|u-u^+\|^2\notag\\ -&= F_{\beta}(u,y) + \scal{u-u^+}{ G_{\beta,\gamma}(u) -\nabla F_{\beta}(u,y)} +\frac{\delta-1}{\gamma}\|u-u^+\|^2\notag\\ -&\leq F_{\beta}(u,y) - \frac{\delta-1}{\gamma}\|u-u^+\|^2, -\end{alignat} -which implies that -\begin{equation} -(1-\delta) \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{1}{\gamma} (F_{\beta}(u;y) - F_{\beta}(u^+;y)). -\end{equation} -In particular, by taking $\delta =1/2$, we obtain \eqref{e:deslem}. -\begin{lemma} -\label{l:solv} - Suppose that $\nabla g_{\beta}(\cdot, y)$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$ and -$\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$. Suppose that $Lu^+=b$ and $\| G_{\beta,\gamma}(u;y)\|^{2} =0$. -Then $u^+$ is a stationary point, i.e., the first order optimality condition \eqref{e:inclu1}, for $ \overline{u} =u^+$, is satisfied. +%\begin{proof} For a fixed $y$, the function $u\mapsto h(u)+F_{\beta}(u;y)$ is $(\mathbb{L}_h+ \mathbb{L}_{\beta})$-Lipschitz. Hence, the results follows from +%\cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}. +%\end{proof} + +In \edita{practice}, the Lipschitz constant $\lambda_{\beta}$ is often hard to evaluate exactly \edita{and we might resort to the classic line search technique, reviewed below and proved in Appendix \ref{sec:proof of eval Lipsc cte}.} +\edita{ +\begin{lemma} \label{lem:eval Lipsc cte} Fix $\rho \in (0,1)$ and ${\gamma}_0$. For $\ol{\gamma}>0$, let $u^+_{\gamma'} = P_C(u - \gamma' \nabla \mathcal{L}_\beta(u,y))$ and define +\begin{equation*} +\gamma := +\max \left\{ +\gamma' ={\gamma}_0 \rho^i : +\mathcal{L}_\beta (u^+_{\gamma'},y) \le \mathcal{L}_\beta (u,y) + +\left\langle u^+_{\gamma'} - u | \nabla \mathcal{L}_{\beta}(u,y) \right\rangle ++ \frac{1}{2\gamma'} \| u^+_{\gamma'} - u\|^2 +\right\}. +\end{equation*} +Then, (\ref{e:deslem}) holds. \end{lemma} -\begin{proof} -Since $u^+ = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. Then - it follows that -\begin{equation} -\label{e:ver1} -G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) \in N_C(u^+). -\end{equation} -Adding $\nabla F_{\beta}(u^+;y)$ to both sides of \eqref{e:ver1}, we obtain -\begin{equation} -\label{e:ver2} -G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) +\nabla F_{\beta}(u^+;y) \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y). 
-\end{equation} -Using the Lipschitzian gradient of $F_{\beta}(\cdot, y)$ and $\gamma (\mathbb{L}_\beta +\mathbb{L}_{h}) \leq 1$, we see that -\begin{alignat}{2} -&\|G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y) \| \notag\\ -&\quad\leq \|G_{\gamma,\beta}(u;y)\|+\|\nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y) \|\notag\\ -&\quad\leq \|G_{\gamma,\beta}(u;y)\|+( \mathbb{L}_\beta +\mathbb{L}_{h})\|u-u^+\|\notag\\ -&\quad =0, -\end{alignat} -which means that $G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y)=0$. Hence, we derive from -\eqref{e:ver2} that -\begin{equation} -\label{conkhi} -0 \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y). -\end{equation} -By definition of $F_{\beta}(u^+;y)$ and $Lu^+=b$, we have -\begin{alignat}{2} -\nabla F_{\beta}(u^+;y) &= \nabla h(u^+) + \nabla L(u^+)^*y + \frac{1}{\beta} \nabla L(u^+)^*(Lu^+-b) \notag\\ -&= \nabla h(u^+) + \nabla L(u^+)^*y, -\end{alignat} -which together with \eqref{conkhi}, shows that \eqref{e:inclu1} is satisfied. -\end{proof} -\subsection{Sufficient conditions} -Sufficient condition for a local minima is well-studied in the litterature \cite{luenberger1984linear,rockafellar2009variational,mordukhovich2006variational,gfrerer2015complete}. -For simple, let us assume that $C = \menge{u}{g(u)\leq c}$ for some $\mathcal{C}^2$-function $g\colon\RR^d\to\RR$. In this section we assume that $h$ and $L$ are too $\mathcal{C}^2$-functions. -Let $\overline{u}$ be a point such that $g(\overline{u})=c$ and $0\not\in\nabla g(\overline{u})$. Then, \cite[Proposition 10.3]{rockafellar2009variational} implies that -\begin{equation} -N_{C}(\overline{u}) =\menge{\mu \nabla g(\overline{u})}{\mu \geq 0}. -\end{equation} -Now suppose that the first oder optimality condition \eqref{e:inclu1} is satisfied for $\overline{u}$. Then there exists $\mu \geq 0$ and $y \in\RR^m$ such that -\begin{equation}\label{e:inclu1} - \nabla L(\overline{u})^*y + \nabla h(\overline{u}) +\mu \nabla g(\overline{u})=0. - \end{equation} - Therefore, inview of \cite[Chapter 11]{luenberger1984linear}, $\overline{u}$ is a local minimum provided that - the Hessian matrix - \begin{equation} - H(\overline{u}) = \nabla^2h(\overline{u}) + y^T\nabla^2 L(\overline{u}) + \mu \nabla^2 g(\overline{u}) - \end{equation} - is positive definite on the space -\begin{equation} -E= \menge{y}{\nabla L(\overline{u})y =0, \nabla g(\overline{u})y=0}. -\end{equation} -In our examples, this condition is checkable where $g(u) = \frac{1}{2}\|u\|^2$, $h$ is quadratic and $Lu = Muu^\top$. +} + + +%Let $\delta $ and $\theta$ be in $\left]0,1\right[$ and $\overline{\gamma} > 0$. Define +%\begin{alignat}{2}\label{e:non-lips} +%&\gamma = \max\{\mu > 0| (\exists i \in\NN)(\mu= \overline{\gamma}\theta^i)\notag\\ +%&F_{\beta}(u^+(\mu,u),y) \leq F_{\beta}(u,y) + \scal{u^+(\mu,u)-u}{\nabla F_{\beta}(u,y)} +\frac{\delta}{\mu}\|u-u^+(\mu,u)\|^2\}. +%\end{alignat} +%Now, let $\gamma$ be define as in \eqref{e:non-lips}. Then, since $u^+ =u^+(\mu,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$, +%\begin{equation} +%G_{\beta,\gamma}(u) - \nabla F_{\beta}(u;y)) \in N_{C}(u^+). +%\end{equation} +%Hence, $\scal{G_{\beta,\gamma}(u) - \nabla F_{\beta}(u;y)) }{u-u^+} \leq 0$. 
Using \eqref{e:non-lips}, we have +%\begin{alignat}{2} +%F_{\beta}(u^+,y) &\leq F_{\beta}(u,y) + \scal{u-u^+}{ -\nabla F_{\beta}(u,y)} +\frac{\delta}{\gamma}\|u-u^+\|^2\notag\\ +%&= F_{\beta}(u,y) + \scal{u-u^+}{ G_{\beta,\gamma}(u) -\nabla F_{\beta}(u,y)} +\frac{\delta-1}{\gamma}\|u-u^+\|^2\notag\\ +%&\leq F_{\beta}(u,y) - \frac{\delta-1}{\gamma}\|u-u^+\|^2, +%\end{alignat} +%which implies that +%\begin{equation} +%(1-\delta) \| G_{\beta,\gamma}(u;y)\|^{2}\leq \frac{1}{\gamma} (F_{\beta}(u;y) - F_{\beta}(u^+;y)). +%\end{equation} +%In particular, by taking $\delta =1/2$, we obtain \eqref{e:deslem}. +\edita{Optimality conditions in Section \ref{sec:opt cnds} can also be expressed in terms of the gradient mapping. Indeed, it is straightforward to verify that $u^+$ is a first-order stationary point of Program \eqref{prob:01} if +\begin{align} +\begin{cases} +G_{\beta,\gamma}(u,y) = 0\\ +A(u^+) = b. +\end{cases} +\label{eq:opt grad map} +\end{align} +} +% +% +% +%\begin{lemma} +%\label{l:solv} +% Suppose that $\mathcal{L}_{\beta,\gamma}\in \mathbb{L}_d(\lambda_\beta)$. For $\gamma \in (0, 1/\lambda_\beta)$, $u^+=P_C(u-\gamma \nabla \mathcal{L}_\beta(u,y))$ is a first-order stationary point of Program (\ref{prob:01}) if +% +% $\nabla g_{\beta}(\cdot, y)$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$ and +%$\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$. Suppose that $Lu^+=b$ and $\| G_{\beta,\gamma}(u;y)\|^{2} =0$. +%Then $u^+$ is a stationary point, i.e., the first order optimality condition \eqref{e:inclu1}, for $ \overline{u} =u^+$, is satisfied. +%\end{lemma} +%\begin{proof} +%Since $u^+ = P_{C}(u-\gamma \nabla F_{\beta}(u;y))$. Then +% it follows that +%\begin{equation} +%\label{e:ver1} +%G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) \in N_C(u^+). +%\end{equation} +%Adding $\nabla F_{\beta}(u^+;y)$ to both sides of \eqref{e:ver1}, we obtain +%\begin{equation} +%\label{e:ver2} +%G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) +\nabla F_{\beta}(u^+;y) \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y). +%\end{equation} +%Using the Lipschitzian gradient of $F_{\beta}(\cdot, y)$ and $\gamma (\mathbb{L}_\beta +\mathbb{L}_{h}) \leq 1$, we see that +%\begin{alignat}{2} +%&\|G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y) \| \notag\\ +%&\quad\leq \|G_{\gamma,\beta}(u;y)\|+\|\nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y) \|\notag\\ +%&\quad\leq \|G_{\gamma,\beta}(u;y)\|+( \mathbb{L}_\beta +\mathbb{L}_{h})\|u-u^+\|\notag\\ +%&\quad =0, +%\end{alignat} +%which means that $G_{\gamma,\beta}(u;y) - \nabla F_{\beta}(u;y) - \nabla F_{\beta}(u^+;y)=0$. Hence, we derive from +%\eqref{e:ver2} that +%\begin{equation} +%\label{conkhi} +%0 \in\partial f(u^+)+ \nabla F_{\beta}(u^+;y). +%\end{equation} +%By definition of $F_{\beta}(u^+;y)$ and $Lu^+=b$, we have +%\begin{alignat}{2} +%\nabla F_{\beta}(u^+;y) &= \nabla h(u^+) + \nabla L(u^+)^*y + \frac{1}{\beta} \nabla L(u^+)^*(Lu^+-b) \notag\\ +%&= \nabla h(u^+) + \nabla L(u^+)^*y, +%\end{alignat} +%which together with \eqref{conkhi}, shows that \eqref{e:inclu1} is satisfied. +%\end{proof} +\paragraph{\textbf{\emph{\edita{Sufficient Optimality Conditions.}}}} +Sufficient optimality conditions for Program \eqref{prob:01} are also well understood in the litterature \cite{luenberger1984linear,rockafellar2009variational,mordukhovich2006variational,gfrerer2015complete}. 
+Indeed, $u$ is a local minimizer of Program \eqref{prob:01} if it satisfies \eqref{e:inclu1} for some $y$ and, in addition,
+\begin{align}
+\begin{cases}
+v^\top \left(
+\nabla_{uu} h(u) + \sum_{i=1}^m y_i \nabla_{uu} A_i(u)
+\right) v > 0,
+\qquad
+\forall v \in T_C(u)\setminus\{0\},\\
+A(u) = b.
+\end{cases}
+\label{eq:suff cnd}
+\end{align}
+\notea{Why does the above look different from the sufficient conditions for the Lagrangian? The two formulations are supposed to be equivalent.}
+
+%For simple, let us assume that $C = \menge{u}{g(u)\leq c}$ for some $\mathcal{C}^2$-function $g\colon\RR^d\to\RR$. In this section we assume that $h$ and $L$ are too $\mathcal{C}^2$-functions.
+%Let $\overline{u}$ be a point such that $g(\overline{u})=c$ and $0\not\in\nabla g(\overline{u})$. Then, \cite[Proposition 10.3]{rockafellar2009variational} implies that
+%\begin{equation}
+%N_{C}(\overline{u}) =\menge{\mu \nabla g(\overline{u})}{\mu \geq 0}.
+%\end{equation}
+%Now suppose that the first oder optimality condition \eqref{e:inclu1} is satisfied for $\overline{u}$. Then there exists $\mu \geq 0$ and $y \in\RR^m$ such that
+%\begin{equation}\label{e:inclu1}
+% \nabla L(\overline{u})^*y + \nabla h(\overline{u}) +\mu \nabla g(\overline{u})=0.
+% \end{equation}
+% Therefore, inview of \cite[Chapter 11]{luenberger1984linear}, $\overline{u}$ is a local minimum provided that
+% the Hessian matrix
+% \begin{equation}
+% H(\overline{u}) = \nabla^2h(\overline{u}) + y^T\nabla^2 L(\overline{u}) + \mu \nabla^2 g(\overline{u})
+% \end{equation}
+% is positive definite on the space
+%\begin{equation}
+%E= \menge{y}{\nabla L(\overline{u})y =0, \nabla g(\overline{u})y=0}.
+%\end{equation}
+%In our examples, this condition is checkable where $g(u) = \frac{1}{2}\|u\|^2$, $h$ is quadratic and $Lu = Muu^\top$.
\section{Algorithm $\&$ Convergence}
\subsection{Algorithm}
We propose the following method for solving Program \eqref{prob:01}. The main idea is to take one projected gradient step on $u$ to obtain $u^+$ and to update the penalty parameter $\beta^+$ in such a way that the feasibility gap $\frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2$ decreases faster than the gradient mapping, up to a noise level $\omega$:
\begin{equation}
\frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2\leq \frac{1}{8} \|G_{\beta,\gamma}(u,y) \|^2 + \frac{\omega}{\gamma}.
\end{equation}
We then update the corresponding multiplier $y$ as in classical ADMM:
\begin{equation}
y^+ = y+\frac{1}{\sigma}(Lu^+-b).
\end{equation}
The formal algorithm is presented as follows.
\begin{algorithm}
\label{Algo:2}
Input: $\beta_0> 0$, $c > 0$, $\alpha \in \left[0,1\right[$, $u_{0}\in \mathcal{C}$, $y_0 =0$, $\epsilon_1\in \left]0,1\right[$. Given $\beta_k$, choose $\gamma_{k} \leq \frac{1-\epsilon_1}{\mathbb{L}_{h}+\mathbb{L}_{\beta_{k}}}.$ Iterate \\
For k=0,1,\ldots\\
1. Projected gradient: $ u_{k+1} = P_{\mathcal{C} }(u_{k} - \gamma_{k} \nabla F_{\beta_{k}}(u_{k},y_{k})).$\\
2. Line search step\\
\quad $s=0, d_{k,0}=2$, $\beta_{k+1,0}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}.$\\
While $\beta_{k+1,s} \geq c/(k+1)^{\alpha}$ do
\begin{alignat}{2}\label{e:mc}
% \beta_{k+1}\geq \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}.
d_{k,s+1} &= 2\, d_{k,s} \\
\beta_{k+1,s+1}&= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s+1}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}\\
s&\leftarrow s+1.
\end{alignat}
Endwhile\\
3. Update $\beta_{k+1} = \beta_{k+1,s}$.\\
4. Choose $\sigma_{k+1} \geq 2\beta_{k}$ and update $y_{k+1} = y_{k} + \frac{1}{\sigma_{k+1}} (Lu_{k+1}-b)$.\\
\end{algorithm}
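For concreteness, the following is a minimal numerical sketch of Algorithm \ref{Algo:2}, specialized to a linear constraint map $L$ (given as a matrix) and a fixed step size; the names \texttt{algo2}, \texttt{grad\_h} and \texttt{proj\_C} are ours and purely illustrative, and the fixed \texttt{gamma} stands in for the choice $\gamma_{k} \leq \frac{1-\epsilon_1}{\mathbb{L}_{h}+\mathbb{L}_{\beta_{k}}}$ above.
\begin{verbatim}
import numpy as np

def algo2(grad_h, L, b, proj_C, u0, gamma=1e-2, beta0=1.0,
          c=100.0, alpha=0.5, eps1=0.5, iters=500):
    """Sketch of Algorithm 2: projected gradient on the augmented
    Lagrangian, adaptive penalty beta, and a dual ascent step."""
    u, y, beta = u0.astype(float), np.zeros(L.shape[0]), beta0
    for k in range(iters):
        # gradient of the augmented Lagrangian (F_beta in the text) in u
        grad = grad_h(u) + L.T @ y + L.T @ (L @ u - b) / beta
        u_next = proj_C(u - gamma * grad)
        G = (u - u_next) / gamma              # gradient mapping G_{beta,gamma}
        r = L @ u_next - b                    # feasibility residual
        # step 2: keep doubling d (shrinking the candidate penalty)
        # until beta_{k+1} < c / (k+1)^alpha
        d = 2.0
        denom = gamma / 8 * (G @ G) + d / (1 + k) ** (1 + eps1)
        beta_next = 0.5 * (r @ r) / denom
        while beta_next >= c / (k + 1) ** alpha:
            d *= 2.0
            denom = gamma / 8 * (G @ G) + d / (1 + k) ** (1 + eps1)
            beta_next = 0.5 * (r @ r) / denom
        # steps 3-4: accept the penalty, take a dual step with sigma >= 2 beta_k
        sigma = 2.0 * beta
        y = y + r / sigma
        u, beta = u_next, max(beta_next, 1e-12)  # floor avoids division by zero
    return u, y
\end{verbatim}
For instance, taking \texttt{proj\_C} to be the projection onto a Euclidean ball and \texttt{grad\_h} the gradient of a smooth quadratic yields a self-contained toy instance on which the behaviour of $\beta_k$ and of the feasibility gap can be inspected.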
\begin{remark}
The updating rule of $(\beta_k)_{k\in\NN}$ in \eqref{e:mc} plays a central role in our analysis. Intuitively, if $u_{k+1}$ is a solution, then $Lu_{k+1}=b$ and \eqref{e:mc} is trivially satisfied for any $\beta_{k+1}\geq 0$. Hence $\beta_{k+1}$ forces $u_{k+1}$ to remain close to $\menge{u}{Lu=b}$.
\end{remark}
\begin{remark}When $\sigma_k\equiv \infty$, we get $y_k\equiv 0$ and hence step 4 disappears. If we choose $\sigma_k = c(k+1)^{\alpha_1}\|Lu_k-b\|$, where $c,\alpha_1$ are chosen such that $\sigma_{k} > 2\beta_{k-1}$, then
\begin{equation}
\|y_{k+1}\| \leq \|y_k\| + \frac{\|Lu_{k+1}-b\|}{\sigma_{k+1}} = \|y_k\| + \frac{1}{c(k+2)^{\alpha_1}}.
\end{equation}
Since $\sum_{k\in\NN} \frac{1}{c(k+2)^{\alpha_1}} <+\infty$ whenever $\alpha_1>1$, $(\|y_k\|)_{k\in\NN}$ converges and is hence bounded. Therefore,
\begin{equation}
b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) \geq \inf_{k} h(u_{k}),
\end{equation}
which implies that $b_0>-\infty$ whenever $(u_k)_{k\in\NN}$ or $C$ is bounded.
\end{remark}
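Before turning to the convergence analysis, we note (this reformulation is ours, but it follows directly from step 2 of Algorithm \ref{Algo:2}) that, whenever $Lu_{k+1}\neq b$, the accepted penalty parameter satisfies
\begin{equation*}
\frac{1}{2\beta_{k+1}}\|Lu_{k+1}-b\|^2 \leq \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_{k,s_k}}{{(1+k)}^{1+\epsilon_1}},
\end{equation*}
where $s_k$ is the index at which the while-loop stops; this is, up to an index shift, the form in which \eqref{e:mc} is invoked in the proof of Theorem \ref{t:1} below.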
\subsection{Convergence}
In view of \eqref{eq:opt grad map}, we need to estimate the gradient mapping $\|G_{\beta_{k},\gamma_{k}}(u_k,y_{k})\|$ as well as the feasibility gap $\|Lu_{k+1}-b\|^2$.
\begin{theorem}
\label{t:1} Suppose that $b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},y_{k}) > -\infty$ and that $z_0=\sum_{k=1}^\infty \frac{d_{k,s_k}}{(1+k)^{1+\epsilon_1}} <+\infty$, where $s_k$ is the smallest index such that $\beta_{k,s_k} < c/(k+1)^{\alpha}$. Then the following hold.
\begin{equation}\label{e:mapp1}
\sum_{k=1}^\infty \gamma_{k} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 \leq 4(\mathcal{L}_{\beta_0}(u_{1},y_0) + z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0)\|^2),
\end{equation}
and
\begin{equation}\label{e:feas1}
\sum_{k=1}^\infty \frac{1}{\beta_{k+1}} \|Lu_{k+1}-b \|^2 \leq (\mathcal{L}_{\beta_0}(u_{1},y_0) + 3z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0)\|^2).
\end{equation}
\end{theorem}
\begin{proof}
Set $e_{k+1}= \frac{d_{k,s_k}}{(1+k)^{1+\epsilon_1}}$. Then $z_0= \sum_{k\in\NN}e_k <+\infty$.
It follows from Lemma \ref{lem:11} that
\begin{alignat}{2}
G_k= \frac{\gamma_{k}}{2}\|G_{\beta_k,\gamma_k}(u_k,y_k) \|^2 &\leq \mathcal{L}_{\beta_k}(u_k,y_k) - \mathcal{L}_{\beta_{k}}(u_{k+1},y_k)\notag\\
&= h(u_k) - h(u_{k+1}) + g_{\beta_k}(u_k,y_k) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\
&=h(u_k) - h(u_{k+1}) + g_{\beta_{k-1}}(u_k,y_{k-1}) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\
&\quad + g_{\beta_k}(u_k,y_k)- g_{\beta_{k-1}}(u_k,y_{k-1}),
\label{e:sa1}
\end{alignat}
where we set $g_{\beta}(u,y) = \scal{Lu-b}{y} +\frac{1}{2\beta}\|Lu-b\|^2$.
Let us estimate the last term in \eqref{e:sa1}. We have
\begin{alignat}{2}
\omega_{1,k}= g_{\beta_k}(u_k,y_k)- g_{\beta_{k-1}}(u_k,y_{k-1})
= \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\scal{Lu_k-b}{y_k-y_{k-1}}.
\end{alignat}
Using $y_{k} = y_{k-1} + \frac{1}{\sigma_{k}} (Lu_{k}-b)$, we get
\begin{alignat}{2}
&\quad \omega_{1,k} = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{1}{\sigma_k} \|Lu_k-b\|^2.
%&\leq \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{\gamma_{k-1}}{8} \|G_{\beta_{k-1},\gamma_{k-1}}(u_{k-1},y_{k-1}) \|^2 + \frac{d_{k-1}}{k^\alpha}\notag\\
%&\leq \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\frac{1}{4} G_k+ \frac{1}{4} G_{k-1} + \frac{d_{k-1}}{k^\alpha}.
\end{alignat}
Let us estimate the first term in \eqref{e:sa1}. Set $T_k= h(u_k) +\scal{Lu_k-b}{y_{k-1}} $. Then
\begin{alignat}{2}
\omega_{2,k}&= h(u_k) - h(u_{k+1}) + g_{\beta_{k-1}}(u_k,y_{k-1}) -g_{\beta_{k}}(u_{k+1},y_k)\notag\\
& = T_k -T_{k+1} + \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2.
\end{alignat}
Therefore, we derive from \eqref{e:sa1} that
\begin{alignat}{2}
G_k &\leq \omega_{1,k} +\omega_{2,k}\notag\\
&= T_k -T_{k+1} + \frac{1}{2\beta_k} \big(\|Lu_k-b\|^2 -\|Lu_{k+1}-b\|^2\big)+\frac{1}{\sigma_k} \|Lu_k-b\|^2.\label{e:sa2}
\end{alignat}
Now using the condition \eqref{e:mc}, we obtain
\begin{equation}
\frac{1}{2\beta_k} \|Lu_k-b\|^2 \leq \frac{1}{4} G_{k-1} + e_k\leq \frac{1}{4}G_k+ \frac{1}{4} G_{k-1} + e_k.
\end{equation}
Therefore, it follows from \eqref{e:sa2} that
\begin{alignat}{2}
\frac12 G_k &\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{\sigma_k} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k\notag\\
&\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k.
\label{e:sa3}
\end{alignat}
For every $N\in\NN$, $N\geq 1$, summing \eqref{e:sa3} from $k=1$ to $k=N$, we obtain
\begin{alignat}{2}
\sum_{k=1}^{N}\frac12 G_k \leq T_1 +\frac{1}{4} G_0 + \frac{1}{\beta_{0}} \|Lu_1-b\|^2 - T_{N+1} - \frac{1}{4}G_{N} - \frac{1}{2\beta_N} \|Lu_{N+1}-b\|^2 + z_0.
\end{alignat}
Note that, by the definition of $T_{N+1}$, we have
\begin{alignat}{2}
-T_{N+1} - \frac{1}{2\beta_N} \|Lu_{N+1}-b\|^2& = - \mathcal{L}_{\beta_N}(u_{N+1},y_N) \leq -b_0.
\end{alignat}
Hence,
\begin{equation}
\sum_{k=1}^{N}\frac12 G_k \leq \mathcal{L}_{\beta_0}(u_{1},y_0) + z_0-b_0 +\frac{1}{4} G_0,
\end{equation}
which proves \eqref{e:mapp1}. Moreover, \eqref{e:feas1} follows directly from \eqref{e:mc}.
\end{proof}
\begin{corollary} Under the same conditions as in Theorem \ref{t:1}, suppose in addition that $\gamma_k = \mathcal{O}(\beta_k)$. Then
\begin{equation}\label{e:mapp2}
\min_{1\leq k\leq N} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 = \mathcal{O}(1/N^{1-\alpha}) \to 0,
\end{equation}
and
\begin{equation}\label{e:feas2}
\min_{1\leq k\leq N} \|Lu_{k+1}-b \|^2 = \mathcal{O}(1/N) \to 0.
\end{equation}
\end{corollary}
\begin{proof}
We see that \eqref{e:mapp2} and \eqref{e:feas2} follow directly from \eqref{e:mapp1} and \eqref{e:feas1}, respectively, upon bounding each minimum over $1\leq k\leq N$ by the corresponding weighted average, with weights $\gamma_k$ and $\beta_{k+1}^{-1}$.
\end{proof}
\begin{corollary}
\label{c:2} Under the same conditions as in Theorem \ref{t:1}, the sequence $(F_{\beta_k}(u_k,y_{k}))_{k\in\NN}$ converges to some $F^\star$. Moreover, if $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$ and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$ are bounded by $M$, then $(h(u_k))_{k\in\NN}$ converges to $F^\star$.
\end{corollary}
\begin{proof} Note that $(F_{\beta_k}(u_{k+1},y_k))_{k\in\NN}$ is bounded below. Moreover, the proof of Theorem \ref{t:1} shows that
\begin{equation}
F_{\beta_k}(u_{k+1},y_k) + \frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2 \leq F_{\beta_{k-1}}(u_{k},y_{k-1}) + \frac{\gamma_{k-1}}{8}\|\mathcal{G}_{\beta_{k-1},\gamma_{k-1}}(u_{k-1})\|^2 +e_k.
\end{equation}
Hence $(F_{\beta_k}(u_{k+1},y_k) + \frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2)_{k\in\NN}$ converges to a finite value $F^\star$. Since $\frac{\gamma_k}{8}\|\mathcal{G}_{\beta_k,\gamma_k}(u_k)\|^2\to 0$, we get $F_{\beta_k}(u_{k+1},y_k)\to F^\star$.
Since $\frac{1}{2\beta_k}\|Lu_{k}-b\|^2\to 0$ and $(\beta_{k+1}/\beta_{k})_{k\in\NN}$ is bounded by $M$, we obtain
\begin{equation}
\frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 =\frac{\beta_{k+1}}{2 \beta_{k+1}\beta_{k}}\|Lu_{k+1}-b\|^2 \leq \frac{M}{2\beta_{k+1}} \|L u_{k+1} -b\|^2 \to 0.
\end{equation}
Moreover, since $(\|y_{k-1}\| \sqrt{\beta_k})_{k\in\NN}$ is bounded by $M$, we also have
\begin{equation}
|\scal{Lu_k-b}{y_{k-1}}| = |\frac{1}{\sqrt{\beta_k}}\scal{Lu_k-b}{\sqrt{\beta_k}y_{k-1}}| \leq \frac{M}{\sqrt{\beta_k}}\|Lu_{k}-b\| \to 0.
\end{equation}
Therefore,
\begin{alignat}{2}
|h(u_k) - F^\star| &\leq |F_{\beta_k}(u_k,y_{k-1})-F^\star| + |\scal{Lu_k-b}{y_{k-1}}| + \frac{1}{2\beta_k}\|Lu_k-b\|^2\notag\\
&\to 0,
\end{alignat}
which proves the desired result.
\end{proof}
\subsection{Local convergence}
Let $\overline{u}$ and $\overline{y}$ satisfy the first order optimality condition
\begin{equation}
\label{e:fr1}
-\nabla L(\overline{u})^*\overline{y} - \nabla h(\overline{u}) \in N_{C}(\overline{u}).
\end{equation}
For simplicity, let us recall that $\nabla F_{\beta}\colon (u,y) \mapsto \nabla h(u) + \nabla L(u)^*y + \frac{1}{\beta}\nabla L(u)^*(Lu-b)$.
\begin{theorem} Suppose that the conditions of Theorem \ref{t:1} hold and that $\gamma_k \geq \underline{\gamma} > 0$. Suppose moreover that every $\overline{u}$ satisfying \eqref{e:fr1} is a local minimum of $h$, and that $C$ and $(y_k)_{k\in\NN}$ are bounded. Then $(h(u_k))_{k\in\NN}$ converges to the local optimum value $h(\overline{u})$.
\end{theorem}
\begin{proof} Since $C$ is bounded, $(u_k)_{k\in\NN}$ is bounded. Therefore, there exists a subsequence $(n_k)_{k\in\NN}$ of $\NN$
such that $u_{n_k}\to u^*$ and $y_{n_k}\to y^*$. It follows from Theorem \ref{t:1} that $\gamma_k\|(u_{k+1}-u_k)/\gamma_k\|^2 \to 0$. Since $(\gamma_k)_{k\in\NN}$ is bounded below by $\underline{\gamma}$, we obtain
$G_{\beta_k,\gamma_k}(u_k)\to 0$. Hence, $u_{n_k+1}\to u^*$.
Now, using the update rule of $u_{k}$, we have $G_{\beta_k,\gamma_k}(u_k) -\nabla F_{\beta_k}(u_k,y_k) \in N_{C}(u_{k+1})$. Hence,
\begin{equation}\label{e:aa2s}
(\forall u\in C)(\forall k\in\NN)\; \scal{-G_{\beta_k,\gamma_k}(u_k)+\nabla F_{\beta_{k}}(u_k,y_k) }{u-u_{k+1}}\geq 0.
\end{equation}
Since $C$ is bounded and $\nabla L$ is continuous, we obtain $\sup_{u\in C}\|\nabla L(u)\| <+\infty$, and hence
\begin{equation}
\|\frac{1}{\beta_{n_k}}\nabla L(u_{n_k})^*(Lu_{n_k}-b)\| \leq \frac{1}{c}(\sup_{u\in C}\|\nabla L(u)\| )\|Lu_{n_k}-b\|\to 0.
\end{equation}
We also have
\begin{equation}
\nabla L(u_{n_k}) \to \nabla L(u^*) \; \text{and}\; \nabla h(u_{n_k}) \to \nabla h(u^*).
\end{equation}
Since $y_{n_k}\to y^*$, we get
\begin{alignat}{2}
& \|\nabla L(u_{n_k})^*y_{n_k}- \nabla L(u^*)^*y^* \| \notag\\
&\leq \|\nabla L(u_{n_k})^*y_{n_k}- \nabla L(u_{n_k})^*y^* \| +\|\nabla L(u_{n_k})^*y^*- \nabla L(u^*)^*y^* \|\notag\\
&\leq \|\nabla L(u_{n_k})\|\|y_{n_k}- y^* \|+\|y^*\| \|\nabla L(u_{n_k})- \nabla L(u^*) \|\notag\\
&\to 0.
\end{alignat}
Consequently, $\nabla F_{\beta_{n_k}}(u_{n_k}, y_{n_k}) \to \nabla h(u^*) +\nabla L(u^*)^*y^*$. Note that $$G_{\beta_{n_k},\gamma_{n_k}}(u_{n_k})\to 0.$$ Now, passing to the limit along this subsequence in \eqref{e:aa2s}, we obtain
\begin{equation}
(\forall u\in C)\quad \scal{\nabla h(u^*) +\nabla L(u^*)^*y^* }{u-u^*}\geq 0,
\end{equation}
which is \eqref{e:fr1} for $\overline{u} = u^*$ and $\overline{y} = y^*$. By assumption, $u^*$ is a local minimum and $h(u_{n_k}) \to h(u^*)$. Therefore, $F^\star = h(u^*)$. Using Corollary \ref{c:2}, we get $h(u_k) \to h(\overline{u})$.
\end{proof}
\section{Related Work \label{sec:related work}}
To the best of our knowledge, the proposed method is new and differs from existing methods in the literature. As mentioned in the Introduction, there is a connection to the augmented Lagrangian method. Our method is nevertheless significantly different: we perform only one projected gradient step on the primal variable $u$ instead of minimizing the augmented Lagrangian function. Furthermore, we update the penalty parameter $\beta$ adaptively to make sure that the feasibility gap decreases faster than the gradient mapping.
In the case $h=0$, a modification of Chambolle-Pock's method is investigated in \cite{Valkonen14}, and a preconditioned ADMM in \cite{Matin17}, where the convergence of the iterates is proved under strong assumptions that are not fulfilled in our setting.
%\noindent{\bf Connection to Linearized Alternating Direction Method}.\\
ADMM is the classic method for solving Program \eqref{prob:01} in the case where $L$ is a linear operator and $h$ is zero \cite{gabay1976dual}. This method is an application of the Douglas-Rachford method to the dual problem \cite{Gabay83}. One of the main drawbacks of ADMM is the appearance of the term $Lu$ in the update rule of $u_{k+1}$. To overcome this issue, several strategies have been suggested. The first, proposed in \cite{shefi2014rate} and refined in \cite{banert2016fixing}, is known as the alternating direction proximal method of multipliers. The second is to use a linearization technique \cite{lin2011linearized}. We show here that the update rule of our proposed method is closely related to that of the linearized alternating direction method \cite{lin2011linearized}. Assume that $h\equiv 0$ and $L$ is a linear operator. Then the proposed method can be rewritten as
\begin{equation}
\begin{cases}
u_{k+1}= \arg\min_{u\in C} \frac{1}{2\gamma_k} \| u-u_k + \gamma_kL^*\bigg( \lambda_k + \frac{1}{\beta_k}\big(Lu_k-b\big)\bigg)\|^2\\
\beta_{k+1}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,y_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}\\
\text{Choose $\sigma_{k+1}\geq 2\beta_k$ and}\; \lambda_{k+1} = \lambda_k +\frac{1}{\sigma_{k+1}}(Lu_{k+1}-b),
\end{cases}
\end{equation}
which is a variant of linearized ADMM \cite{lin2011linearized}.
%\noindent
%{\bf Connection to ALBUM3 in \cite{bolte2018nonconvex}}\\
Very recently, \cite{bolte2018nonconvex} proposed a framework for solving Program \eqref{prob:01} with $C=\RR^d$. In particular, a special case of this framework, ALBUM3 (Proximal Linearized Alternating Minimization), is closely related to our method, but its conditions are checkable only when $L$ is linear. Moreover, the update of $\beta_{k}$ in \cite{bolte2018nonconvex} depends on the smallest eigenvalue of $L^*L$. For nonlinear $L$, the application of their method remains a challenge.
%\noindent{\bf Connection to the deflected subgradient method}\\
The deflected subgradient method investigated in \cite{burachik2010deflected} can be used to solve a special case of Program \eqref{prob:01} for a compact subset $\mathcal{C}$ of $\mathcal{X}$. The basic step of the deflected subgradient method is to solve, given $\beta$ and $v$,
\begin{equation}
u^*\in \arg\min_{u\in C} h(u) + \beta \boldsymbol{\sigma}(Lu-b) - \scal{Kv}{Lu-b},
\end{equation}
where $\boldsymbol{\sigma}$ is a continuous penalty function such as $\|\cdot\|$, and $K$ is a bounded linear operator.
In general, there is no closed-form expression for $u^*$, since the subproblem does not split $f$, $h$, and $L$ individually. Hence, the deflected subgradient method is hard to implement. This is also a common drawback of the classic penalty method and related works \cite{gasimov2002augmented,burachik2010primal}.
\section{Numerical experiments}
\subsection{Hanging chain}
%\begin{thebibliography}{}
%
% and use \bibitem to create references. Consult the Instructions
% for authors for reference list style.
\bibliographystyle{abbrv}
\bibliography{references_alp.bib,bang.bib,ctuy16-small-bib.bib,JS_References.bib,lions.bib,references_optimal_sampling,references_yp,tythc16-small-bib,yhli.bib,bibliograpply,ctuy16-small-bib,bang1.bib,bang.bib}
+
+\appendix
+
+\section{Proof of Lemma \ref{lem:eval Lipsc cte} \label{sec:proof of eval Lipsc cte}}
+
+By definition of $u^+_{\gamma}$, we have that
+\begin{equation}
+u^+_{\gamma} - u +\gamma \nabla \mathcal{L}_\beta(u,y) \in -N_C(u^+_{\gamma}).
+\label{eq:optimality of uplus}
+\end{equation}
+On the other hand, $\gamma$ by definition satisfies
+\begin{align}
+& \mathcal{L}_{\beta}(u^+_{\gamma},y) \nonumber\\
+ & \le \mathcal{L}_\beta(u,y) + \left\langle
+u^+_{\gamma} - u | \nabla \mathcal{L}_\beta (u,y)
+\right\rangle + \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\
+& = \mathcal{L}_\beta(u,y) + \frac{1}{\gamma} \left\langle
+u^+_{\gamma} - u |u^+_\gamma - u+ \gamma \nabla \mathcal{L}_\beta (u,y)
+\right\rangle
+- \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2 \nonumber\\
+& \le \mathcal{L}_\beta(u,y)
+- \frac{1}{2\gamma}\|u^+_{\gamma} - u\|^2
+\qquad \text{(see \eqref{eq:optimality of uplus})} \nonumber\\
+& = \mathcal{L}_\beta(u,y) - \frac{\gamma}{2} \|G_{\beta,\gamma}(u,y)\|^2,
+\qquad \text{(see Definition \ref{def:grad map})}
+\end{align}
+which completes the proof of Lemma \ref{lem:eval Lipsc cte}.
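+
+\edita{We also record an elementary consequence of the descent lemma (this observation is ours; it assumes $\mathcal{L}_\beta(\cdot,y)\in\mathbb{L}_{d}(\lambda_\beta)$, as in Lemma \ref{lem:11}): every candidate step size $\gamma'\le 1/\lambda_\beta$ passes the acceptance test of Lemma \ref{lem:eval Lipsc cte}, since
+\begin{align*}
+\mathcal{L}_\beta(u^+_{\gamma'},y)
+&\le \mathcal{L}_\beta(u,y) + \left\langle u^+_{\gamma'}-u | \nabla\mathcal{L}_\beta(u,y)\right\rangle + \frac{\lambda_\beta}{2}\|u^+_{\gamma'}-u\|^2 \\
+&\le \mathcal{L}_\beta(u,y) + \left\langle u^+_{\gamma'}-u | \nabla\mathcal{L}_\beta(u,y)\right\rangle + \frac{1}{2\gamma'}\|u^+_{\gamma'}-u\|^2.
+\end{align*}
+Consequently, the backtracking in Lemma \ref{lem:eval Lipsc cte} terminates after finitely many trials and returns a step size $\gamma\ge\min({\gamma}_0,\rho/\lambda_\beta)$.}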
+
+
+
\end{document}
% end of file template.tex
Set $X = \menge{u}{Lu=b} \cap C$. Suppose that there exist a neighborhood $B(\overline{u};\epsilon)$ of $\overline{u}$ and a neighborhood $B(\overline{y};\epsilon)$ of $\overline{y}$, positive numbers $\alpha_1,\alpha_2$, and non-negative numbers $\rho_1,\rho_2$ such that
$ (\forall u \in X \cap B(\overline{u};\epsilon))(\forall y \in B(\overline{y};\epsilon))$,
\begin{equation}\label{e:maic}
\scal{\nabla F_{\beta}(u,y)- \nabla F_{\beta}(\overline{u},\overline{y}) }{u-\overline{u}} \geq \rho_1\|u-\overline{u}\|^{\alpha_1} +\rho_2\| y- \overline{y}\|^{\alpha_2}.
\end{equation}
\begin{theorem} Suppose that $\gamma_k \geq \underline{\gamma} > 0$ and that all the conditions in Theorem \ref{t:1} are satisfied. Furthermore, suppose that, for some $k_0$, $(u_{k})_{k\geq k_0} \subset B(\overline{u};\epsilon)$ and $(y_{k})_{k\geq k_0} \subset B(\overline{y};\epsilon)$, and that \eqref{e:maic} is satisfied for every $\beta\in (\beta_k)_{k\geq k_0}$. If $\rho_1 > 0$ or $\rho_2 > 0$, then $u_{k}\to \overline{u}$ or $y_k \to \overline{y}$, respectively.
\end{theorem}
\begin{proof} Without loss of generality, $k_0=0$.
It follows from Theorem \ref{t:1} that $\gamma_k\|(u_{k+1}-u_k)/\gamma_k\|^2 \to 0$. Since $(\gamma_k)_{k\in\NN}$ is bounded below by $\underline{\gamma}$, we obtain
$G_{\beta_k,\gamma_k}(u_k)\to 0$. Since $\beta_{k+1}^{-1}\|Lu_{k+1}-b\|^2 \to 0$ and $(\beta_{k})_{k\in\NN}$ is bounded above by $\overline{\beta}=c$, we get $\|Lu_{k+1}-b\|^2\to 0$. Since $L\overline{u} =b$, we can rewrite \eqref{e:fr1} as
$-\nabla F_{\beta}(\overline{u},\overline{y}) \in N_{C}(\overline{u})$.
Hence
\begin{equation}\label{e:aa1s}
(\forall u\in C)(\forall k\in\NN)\; \scal{\nabla F_{\beta_{k}}(\overline{u},\overline{y}) }{u-\overline{u}}\geq 0.
\end{equation}
Now, using the update rule of $u_{k}$, we have $G_{\beta_k,\gamma_k}(u_k) -\nabla F_{\beta_k}(u_k,y_k) \in N_{C}(u_{k+1})$. Hence,
\begin{equation}\label{e:aa2s}
(\forall u\in C)(\forall k\in\NN)\; \scal{-G_{\beta_k,\gamma_k}(u_k)+\nabla F_{\beta_{k}}(u_k,y_k) }{u-u_{k+1}}\geq 0.
\end{equation}
Now, we deduce from \eqref{e:aa1s} and \eqref{e:aa2s} that
\begin{alignat}{2}
\label{e:las1}
\scal{-G_{\beta_k,\gamma_k}(u_k)}{\overline{u}-u_{k+1}} + \scal{\nabla F_{\beta_{k}}(u_k,y_k) -\nabla F_{\beta_{k}}(\overline{u},\overline{y})}{\overline{u}-u_{k+1}}\geq 0.
\end{alignat}
Using the condition \eqref{e:maic}, we derive from \eqref{e:las1} that
\begin{alignat}{2}
\rho_1\|u_{k+1}-\overline{u}\|^{\alpha_1} &+\rho_2 \| y_k- \overline{y}\|^{\alpha_2} \leq \scal{-G_{\beta_k,\gamma_k}(u_k)}{\overline{u}-u_{k+1}}\notag\\
&\quad +\scal{\nabla F_{\beta_{k}}(u_k,y_k) -\nabla F_{\beta_{k}}(\overline{u},\overline{y})}{u_{k}-u_{k+1}} \notag\\
& \leq (\epsilon+\gamma_k \|\nabla F_{\beta_{k}}(u_k,y_k) -\nabla F_{\beta_{k}}(\overline{u},\overline{y})\|) \| G_{\beta_k,\gamma_k}(u_k)\|\notag\\
&\leq (2\epsilon + \|\nabla F_{\beta_{k}}(\overline{u},y_k)-\nabla F_{\beta_{k}}(\overline{u},\overline{y}) \|)\| G_{\beta_k,\gamma_k}(u_k)\|\notag\\
&\leq (2\epsilon + \|\nabla L(\overline{u})\|\epsilon) \| G_{\beta_k,\gamma_k}(u_k)\|\notag\\
& \to 0.
\end{alignat}
Therefore, $u_{k}\to \overline{u}$ or $y_k\to \overline{y}$ provided that $\rho_1> 0$ or $\rho_2 >0$, respectively.
\end{proof}
+
+