%%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%%
%
% This is a general template file for the LaTeX package SVJour3
% for Springer journals. Springer Heidelberg 2010/09/16
%
% Copy it to a new file with a new name and use it as the basis
% for your article. Delete % signs as needed.
%
% This template includes a few options for different layouts and
% content for various journals. Please consult a previous issue of
% your journal as needed.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% First comes an example EPS file -- just ignore it and
% proceed on the \documentclass line
% your LaTeX will extract the file if required
\begin{filecontents*}{example.eps}
%!PS-Adobe-3.0 EPSF-3.0
%%BoundingBox: 19 19 221 221
%%CreationDate: Mon Sep 29 1997
%%Creator: programmed by hand (JK)
%%EndComments
gsave
newpath
  20 20 moveto
  20 220 lineto
  220 220 lineto
  220 20 lineto
closepath
2 setlinewidth
gsave
  .4 setgray fill
grestore
stroke
grestore
\end{filecontents*}
%
\RequirePackage{fix-cm}
%
%\documentclass{svjour3}                     % onecolumn (standard format)
%\documentclass[smallcondensed]{svjour3}     % onecolumn (ditto)
\documentclass[smallextended]{svjour3}       % onecolumn (second format)
%\documentclass[twocolumn]{svjour3}          % twocolumn
%
\smartqed  % flush right qed marks, e.g. at end of proof
%
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{algorithm}
%
% \usepackage{mathptmx}      % use Times fonts if available on your TeX system
% insert here the call for the packages your document requires
%\usepackage{latexsym}
% etc.
\PassOptionsToPackage{normalem}{ulem}
\newcommand{\minimize}[2]{\ensuremath{\underset{\substack{{#1}}}%
{\mathrm{minimize}}\;\;#2 }}
\newcommand{\Argmind}[2]{\ensuremath{\underset{\substack{{#1}}}%
{\mathrm{Argmin}}\;\;#2 }}
\newcommand{\Frac}[2]{\displaystyle{\frac{#1}{#2}}}
\newcommand{\menge}[2]{\big\{{#1} \;|\; {#2}\big\}}
\newcommand{\Pair}[2]{{\big\langle{{#1},{#2}}\big\rangle}}
\newcommand{\pair}[2]{{\langle{{#1},{#2}}\rangle}}
\newcommand{\Scal}[2]{{\bigg\langle{{#1}\:\bigg |~{#2}}\bigg\rangle}}
\newcommand{\Menge}[2]{\bigg\{{#1}~\bigg|~{#2}\bigg\}}
\newcommand{\lev}[1]{\ensuremath{\mathrm{lev}_{\leq #1}\:}}
\newcommand{\enve}[1]{\ensuremath{\operatorname{env}#1}}
\newcommand{\emp}{\ensuremath{{\varnothing}}}
\newcommand{\bdry}{\ensuremath{\operatorname{bdry}}}
\newcommand{\yosi}[2]{\ensuremath{\sideset{^{#2}}{}{\operatorname{}}\!\!#1}}
\newcommand{\infconv}{\ensuremath{\mbox{\small$\,\square\,$}}}
\newcommand{\scal}[2]{\left\langle{#1}\mid {#2} \right\rangle}
\newcommand{\pscal}[2]{\langle\langle{#1}\mid{#2}\rangle\rangle}
\newcommand{\norm}[1]{\|#1\|}
\newcommand{\vuo}{\ensuremath{\mbox{\footnotesize$\square$}}}
\newcommand{\exi}{\ensuremath{\exists\,}}
\newcommand{\zeroun}{\ensuremath{\left]0,1\right[}}
\newcommand{\HH}{\ensuremath{\mathcal H}}
\newcommand{\GG}{\ensuremath{\mathcal G}}
\newcommand{\YY}{\ensuremath{\mathcal Y}}
\newcommand{\XX}{\ensuremath{\mathcal X}}
\newcommand{\bary}{\ensuremath{\widetilde{\boldsymbol{y}}}}
\newcommand{\barp}{\ensuremath{\widetilde{\boldsymbol{p}}}}
\newcommand{\barq}{\ensuremath{\widetilde{\boldsymbol{q}}}}
\newcommand{\barx}{\ensuremath{\widetilde{\boldsymbol{x}}}}
\newcommand{\BL}{\ensuremath{\EuScript B}\,}
\newcommand{\BP}{\ensuremath{\EuScript P}}
\newcommand{\HHH}{\ensuremath{\boldsymbol{\mathcal H}}}
\newcommand{\WW}{\ensuremath{\boldsymbol{\mathcal W}}}
\newcommand{\KKK}{\ensuremath{\boldsymbol{\mathcal K}}} \newcommand{\GGG}{\ensuremath{\boldsymbol{\mathcal G}}} \newcommand{\VV}{\ensuremath{\boldsymbol{V}}} \newcommand{\CCC}{\ensuremath{\boldsymbol{C}}} \newcommand{\MM}{\ensuremath{\boldsymbol{M}}} \newcommand{\SG}{\ensuremath{\boldsymbol{S}}} \newcommand{\EE}{\ensuremath{\boldsymbol{E}}} \newcommand{\XP}{\ensuremath{\mathcal X}^*} \newcommand{\sri}{\ensuremath{\operatorname{sri}}} \newcommand{\ri}{\ensuremath{\operatorname{ri}\,}} \newcommand{\RR}{\ensuremath{\mathbb R}} \newcommand{\KK}{\ensuremath{\mathcal K}} \newcommand{\RP}{\ensuremath{\left[0,+\infty\right[}} \newcommand{\RPP}{\ensuremath{\,\left]0,+\infty\right[}} \newcommand{\RX}{\ensuremath{\,\left]-\infty,+\infty\right]}} \newcommand{\NN}{\ensuremath{\mathbb N}} \newcommand{\SL}{\ensuremath{\EuScript S}\,} \newcommand{\dom}{\ensuremath{\operatorname{dom}}} \newcommand{\cont}{\ensuremath{\operatorname{cont}}} %\newcommand{\gr}{\ensuremath{\operatorname{gra}}} \newcommand{\prox}{\ensuremath{\operatorname{prox}}} \newcommand{\Prox}{\ensuremath{\operatorname{Prox}}} \newcommand{\intdom}{\ensuremath{\operatorname{int}\operatorname{dom}}\,} \newcommand{\inte}{\ensuremath{\operatorname{int}}} \newcommand{\cart}{\ensuremath{\mbox{\huge{$\times$}}}} \newcommand{\scart}{\ensuremath{\mbox{\LARGE{$\times$}}}} \newcommand{\WC}{\ensuremath{{\mathfrak W}}} \newcommand{\TT}{\ensuremath{{\mathbf T}}} \newcommand{\SC}{\ensuremath{{\mathfrak S}}} \newcommand{\rh}{\ensuremath{{\mathrm a}}} \newcommand{\og}{\ensuremath{{\mathrm b}}} \newcommand{\rk}{\ensuremath{{\mathsf {\mathbf a}}}} \newcommand{\ck}{\ensuremath{{\mathsf {\mathbf u}}}} \newcommand{\xk}{\ensuremath{{\mathsf{\mathbf x}}}} \newcommand{\yk}{\ensuremath{{\mathsf{\mathbf y}}}} \newcommand{\RPX}{\ensuremath{{[0,\pinf]}}} \newcommand{\card}{\ensuremath{\operatorname{card}}} \newcommand{\bd}{\ensuremath{\operatorname{bdry}}} \newcommand{\Argmin}{\ensuremath{\operatorname{Argmin}}} \newcommand{\argmin}{\ensuremath{\operatorname{argmin}}} \newcommand{\ran}{\ensuremath{\operatorname{ran}}} \newcommand{\zer}{\ensuremath{\operatorname{zer}}} \newcommand{\gra}{\ensuremath{\operatorname{gra}}} \newcommand{\conv}{\ensuremath{\operatorname{conv}}} \newcommand{\vv}{\ensuremath{\boldsymbol{v}}} \newcommand{\sss}{\ensuremath{\boldsymbol{s}}} \newcommand{\xx}{\ensuremath{\boldsymbol{x}}} \newcommand{\xs}{\ensuremath{\textsf{x}}} \newcommand{\xo}{\ensuremath{\overline{\boldsymbol{x}}}} \newcommand{\pp}{\ensuremath{\boldsymbol{p}}} \newcommand{\qq}{\ensuremath{\boldsymbol{q}}} \newcommand{\yy}{\ensuremath{\boldsymbol{y}}} \newcommand{\ff}{\ensuremath{\boldsymbol{f}}} \newcommand{\hh}{\ensuremath{\boldsymbol{h}}} \newcommand{\ttt}{\ensuremath{\boldsymbol{t}}} \newcommand{\ee}{\ensuremath{\boldsymbol{e}}} \newcommand{\rr}{\ensuremath{\boldsymbol{r}}} \newcommand{\gggg}{\ensuremath{\boldsymbol{g}}} \newcommand{\zz}{\ensuremath{\boldsymbol{z}}} \newcommand{\bb}{\ensuremath{\boldsymbol{b}}} \newcommand{\uu}{\ensuremath{\boldsymbol{u}}} \newcommand{\cc}{\ensuremath{\boldsymbol{c}}} \newcommand{\dd}{\ensuremath{\boldsymbol{d}}} \newcommand{\aaa}{\ensuremath{\boldsymbol{a}}} \newcommand{\ww}{\ensuremath{\boldsymbol{w}}} \newcommand{\BB}{\ensuremath{\boldsymbol{B}}} \newcommand{\LL}{\ensuremath{\boldsymbol{L}}} \newcommand{\PPP}{\ensuremath{\boldsymbol{\mathsf{P}}}} \newcommand{\UU}{\ensuremath{\boldsymbol{U}}} \newcommand{\EEE}{\ensuremath{\boldsymbol{E}}} \newcommand{\E}{\ensuremath{\mathbf{E}}} \newcommand{\D}{\ensuremath{\mathbf{D}}} 
\newcommand{\ep}{\ensuremath{\boldsymbol{\varepsilon}}}
\newcommand{\RRR}{\ensuremath{\boldsymbol{R}}}
%\newcommand{\RRR}{\ensuremath{\boldsymbol{R}}}
\newcommand{\AAA}{\ensuremath{\boldsymbol{A}}}
\newcommand{\BBB}{\ensuremath{\boldsymbol{B}}}
\newcommand{\QQ}{\ensuremath{\boldsymbol{Q}}}
\newcommand{\SSS}{\ensuremath{\boldsymbol{S}}}
\newcommand{\DD}{\ensuremath{\boldsymbol{D}}}
\newcommand{\PP}{\ensuremath{\boldsymbol{P}}}
\newcommand{\FF}{\ensuremath{\boldsymbol{\mathcal{F}}}}
\newcommand{\cone}{\ensuremath{\operatorname{cone}}}
\newcommand{\Fix}{\ensuremath{\operatorname{Fix}}}
\newcommand{\Id}{\ensuremath{\operatorname{Id}}}
\newcommand{\diam}{\ensuremath{\operatorname{diam}}}
\newcommand{\IId}{\ensuremath{\boldsymbol{\operatorname{Id}}}}
\newcommand{\weakly}{\ensuremath{\rightharpoonup}}
\newcommand{\minf}{\ensuremath{-\infty}}
\newcommand{\pinf}{\ensuremath{+\infty}}
\newcommand{\LLambda}{\ensuremath{\boldsymbol{\Lambda}}}
\newcommand{\vva}{\ensuremath{\boldsymbol{\epsilon}}}
\newcommand{\trace}{\operatorname{tr}}
% please place your own definitions here and don't use \def but
% \newcommand{}{}
%
% Insert the name of "your journal" with
% \journalname{myjournal}
%
\begin{document}

\title{A relaxation of the augmented Lagrange method}
%\subtitle{Do you have a subtitle?\\ If so, write it here}

%\titlerunning{Short form of title}        % if too long for running head

\author{Bang C. Vu \and Ahmet Alacaoglu \and Mehmet Fatih Sahin \and Alp Yurtsever \and Volkan Cevher \\[5mm]
}
%\authorrunning{Short form of author list} % if too long for running head

\institute{Laboratory for Information and Inference Systems (LIONS), EPFL, Switzerland\\
\email{bang.vu@epfl.ch; ahmet.alacaoglu@epfl.ch; mehmet.sahin@epfl.ch; alp.yurtsever@epfl.ch; volkan.cevher@epfl.ch}
}

\date{Received: date / Accepted: date}
% The correct dates will be entered by the editor

\maketitle

\begin{abstract}
We propose a splitting method for solving ....

\keywords{Non-linear constraint \and Non-convex \and Smoothing \and Primal-dual}
% \PACS{PACS code1 \and PACS code2 \and more}
\subclass{47H05 \and 49M29 \and 49M27 \and 90C25}
\end{abstract}

\section{Introduction}
\label{intro}
Non-linear programming is a broad discipline in applied mathematics. In this paper, we focus on a numerical method for solving the following problem in $\RR^d$:
\begin{equation}\label{prob:01}
\underset{u\in C,\, Lu =b}{\text{minimize }} h(u),
\end{equation}
where $C\subset \RR^d$ is a non-empty closed convex set, $L\colon \RR^d\to\RR^m$ is a non-linear, continuously differentiable mapping, and $h\colon\RR^d\to \RR$ is a differentiable function with an $\mathbb{L}_h$-Lipschitzian gradient. The problem is generally non-convex and serves as one of the standard non-linear programming problems, where the set $C$ is modeled by inequality constraints of the form $C = \menge{u}{g(u) \leq 0}$ for some $g\colon\RR^d\to \RR^s$. There are many examples of non-linear programming that come from industrial operations and business decision making, where the non-linearities arise naturally. Let us stress an important general example as a special instance of \eqref{prob:01}.

\begin{example}[Factorized technique]
Let us consider the space $\mathcal{X} = \mathbb{S}^{d\times d}$, where each $x\in\mathcal{X}$ is a $d\times d$ symmetric matrix. When $x$ is positive semidefinite, we write $x\succeq 0$. The inner product in $\mathbb{S}^{d\times d}$ is defined by $\scal{x}{y} = \trace(x^\top y)$. Let $C'$ be a non-empty, closed, convex subset of $\mathbb{S}^{d\times d}$.
Let $h_0\colon\mathcal{X}\to \RR$ be a differentiable convex function with an $\mathbb{L}_{0}$-Lipschitz continuous gradient, let $M\colon\mathbb{R}^{d\times d}\to\RR^m$ be a non-zero linear mapping, and let $b\in\RR^m$. The problem is to solve
\begin{equation}
\label{e:fac}
\underset{x\in C',\, x\succeq 0,\, Mx=b} {\text{min}} \;h_0(x).
\end{equation}
The problem \eqref{e:fac} covers a wide variety of problems in semidefinite programming \cite{Burer2005,Burer2003,tu2014practical}. It is a convex optimization problem. However, convex optimization methods are not a good choice when $d$ is very large. To overcome this issue, the factorization technique \cite{Burer2005,Burer2003}, which writes $x = uu^\top$ and hence replaces the high-dimensional variable $x$ by a low-dimensional variable $u$, is useful. Moreover, the constraint $x\succeq 0$ disappears in the variable $u$. Suppose that one can find $C \subset \RR^{d\times r}$ such that $uu^\top\in C'$ whenever $u\in C$. By defining $L\colon u\mapsto Muu^\top$ and $h\colon u\mapsto h_0(uu^\top)$, we can write \eqref{e:fac} as a non-convex optimization problem:
\begin{equation}
\label{e:fac1}
\underset{u\in C,\, Lu=b} {\text{min}} \; h(u).
\end{equation}
It is not hard to check that \eqref{e:fac1} fits into our framework.
\end{example}
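To make the factorization concrete, the following small numerical sketch (Python with NumPy; the objective $h_0(x)=\scal{Q}{x}$, the single trace constraint, and the dimensions are illustrative assumptions, not part of the example) forms the factorized functions $h(u)=h_0(uu^\top)$ and $L(u)=M(uu^\top)$ from the lifted data and checks that they agree on $x=uu^\top$.
\begin{verbatim}
import numpy as np

# Illustrative data for the lifted problem (assumed, not from the example):
# h_0(x) = <Q, x>,  M(x) = [tr(x)],  b = [1.0],  x is d-by-d, u is d-by-r.
d, r = 20, 3
rng = np.random.default_rng(0)
Q = rng.standard_normal((d, d)); Q = (Q + Q.T) / 2   # symmetric cost matrix

def h0(x):                       # convex objective on the matrix space
    return np.trace(Q @ x)

def M(x):                        # linear map into R^m (here m = 1)
    return np.array([np.trace(x)])

b = np.array([1.0])

# Factorized (non-convex) functions h(u) = h0(u u^T) and L(u) = M(u u^T).
def h(u):
    return h0(u @ u.T)

def L(u):
    return M(u @ u.T)

u = rng.standard_normal((d, r))
x = u @ u.T                      # positive semidefinite by construction
assert np.isclose(h(u), h0(x)) and np.allclose(L(u), M(x))
print("feasibility gap ||L(u) - b||:", np.linalg.norm(L(u) - b))
\end{verbatim}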
Various standard numerical methods have been proposed in the literature for the problem \eqref{prob:01}, such as penalty methods, the augmented Lagrangian method, barrier methods, Newton methods, and modified Newton methods. Let us briefly recall the basic idea of the augmented Lagrangian method \cite{luenberger1984linear}. For simplicity, take $C=\RR^d$ and define
\begin{equation}
\mathcal{L}_{\beta}(u,\dot{y}) = h(u) + \scal{Lu-b}{\dot{y}} +\frac{1}{2\beta}\|Lu-b\|^2.
\end{equation}
Given $\dot{y}_k$, find
\begin{equation}\label{e:exac}
u_{k+1} \in \argmin_{u} \mathcal{L}_{\beta}(u,\dot{y}_k),
\end{equation}
and update
\begin{equation}
\dot{y}_{k+1} = \dot{y}_k+\frac{1}{\beta}(Lu_{k+1} -b).
\end{equation}
{\bf Challenges:} It is known that, if the penalty parameter $\beta$ is sufficiently small, the augmented Lagrangian has a local minimum point near the true optimal point. However, we do not know in advance how small $\beta$ must be. Hence, the choice of $\beta$ plays a central role in practice. Moreover, since $\mathcal{L}_{\beta}(u,\dot{y})$ is non-convex in $u$, finding $u_{k+1}$ is itself problematic. In this paper we overcome these challenges. We propose an algorithmic framework in which, instead of solving \eqref{e:exac} exactly, we perform only one projected gradient step and then update $\beta$ adaptively.
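For concreteness, the classical iteration recalled above can be sketched as follows (Python with NumPy). The toy data for $h$ and $L$, and the use of a few plain gradient steps as a stand-in for the exact minimization in \eqref{e:exac}, are illustrative assumptions; the inner loop is precisely what our method replaces by a single projected gradient step.
\begin{verbatim}
import numpy as np

# Toy instance (assumed): h(u) = 0.5 ||u||^2,  L(u) = [u_0^2 + u_1 - 1],  b = [0].
h_grad = lambda u: u
L      = lambda u: np.array([u[0] ** 2 + u[1] - 1.0])
JL     = lambda u: np.array([[2.0 * u[0], 1.0]])      # Jacobian of L at u
b      = np.array([0.0])

beta, step = 0.5, 0.02
u, y = np.array([1.0, 1.0]), np.zeros(1)

for k in range(50):
    # Inexact stand-in for (e:exac):
    # u_{k+1} ~ argmin_u h(u) + <L(u)-b, y> + (1/(2 beta)) ||L(u)-b||^2.
    # The method proposed in this paper replaces this inner loop by ONE
    # projected gradient step and adapts beta.
    for _ in range(100):
        grad = h_grad(u) + JL(u).T @ (y + (L(u) - b) / beta)
        u = u - step * grad
    # Multiplier update of the classical method.
    y = y + (L(u) - b) / beta

print("u =", u, " ||L(u) - b|| =", np.linalg.norm(L(u) - b))
\end{verbatim}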
%%%%%%%%%%%%%%%%%%%%
\section{Preliminaries}
\subsection{Notations}
\label{s:nota}
We use the notations $\scal{\cdot}{\cdot}$ and $\|\cdot\|$ for the scalar product and the associated norm on $\RR^d$. The adjoint of a linear operator $A$ is denoted by $A^*$. Let $C$ be a non-empty closed subset of $\RR^d$. The indicator function of $C$ is denoted by $\iota_{C}$. The distance function is $d_{C}\colon u\mapsto \inf_{a\in C}\|u-a\|$. The projection of $x$ onto $C$ is denoted by $P_Cx$. Let $\overline{u}\in C$; the tangent cone to $C$ at $\overline{u}$ is
\begin{equation}
T_{C}(\overline{u}) = \menge{u\in \RR^d}{\exists t_k\downarrow 0, u_k \to u \;\text{with}\; (\forall k\in\NN)\; \overline{u} + t_ku_k\in C}.
\end{equation}
The regular normal cone $\hat{N}_C(\overline{u})$ is defined as the dual to the tangent cone, $\hat{N}_C(\overline{u}) = T_{C}(\overline{u})^*$.
The (Mordukhovich) limiting normal cone to $C$ at $\overline{u}$ is defined by
\begin{equation}
N_C(\overline{u}) = \menge{v\in \RR^d}{\exists u_k\to \overline{u}, v_k\to v \;\text{with}\; (\forall k\in\NN)\; v_k \in \hat{N}_{C}(u_k)}.
\end{equation}
Let $f\colon \RR^d\to \left]-\infty, +\infty\right]$ be a proper, lower semicontinuous, convex function. The subdifferential of $f$ at $p$ is
\begin{equation}
\partial f(p)=\menge{u\in\RR^d}{(\forall y \in\RR^d)\; f(y) -f(p) \geq \scal{u}{y-p} }.
\end{equation}
If $f$ is differentiable at $p$, then $\partial f(p)$ is a singleton, whose unique element is denoted by $\nabla f(p)$.
\subsection{First order optimality conditions}
First order optimality conditions for the problem \eqref{prob:01} are well studied in the literature \cite[Corollary 6.15]{rockafellar2009variational}. Let $\overline{u}$ be a local minimizer of \eqref{prob:01} and suppose that there is no vector $y\not=0$ such that
\begin{equation}
-\nabla L(\overline{u})^* y \in N_C(\overline{u}).
\end{equation}
Then, the first order optimality condition for $\overline{u}$ is
\begin{equation}\label{e:inclu1}
(\exists \dot{y} \in\RR^m)\; - \nabla L(\overline{u})^*\dot{y} - \nabla h(\overline{u}) \in N_{C}(\overline{u}).
\end{equation}
Since $\partial \iota_{\{0\}}(0) = \RR^m$, the condition \eqref{e:inclu1} is equivalent to
\begin{equation}
0 \in \partial \iota_C(\overline{u}) +\nabla h(\overline{u}) + \nabla L(\overline{u})^* \partial \iota_{\{0\}}(L\overline{u}-b).
\end{equation}
Observe that the condition \eqref{e:inclu1} is also equivalent to the following condition:
\begin{equation}\label{e:inclu2}
(\exists \dot{y} \in\RR^m)\; 0 \in \partial_u \mathcal{L}(\overline{u},\dot{y}),
\end{equation}
where $\partial_u$ denotes the subdifferential with respect to $u$ and $\mathcal{L}(u,\dot{y})$ is the Lagrangian function associated with the non-linear constraint $Lu=b$,
\begin{equation}
\mathcal{L}(u,\dot{y}) = (h +\iota_C)(u) + \scal{Lu-b}{\dot{y}}.
\end{equation}
The corresponding augmented Lagrangian function associated with the non-linear constraint $Lu=b$ is defined by
\begin{equation}
(\forall \beta \in \left]0,+\infty\right[)\quad \mathcal{L}_{\beta}(u,\dot{y}) = (h+\iota_C)(u) + \scal{Lu-b}{\dot{y}} +\frac{1}{2\beta}\|Lu-b\|^2.
\end{equation}
For convenience, we define
\begin{equation}
(\forall \beta \in \left]0,+\infty\right[)\quad g_{\beta}(u,\dot{y}) = \scal{Lu-b}{\dot{y}} +\frac{1}{2\beta}\|Lu-b\|^2
\end{equation}
and
\begin{equation}
(\forall \beta \in \left]0,+\infty\right[)\quad F_{\beta}(u,\dot{y}) = h(u)+ g_{\beta}(u,\dot{y}).
\end{equation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Gradient mapping}
In non-convex optimization, the relation between the gradient mapping and stationary points is well understood \cite{Lan16,Hare2009,bolte2014proximal}. We recall the relation between the gradient mapping and the function value $F_{\beta}$.
\begin{definition} Given $u$ and $\gamma >0$, define the gradient mapping $G_{\beta,\gamma}(\cdot; \dot{y})\colon u\mapsto \gamma^{-1}(u-u^+)$, where $u^+=P_{C}(u-\gamma \nabla F_{\beta}(u;\dot{y}))$.
\end{definition}
%\begin{definition} Given $u$ and $\gamma >0$, let $r_{\beta}(\xi,\dot{y})$ be a stochastic estimate of $\nabla F_{\beta}(u,\dot{y})$, define
%stochastic gradient mapping $SG_{\beta,\gamma}(\cdot; \dot{y})\colon u\mapsto \gamma^{-1}(u-u_+)$, where $u_+=\prox_{\gamma f}(u-\gamma r_{\beta}(\xi;\dot{y}))$.
%\end{definition}
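The gradient mapping is straightforward to compute numerically. A small sketch follows (Python with NumPy; the ball constraint $C=\menge{u}{\|u\|\leq 1}$, the quadratic $h$, and the bilinear $L$ are illustrative assumptions).
\begin{verbatim}
import numpy as np

# Illustrative data (assumed): C = unit ball, h(u) = 0.5||u||^2,
# L(u) = [u_0 u_1], b = [0.5].
h_grad = lambda u: u
L      = lambda u: np.array([u[0] * u[1]])
JL     = lambda u: np.array([[u[1], u[0]]])
b      = np.array([0.5])

def P_C(u):                                  # projection onto the unit ball
    n = np.linalg.norm(u)
    return u if n <= 1.0 else u / n

def grad_F(u, y, beta):                      # gradient of F_beta(., y) = h + g_beta(., y)
    return h_grad(u) + JL(u).T @ (y + (L(u) - b) / beta)

def gradient_mapping(u, y, beta, gamma):     # G_{beta,gamma}(u; y) = (u - u^+) / gamma
    u_plus = P_C(u - gamma * grad_F(u, y, beta))
    return (u - u_plus) / gamma, u_plus

u, y = np.array([0.8, 0.2]), np.zeros(1)
G, u_plus = gradient_mapping(u, y, beta=1.0, gamma=0.1)
print("||G_{beta,gamma}(u; y)|| =", np.linalg.norm(G))
\end{verbatim}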
The following lemma is an application of \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}.
\begin{lemma}\label{lem:11} Suppose that $\nabla g_{\beta}(\cdot, \dot{y})$ is $\mathbb{L}_{\beta}$-Lipschitz continuous.
Let $u\in C$. Then, for any $\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$,
\begin{equation}
\| G_{\beta,\gamma}(u;\dot{y})\|^{2}\leq \frac{2}{\gamma} (F_{\beta}(u;\dot{y}) - F_{\beta}(u^+;\dot{y})),
\end{equation}
where $u^+ = u^{+}(\gamma,u) = P_{C}(u-\gamma \nabla F_{\beta}(u;\dot{y}))$ and, more generally, $u^{+}(\mu,u) = P_{C}(u-\mu \nabla F_{\beta}(u;\dot{y}))$ for $\mu>0$.
%\end{enumerate}
\end{lemma}
\begin{proof}
For a fixed $\dot{y}$, the function $u\mapsto F_{\beta}(u;\dot{y}) = h(u)+g_{\beta}(u,\dot{y})$ has an $(\mathbb{L}_h+ \mathbb{L}_{\beta})$-Lipschitz continuous gradient. Hence, the result follows from \cite[Lemma 3.2, Remark 3.2(i)]{bolte2014proximal}.
\end{proof}

In applications, the Lipschitz constant $\mathbb{L}_{\beta}$ is sometimes hard to evaluate exactly. In this case we can exploit the classic backtracking line search technique.
Let $\delta$ and $\theta$ be in $\left]0,1\right[$ and let $\overline{\gamma} > 0$. Define
\begin{alignat}{2}
\gamma = \max\big\{\mu > 0 \;\big|\;& (\exists i \in\NN)\;(\mu= \overline{\gamma}\theta^i),\notag\\
&F_{\beta}(u^+(\mu,u),\dot{y}) \leq F_{\beta}(u,\dot{y}) + \scal{u^+(\mu,u)-u}{\nabla F_{\beta}(u,\dot{y})} +\frac{\delta}{\mu}\|u-u^+(\mu,u)\|^2\big\}.
\end{alignat}
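A minimal implementation sketch of this backtracking rule is given below (Python; the arguments \texttt{F}, \texttt{grad\_F}, and \texttt{P\_C} are placeholders for the quantities defined above, and the cap on the number of trials is an implementation assumption).
\begin{verbatim}
import numpy as np

def backtracking_gamma(u, y, F, grad_F, P_C, gamma_bar=1.0, theta=0.5,
                       delta=0.9, max_trials=50):
    """Return the largest mu = gamma_bar * theta**i passing the
    sufficient-decrease test above, with the corresponding u^+(mu, u)."""
    g = grad_F(u, y)
    mu = gamma_bar
    for _ in range(max_trials):
        u_plus = P_C(u - mu * g)
        ok = (F(u_plus, y) <= F(u, y) + g @ (u_plus - u)
              + (delta / mu) * np.linalg.norm(u - u_plus) ** 2)
        if ok:
            return mu, u_plus
        mu *= theta                      # shrink the trial step size
    return mu, P_C(u - mu * g)           # fall back to the smallest trial
\end{verbatim}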
\begin{lemma}
\label{l:solv} Suppose that $\nabla g_{\beta}(\cdot, \dot{y})$ is $\mathbb{L}_{\beta}$-Lipschitz continuous. Let $u\in C$ and $\gamma \in ]0, 1/(\mathbb{L}_h+ \mathbb{L}_{\beta})[$. Suppose that $Lu^+=b$ and $\| G_{\beta,\gamma}(u;\dot{y})\|^{2} =0$. Then $u^+$ is a stationary point, i.e., the first order optimality condition \eqref{e:inclu1}, for $\overline{u} =u^+$, is satisfied.
\end{lemma}
\begin{proof}
Since $u^+ = P_{C}(u-\gamma \nabla F_{\beta}(u;\dot{y}))$, it follows that
\begin{equation}
\label{e:ver1}
G_{\beta,\gamma}(u;\dot{y}) - \nabla F_{\beta}(u;\dot{y}) \in N_C(u^+).
\end{equation}
Adding $\nabla F_{\beta}(u^+;\dot{y})$ to both sides of \eqref{e:ver1}, we obtain
\begin{equation}
\label{e:ver2}
G_{\beta,\gamma}(u;\dot{y}) - \nabla F_{\beta}(u;\dot{y}) +\nabla F_{\beta}(u^+;\dot{y}) \in N_C(u^+)+ \nabla F_{\beta}(u^+;\dot{y}).
\end{equation}
Using the Lipschitz continuity of $\nabla F_{\beta}(\cdot; \dot{y})$, the condition $\gamma (\mathbb{L}_\beta +\mathbb{L}_{h}) \leq 1$, and the identity $\|u-u^+\| = \gamma\|G_{\beta,\gamma}(u;\dot{y})\| = 0$, we see that
\begin{alignat}{2}
&\|G_{\beta,\gamma}(u;\dot{y}) - \nabla F_{\beta}(u;\dot{y}) + \nabla F_{\beta}(u^+;\dot{y}) \| \notag\\
&\quad\leq \|G_{\beta,\gamma}(u;\dot{y})\|+\|\nabla F_{\beta}(u;\dot{y}) - \nabla F_{\beta}(u^+;\dot{y}) \|\notag\\
&\quad\leq \|G_{\beta,\gamma}(u;\dot{y})\|+( \mathbb{L}_\beta +\mathbb{L}_{h})\|u-u^+\|\notag\\
&\quad =0,
\end{alignat}
which means that $G_{\beta,\gamma}(u;\dot{y}) - \nabla F_{\beta}(u;\dot{y}) + \nabla F_{\beta}(u^+;\dot{y})=0$. Hence, we derive from \eqref{e:ver2} that
\begin{equation}
\label{conkhi}
0 \in N_C(u^+)+ \nabla F_{\beta}(u^+;\dot{y}).
\end{equation}
By the definition of $F_{\beta}(u^+;\dot{y})$ and $Lu^+=b$, we have
\begin{alignat}{2}
\nabla F_{\beta}(u^+;\dot{y}) &= \nabla h(u^+) + \nabla L(u^+)^*\dot{y} + \frac{1}{\beta} \nabla L(u^+)^*(Lu^+-b) \notag\\
&= \nabla h(u^+) + \nabla L(u^+)^*\dot{y},
\end{alignat}
which, together with \eqref{conkhi}, shows that \eqref{e:inclu1} is satisfied.
\end{proof}
\subsection{Sufficient conditions}
Sufficient conditions for a local minimum are well studied in the literature \cite{luenberger1984linear,rockafellar2009variational,mordukhovich2006variational,gfrerer2015complete}.
For simplicity, let us assume that $C = \menge{u}{g(u)\leq c}$ for some $\mathcal{C}^2$-function $g\colon\RR^d\to\RR$. In this section we also assume that $h$ and $L$ are $\mathcal{C}^2$-functions.
Let $\overline{u}$ be a point such that $g(\overline{u})=c$ and $\nabla g(\overline{u})\not=0$. Then, \cite[Proposition 10.3]{rockafellar2009variational} implies that
\begin{equation}
N_{C}(\overline{u}) =\menge{\mu \nabla g(\overline{u})}{\mu \geq 0}.
\end{equation}
Now suppose that the first order optimality condition \eqref{e:inclu1} is satisfied for $\overline{u}$. Then there exist $\mu \geq 0$ and $\dot{y} \in\RR^m$ such that
\begin{equation}\label{e:kkt}
\nabla L(\overline{u})^*\dot{y} + \nabla h(\overline{u}) +\mu \nabla g(\overline{u})=0.
\end{equation}
Therefore, in view of \cite[Chapter 11]{luenberger1984linear}, $\overline{u}$ is a local minimum provided that the Hessian matrix
\begin{equation}
H(\overline{u}) = \nabla^2h(\overline{u}) + \dot{y}^T\nabla^2 L(\overline{u}) + \mu \nabla^2 g(\overline{u})
\end{equation}
is positive definite on the subspace
\begin{equation}
E= \menge{y}{\nabla L(\overline{u})y =0, \nabla g(\overline{u})y=0}.
\end{equation}
In our examples, this condition is checkable when $g(u) = \frac{1}{2}\|u\|^2$, $h$ is quadratic, and $L(u) = Muu^\top$.
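This second-order check is easy to carry out numerically. The sketch below (Python with NumPy/SciPy) tests positive definiteness of $H(\overline{u})$ on $E$ by restricting it to an orthonormal basis of that subspace; the candidate point, the multipliers, and the stand-in derivatives of $L$ are illustrative assumptions.
\begin{verbatim}
import numpy as np
from scipy.linalg import null_space

# Illustrative candidate point, multipliers, and derivatives (assumed).
d = 4
rng = np.random.default_rng(1)
A = rng.standard_normal((d, d)); A = A @ A.T + np.eye(d)   # h(u) = 0.5 u^T A u
u_bar = rng.standard_normal(d); u_bar /= np.linalg.norm(u_bar)
y_dot, mu = np.array([0.3]), 0.7

grad_g, hess_g = u_bar, np.eye(d)          # g(u) = 0.5 ||u||^2
JL = rng.standard_normal((1, d))           # stand-in Jacobian of L at u_bar (m = 1)
hess_L = rng.standard_normal((d, d)); hess_L = (hess_L + hess_L.T) / 2

# Hessian of the Lagrangian and the subspace E = null(JL) intersect null(grad_g^T).
H = A + y_dot[0] * hess_L + mu * hess_g
Z = null_space(np.vstack([JL, grad_g[None, :]]))   # orthonormal basis of E

eigvals = np.linalg.eigvalsh(Z.T @ H @ Z)
print("H positive definite on E:", bool(np.all(eigvals > 1e-10)))
\end{verbatim}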
\section{Algorithm \& Convergence}
\subsection{Algorithm}
We propose the following method for solving the problem \eqref{prob:01}. The main idea is to take one projected gradient descent step on $u$ to obtain $u^+$, and to update the penalty parameter $\beta^+$ in such a way that the feasibility measure $\frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2$ decreases faster than the gradient mapping, up to some noise level $\omega$:
\begin{equation}
\frac{1}{2\beta^+\gamma}\|Lu^+-b\|^2\leq \frac{1}{8} \|G_{\beta,\gamma}(u,\dot{y}) \|^2 + \frac{\omega}{\gamma}.
\end{equation}
We then update the corresponding multiplier $\dot{y}$ as in the classical ADMM:
\begin{equation}
\dot{y}^+ = \dot{y}+\frac{1}{\sigma}(Lu^+-b).
\end{equation}
The formal algorithm is presented as follows.
\begin{algorithm}
\label{Algo:2}
Input: $\beta_0> 0$, $c > 0$, $\alpha \in \left]0,1\right[$, $u_{0}\in C$, $\dot{y}_0 =0$, $\epsilon_1\in \left]0,1\right[$. Given $\beta_k$, choose $\gamma_{k} \leq \frac{1-\epsilon_1}{\mathbb{L}_{h}+\mathbb{L}_{\beta_{k}}}.$ Iterate\\
For $k=0,1,\ldots$\\
1. Projected gradient step: $ u_{k+1} = P_{C}(u_{k} - \gamma_{k} \nabla F_{\beta_{k}}(u_{k},\dot{y}_{k})).$\\
2. Line search step:\\
 \quad $s=0$, $d_{k,0}=2$, $\beta_{k+1,0}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k}) \|^2 + \frac{d_{k,0}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1}.$\\
 While $\beta_{k+1,s} \geq c/(k+1)^{\alpha}$ do
\begin{alignat}{2}\label{e:mc}
% \beta_{k+1}\geq \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}.
d_{k,s+1} &= 2\, d_{k,s}, \\
\beta_{k+1,s+1}&= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k}) \|^2 + \frac{d_{k,s+1}}{{(1+k)}^{1+\epsilon_1}}\bigg)^{-1},\\
s&\leftarrow s+1.
\end{alignat}
 Endwhile\\
 3. Update $\beta_{k+1} = \beta_{k+1,s}$.\\
 4. Choose $\sigma_{k+1} \geq 2\beta_{k}$ and update $\dot{y}_{k+1} = \dot{y}_{k} + \frac{1}{\sigma_{k+1}} (Lu_{k+1}-b)$.\\
\end{algorithm}
\begin{remark}
The updating rule of $(\beta_k)_{k\in\NN}$ in \eqref{e:mc} plays a key role in our analysis. Intuitively, if $u_{k+1}$ is a solution, then $Lu_{k+1}=b$ and \eqref{e:mc} is trivially satisfied for any $\beta_{k+1}\geq 0$. Hence $\beta_{k+1}$ enforces $u_{k+1}$ to be close to the set $\menge{u}{Lu=b}$.
\end{remark}
\begin{remark}
When $\sigma_k\equiv \infty$, we get $\dot{y}_k\equiv 0$ and hence step 4 becomes trivial. If we choose $\sigma_k = c(k+1)^{\alpha_1}\|Lu_k-b\|$, where $c>0$ and $\alpha_1>1$ are chosen such that $\sigma_{k} > 2\beta_{k-1}$, then
\begin{equation}
\|\dot{y}_{k+1}\| \leq \|\dot{y}_k\| + \frac{\|Lu_{k+1}-b\|}{\sigma_{k+1}} = \|\dot{y}_k\| + \frac{1}{c(k+2)^{\alpha_1}}.
\end{equation}
Since $\sum_{k\in\NN} \frac{1}{c(k+2)^{\alpha_1}} <+\infty$, $(\|\dot{y}_k\|)_{k\in\NN}$ converges and is hence bounded. Therefore,
\begin{equation}
b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},\dot{y}_{k}) \geq \inf_{k} h(u_{k}),
\end{equation}
which implies that $b_0>-\infty$ whenever $(u_k)_{k\in\NN}$ is bounded (in particular, when $C$ is bounded).
\end{remark}
\subsection{Convergence}
In view of Lemma \ref{l:solv}, we need to estimate the gradient mapping $\|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k})\|$ as well as the feasibility gap $\|Lu_{k+1}-b\|^2$.
\begin{theorem}
\label{t:1}
Suppose that $b_0 = \inf_{k\in\NN} \mathcal{L}_{\beta_{k}}(u_{k+1},\dot{y}_{k}) > -\infty$ and that $z_0=\sum_{k=1}^\infty \frac{d_{k,s_k}}{(1+k)^\alpha} <+\infty$, where $s_k$ is the smallest index such that $\beta_{k+1,s_k} < c/(k+1)^{\alpha}$. Then the following hold:
\begin{equation}\label{e:mapp1}
\sum_{k=1}^\infty \gamma_{k} \|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k}) \|^2 \leq 4\Big(\mathcal{L}_{\beta_0}(u_{1},\dot{y}_0) + z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0,\dot{y}_0)\|^2\Big),
\end{equation}
and
\begin{equation}\label{e:feas1}
\sum_{k=1}^\infty \frac{1}{\beta_{k+1}} \|Lu_{k+1}-b \|^2 \leq \mathcal{L}_{\beta_0}(u_{1},\dot{y}_0) + 3z_0-b_0+\frac{\gamma_0}{8}\|G_{\beta_0,\gamma_0}(u_0,\dot{y}_0)\|^2.
\end{equation}
\end{theorem}
\begin{proof}
Set $e_{k+1}= \frac{d_{k,s_k}}{(k+1)^\alpha}$. Then $z_0= \sum_{k\in\NN}e_k <+\infty$. It follows from Lemma \ref{lem:11} that
\begin{alignat}{2}
G_k= \frac{\gamma_{k}}{2}\|G_{\beta_k,\gamma_k}(u_k,\dot{y}_k) \|^2 &\leq \mathcal{L}_{\beta_k}(u_k,\dot{y}_k) - \mathcal{L}_{\beta_{k}}(u_{k+1},\dot{y}_k)\notag\\
&= h(u_k) - h(u_{k+1}) + g_{\beta_k}(u_k,\dot{y}_k) -g_{\beta_{k}}(u_{k+1},\dot{y}_k)\notag\\
&=h(u_k) - h(u_{k+1}) + g_{\beta_{k-1}}(u_k,\dot{y}_{k-1}) -g_{\beta_{k}}(u_{k+1},\dot{y}_k)\notag\\
&\quad + g_{\beta_k}(u_k,\dot{y}_k)- g_{\beta_{k-1}}(u_k,\dot{y}_{k-1}),
\label{e:sa1}
\end{alignat}
where $g_{\beta}(u,\dot{y}) = \scal{Lu-b}{\dot{y}} +\frac{1}{2\beta}\|Lu-b\|^2$ as above. Let us estimate the last term in \eqref{e:sa1}.
We have
\begin{alignat}{2}
\omega_{1,k}= g_{\beta_k}(u_k,\dot{y}_k)- g_{\beta_{k-1}}(u_k,\dot{y}_{k-1}) = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\scal{Lu_k-b}{\dot{y}_k-\dot{y}_{k-1}}.
\end{alignat}
Using $\dot{y}_{k} = \dot{y}_{k-1} + \frac{1}{\sigma_{k}} (Lu_{k}-b)$, we get
\begin{alignat}{2}
&\quad \omega_{1,k} = \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{1}{\sigma_k} \|Lu_k-b\|^2.
%&\leq \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+ \frac{\gamma_{k-1}}{8} \|G_{\beta_{k-1},\gamma_{k-1}}(u_{k-1},\dot{y}_{k-1}) \|^2 + \frac{d_{k-1}}{k^\alpha}\notag\\
%&\leq \big(\frac{1}{2\beta_k} -\frac{1}{2\beta_{k-1}}\big)\|Lu_k-b\|^2+\frac{1}{4} G_k+ \frac{1}{4} G_{k-1} + \frac{d_{k-1}}{k^\alpha}.
\end{alignat}
Let us now estimate the first term in \eqref{e:sa1}. Set $T_k= h(u_k) +\scal{Lu_k-b}{\dot{y}_{k-1}}$. Then
\begin{alignat}{2}
\omega_{2,k}&= h(u_k) - h(u_{k+1}) + g_{\beta_{k-1}}(u_k,\dot{y}_{k-1}) -g_{\beta_{k}}(u_{k+1},\dot{y}_k)\notag\\
& = T_k -T_{k+1} + \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2.
\end{alignat}
Therefore, we derive from \eqref{e:sa1} that
\begin{alignat}{2}
G_k &\leq \omega_{1,k} +\omega_{2,k}\notag\\
&= T_k -T_{k+1} + \frac{1}{2\beta_k} \big(\|Lu_k-b\|^2 -\|Lu_{k+1}-b\|^2\big)+\frac{1}{\sigma_k} \|Lu_k-b\|^2.\label{e:sa2}
\end{alignat}
Now, using the condition \eqref{e:mc}, we obtain
\begin{equation}
\frac{1}{2\beta_k} \|Lu_k-b\|^2 \leq \frac{1}{4} G_{k-1} + e_k\leq \frac{1}{4}G_k+ \frac{1}{4} G_{k-1} + e_k.
\end{equation}
Therefore, it follows from \eqref{e:sa2} that
\begin{alignat}{2}
\frac12 G_k &\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{\sigma_k} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k\notag\\
&\leq (T_k+\frac{1}{4} G_{k-1}) -(T_{k+1} +\frac{1}{4} G_k )+ \frac{1}{2\beta_{k-1}} \|Lu_k-b\|^2 - \frac{1}{2\beta_k}\|Lu_{k+1}-b\|^2 + e_k.
\label{e:sa3}
\end{alignat}
For every $N\in\NN$, $N\geq 1$, summing \eqref{e:sa3} from $k=1$ to $k=N$, we obtain
\begin{alignat}{2}
\sum_{k=1}^{N}\frac12 G_k \leq T_1 +\frac{1}{4} G_0 + \frac{1}{2\beta_{0}} \|Lu_1-b\|^2 - T_{N+1} - \frac{1}{4}G_{N} - \frac{1}{2\beta_N} \|Lu_{N+1}-b\|^2 + z_0.
\end{alignat}
Note that, by the definition of $T_{N+1}$, we have
\begin{alignat}{2}
-T_{N+1} - \frac{1}{2\beta_N} \|Lu_{N+1}-b\|^2& = - \mathcal{L}_{\beta_N}(u_{N+1},\dot{y}_N) \leq -b_0.
\end{alignat}
Hence,
\begin{equation}
\sum_{k=1}^{N}\frac12 G_k \leq \mathcal{L}_{\beta_0}(u_{1},\dot{y}_0) + z_0-b_0 +\frac{1}{4} G_0,
\end{equation}
which proves \eqref{e:mapp1}. Moreover, \eqref{e:feas1} follows directly from \eqref{e:mc}.
\end{proof}
\begin{corollary} Assume the same conditions as in Theorem \ref{t:1} and suppose, in addition, that $\gamma_k = \mathcal{O}(\beta_k)$. Then
\begin{equation}\label{e:mapp2}
\min_{1\leq k\leq N} \|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k}) \|^2 = \mathcal{O}(1/N^{1-\alpha}) \to 0,
\end{equation}
and
\begin{equation}\label{e:feas2}
\min_{1\leq k\leq N} \|Lu_{k+1}-b \|^2 = \mathcal{O}(1/N) \to 0.
\end{equation}
\end{corollary}
\begin{proof}
The estimates \eqref{e:mapp2} and \eqref{e:feas2} follow directly from \eqref{e:mapp1} and \eqref{e:feas1}, respectively.
\end{proof}
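To illustrate Algorithm \ref{Algo:2} and the quantities controlled by Theorem \ref{t:1}, a toy implementation sketch follows (Python with NumPy). The instance ($h$, $L$, $C$), the constants, and the crude estimate of $\mathbb{L}_{\beta_k}$ are assumptions made only for this sketch.
\begin{verbatim}
import numpy as np

# Toy instance (assumed): h(u) = 0.5||u||^2, L(u) = [u_0 u_1], b = [0.5], C = unit ball.
h_grad = lambda u: u
L      = lambda u: np.array([u[0] * u[1]])
JL     = lambda u: np.array([[u[1], u[0]]])
b      = np.array([0.5])
P_C    = lambda u: u if np.linalg.norm(u) <= 1 else u / np.linalg.norm(u)

def grad_F(u, y, beta):
    return h_grad(u) + JL(u).T @ (y + (L(u) - b) / beta)

u, y, beta = np.array([0.9, 0.1]), np.zeros(1), 1.0
c, alpha, eps1, Lh = 10.0, 0.5, 0.1, 1.0

for k in range(200):
    L_beta = 4.0 / beta          # crude estimate of the Lipschitz constant of grad g_beta
    gamma  = (1 - eps1) / (Lh + L_beta)
    # Step 1: projected gradient.
    u_new = P_C(u - gamma * grad_F(u, y, beta))
    G_sq  = np.linalg.norm((u - u_new) / gamma) ** 2
    # Steps 2-3: search for beta_{k+1} satisfying the updating rule (e:mc).
    d, feas = 2.0, 0.5 * np.linalg.norm(L(u_new) - b) ** 2
    beta_new = feas / (gamma / 8 * G_sq + d / (1 + k) ** (1 + eps1))
    while beta_new >= c / (k + 1) ** alpha:
        d *= 2
        beta_new = feas / (gamma / 8 * G_sq + d / (1 + k) ** (1 + eps1))
    # Step 4: dual update with sigma_{k+1} = 2 * beta_k.
    y = y + (L(u_new) - b) / (2 * beta)
    u, beta = u_new, max(beta_new, 1e-12)   # small floor keeps the sketch well defined

print("||G|| ~", np.sqrt(G_sq), "  ||L(u) - b|| =", np.linalg.norm(L(u) - b))
\end{verbatim}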
\subsection{Related works}
To the best of our knowledge, the proposed method is new and different from existing methods in the literature. The connection to the augmented Lagrangian method was already discussed in the Introduction.
Our method differs significantly from the augmented Lagrangian method: we perform only one projected gradient step on the primal variable $u$ instead of minimizing the augmented Lagrangian function. Furthermore, we update the penalty parameter $\beta$ adaptively to make sure that the feasibility reduces faster than the gradient mapping. In the case where $h=0$, a modification of the Chambolle--Pock method is investigated in \cite{Valkonen14}, and a preconditioned ADMM in \cite{Matin17}, where the convergence of the iterates is proved under strong assumptions that are not fulfilled in our setting.
%\noindent{\bf Connection to Linearized Alternating Direction Method}.\\
ADMM is the classic method proposed for solving the problem \eqref{prob:01} in the case where $L$ is a linear operator and $h$ is zero \cite{gabay1976dual}. This method is an application of the Douglas--Rachford method to the dual problem \cite{Gabay83}. One of the main drawbacks of ADMM is the appearance of the term $Lu$ in the update rule of $u_{k+1}$. To overcome this issue, several strategies have been suggested. The first strategy, proposed in \cite{shefi2014rate} and refined in \cite{banert2016fixing}, is known as the alternating direction proximal method of multipliers. The second strategy is to use a linearization technique \cite{lin2011linearized}. We show here that our proposed method uses an updating rule closely related to that of the linearized alternating direction method \cite{lin2011linearized}. Assume that $h\equiv 0$ and that $L$ is a linear operator. Then the proposed method can be rewritten as
\begin{equation}
\begin{cases}
u_{k+1}= \arg\min_{u\in C} \frac{1}{2\gamma_k} \| u-u_k + \gamma_kL^*\bigg( \lambda_k + \frac{1}{\beta_k}\big(Lu_k-b\big)\bigg)\|^2\\
\beta_{k+1}= \frac{1}{2} \|Lu_{k+1}-b \|^2 \bigg( \frac{\gamma_{k}}{8} \|G_{\beta_{k},\gamma_{k}}(u_k,\dot{y}_{k}) \|^2 + \frac{d_k}{(k+1)^\alpha}\bigg)^{-1}\\
\text{Choose $\sigma_{k+1}\geq 2\beta_k$ and}\; \lambda_{k+1} = \lambda_k +\frac{1}{\sigma_{k+1}}(Lu_{k+1}-b),
\end{cases}
\end{equation}
which is a variant of the linearized ADMM \cite{lin2011linearized}.
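In this special case the $u$-update above is simply a projection of a gradient-type step. A minimal sketch follows (Python with NumPy; the random linear operator, the constant choice of $\beta_k$ and $\gamma_k$, and the omission of the adaptive $\beta$ update are simplifying assumptions).
\begin{verbatim}
import numpy as np

# Illustrative linear instance (assumed): L(u) = A u, C = unit ball, h = 0.
rng = np.random.default_rng(2)
A = rng.standard_normal((3, 5))
u_star = rng.standard_normal(5); u_star *= 0.5 / np.linalg.norm(u_star)
b = A @ u_star                                  # constraint is feasible inside C
P_C = lambda u: u if np.linalg.norm(u) <= 1 else u / np.linalg.norm(u)

u, lam = np.zeros(5), np.zeros(3)
beta = 1.0
gamma = 0.9 * beta / np.linalg.norm(A, 2) ** 2  # kept constant; the paper adapts beta

for k in range(2000):
    # u-update: the minimizer over C of the quadratic above is a projection.
    u = P_C(u - gamma * A.T @ (lam + (A @ u - b) / beta))
    # Multiplier update with sigma_{k+1} = 2 * beta_k.
    lam = lam + (A @ u - b) / (2 * beta)

print("||Au - b|| =", np.linalg.norm(A @ u - b))
\end{verbatim}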
%\noindent
%{\bf Connection to ALBUM3 in \cite{bolte2018nonconvex}}\\
Very recently, \cite{bolte2018nonconvex} proposed a framework for solving the problem \eqref{prob:01} with $C=\RR^d$. In particular, a special case of this framework, ALBUM3 (Proximal Linearized Alternating Minimization), is closely related to our method; however, their conditions are checkable only when $L$ is linear. Moreover, the update of $\beta_{k}$ in \cite{bolte2018nonconvex} depends on the smallest eigenvalue of $L^*L$. For non-linear $L$, the application of their method remains a challenge.
%\noindent{\bf Connection to the deflected subgradient method}\\
The deflected subgradient method investigated in \cite{burachik2010deflected} can be used to solve a special case of the problem \eqref{prob:01} for a compact subset $C$ of $\RR^d$. The basic step of the deflected subgradient method is to solve: given $\beta$ and $v$,
\begin{equation}
u^*\in \arg\min_{u\in C} h(u) + \beta \boldsymbol{\sigma}(Lu-b) - \scal{Kv}{Lu-b},
\end{equation}
where $\boldsymbol{\sigma}$ is a continuous penalty function such as $\|\cdot\|$, and $K$ is a bounded linear operator. In general, there is no closed-form expression for $u^*$ since the subproblem does not split $h$, $L$, and the constraint set $C$ individually. Hence, it is hard to implement the deflected subgradient method. This is also a common drawback of the classic penalty method and its related works \cite{gasimov2002augmented,burachik2010primal}.
\section{Numerical experiments}
%\begin{thebibliography}{}
%
% and use \bibitem to create references. Consult the Instructions
% for authors for reference list style.
\bibliographystyle{abbrv}
\bibliography{references_alp.bib,bang.bib,ctuy16-small-bib.bib,JS_References.bib,lions.bib,references_optimal_sampling,references_yp,tythc16-small-bib,yhli.bib,bibliograpply,bang1.bib}
\end{document}
% end of file template.tex